In [None]:
import torch
import json
import os
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Local checkpoint directory of the Qwen2.5-VL judge model.
model_path = "./qwen2.5-vl-7b"

# Load the judge in bfloat16 on the GPU together with its processor.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, dtype=torch.bfloat16, device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_path)

# Prompt records; the evaluation loop reads 'id' and 'simple_prompt' from each.
with open('simple_prompts.json', mode='r') as prompt_file:
    prompts = json.load(prompt_file)

# One dict per judged (prompt, generator) pair is appended here.
results = list()

def run_judge(image_path, target_prompt):
    """Ask the vision-language judge whether an image matches a text prompt.

    Builds a single-turn chat message holding the image and a YES/NO
    question about ``target_prompt``, runs greedy generation with the
    notebook-level ``model`` and ``processor``, and turns the decoded reply
    into a binary score.

    Args:
        image_path: Path of the image to judge; it is placed in the chat
            message and resolved by ``process_vision_info``.
        target_prompt: Text the image is supposed to depict.

    Returns:
        tuple: ``(score, response)``. ``response`` is the decoded reply.
        ``score`` is 1 when the standalone word "YES" (any letter case)
        ends within the first 15 characters of the reply, otherwise 0.
    """
    import re  # local import so this cell does not depend on an extra top-level import

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": f"""
                You are an objective logic checker. You should check colors, count and spatial positions of objects.
                The requested prompt: "{target_prompt}"
                Does the image perfectly match the text? Answer only "YES" or "NO".
                """},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    generated_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

    # Slice off the prompt tokens so only the newly generated reply is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    # Whole-word match: a plain substring test would also accept "EYES" or
    # "YESTERDAY". The 15-character window of the original check is kept by
    # requiring the match to end inside it.
    yes_match = re.search(r"\byes\b", response, flags=re.IGNORECASE)
    score = 1 if yes_match is not None and yes_match.end() <= 15 else 0
    return score, response

# Image generators under evaluation; each image is looked up at
# output_images/<prompt id>_<generator>.png.
models = ["sd_baseline", "sd_smart", "flux"]

for prompt_entry in prompts:
    pid = prompt_entry['id']
    target = prompt_entry['simple_prompt']

    print(f"\n\n\nJudging {pid}...")

    for model_name in models:
        img_path = f"output_images/{pid}_{model_name}.png"

        if os.path.exists(img_path):
            try:
                score, reason = run_judge(img_path, target)
                print(f"  > {model_name}: {score} {reason}")
                results.append(
                    {"id": pid, "model": model_name, "score": score, "reason": reason}
                )
            except Exception as err:
                # Best effort: report the failure and move on to the next image.
                print(f"  > Error judging {img_path}: {err}")
        else:
            print(f"Skipping missing: {img_path}")

# Persist the raw per-image verdicts.
os.makedirs("output_scores", exist_ok=True)
with open('output_scores/final_results.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

print("\nEvaluation Complete! Results saved.")

# ---------------------------
# Accuracy report
# ---------------------------
print("\n=== MODEL ACCURACY REPORT ===")
model_scores = {}
for model_name in models:
    model_scores[model_name] = {"correct": 0, "total": 0}

# Tally judged images and YES verdicts per generator.
for record in results:
    tally = model_scores[record["model"]]
    tally["total"] += 1
    tally["correct"] += record["score"]

for model_name, tally in model_scores.items():
    total = tally["total"]
    correct = tally["correct"]
    if total > 0:
        acc = correct / total * 100
    else:
        # No judged images for this generator: report 0 rather than divide by zero.
        acc = 0
    print(f"{model_name}: {acc:.2f}% accuracy ({correct}/{total})")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 5/5 [00:07<00:00,  1.45s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.





Judging Attribute_0...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_1...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_2...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_3...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_4...
  > sd_baseline: 0 NO
  > sd_smart: 1 YES
  > flux: 1 YES



Judging Attribute_5...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_6...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_7...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_8...
  > sd_baseline: 1 YES
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_9...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 0 NO



Judging Attribute_10...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 1 YES



Judging Attribute_11...
  > sd_baseline: 0 NO
  > sd_smart: 0 NO
  > flux: 0 NO




In [None]:
import torch
import json
import os
import math
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load Model
model_path = "./qwen2.5-vl-7b"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_path)

# --- Token IDs for Soft VQA ---
tokenizer = processor.tokenizer


def _resolve_token_id(tok, candidates):
    """Return the vocabulary id of the first candidate token that exists.

    ``convert_tokens_to_ids`` reports an unknown token either as ``None`` or
    as the tokenizer's unk id, depending on the tokenizer, so both are
    treated as "not found".

    Args:
        tok: Tokenizer exposing ``convert_tokens_to_ids`` and ``unk_token_id``.
        candidates: Token strings to try, in order of preference.

    Raises:
        ValueError: if none of the candidates is a real vocabulary token.
    """
    for token in candidates:
        token_id = tok.convert_tokens_to_ids(token)
        if token_id is not None and token_id != tok.unk_token_id:
            return token_id
    # Fail here rather than let every later judgement error out on a bad index.
    raise ValueError(f"None of {list(candidates)} is a single token in the tokenizer vocabulary")


# The prompt asks for upper-case answers; the capitalised spelling is only a
# fallback for tokenizers that lack the upper-case token.
YES_TOKEN_ID = _resolve_token_id(tokenizer, ("YES", "Yes"))
NO_TOKEN_ID = _resolve_token_id(tokenizer, ("NO", "No"))

print(f"Soft VQA Logic Initialized: YES_ID={YES_TOKEN_ID}, NO_ID={NO_TOKEN_ID}")
# ---------------------------------------

with open('simple_prompts.json', 'r') as f:
    prompts = json.load(f)

results = []

def run_judge(image_path, target_prompt):
    """Judge one image against a prompt, returning a hard and a soft score.

    Builds a single-turn chat message holding the image and a YES/NO
    question about ``target_prompt``, runs greedy generation with the
    notebook-level ``model`` and ``processor``, and derives two scores: one
    from the decoded text and one from the first generation step's scores
    for the YES and NO tokens.

    Args:
        image_path: Path of the image to judge; it is placed in the chat
            message and resolved by ``process_vision_info``.
        target_prompt: Text the image is supposed to depict.

    Returns:
        tuple: ``(hard_score, soft_score, response)``.
            hard_score is 1 when the standalone word "YES" (any letter
            case) ends within the first 15 characters of the reply, else 0.
            soft_score is the two-way softmax weight of ``YES_TOKEN_ID``
            against ``NO_TOKEN_ID`` at the first generated position.
            response is the decoded reply.

    Raises:
        ValueError: if the YES/NO scores cannot be compared (their
            difference is NaN, e.g. both are ``-inf``).
    """
    import re  # local import so this cell does not depend on an extra top-level import

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": f"""
                You are an objective logic checker. You should check colors, count and spatial positions of objects.
                The requested prompt: "{target_prompt}"
                Does the image perfectly match the text? Answer only "YES" or "NO".
                """},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            return_dict_in_generate=True,  # needed to get a structured output
            output_scores=True             # needed to get per-step scores
        )

    # 1. Hard score from the decoded text.
    # Slice off the prompt tokens so only the newly generated reply is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, outputs.sequences)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    # Whole-word match: a plain substring test would also accept "EYES" or
    # "YESTERDAY". The 15-character window of the original check is kept by
    # requiring the match to end inside it.
    yes_match = re.search(r"\byes\b", response, flags=re.IGNORECASE)
    hard_score = 1 if yes_match is not None and yes_match.end() <= 15 else 0

    # 2. Soft score from the first generation step (index 0), batch item 0.
    # NOTE(review): `scores` are documented as the *processed* prediction
    # scores; if the checkpoint's generation config enables logits processors
    # (e.g. a repetition penalty) they are already applied here. Consider
    # output_logits=True if raw logits are wanted - TODO confirm.
    first_token_logits = outputs.scores[0][0]
    yes_logit = first_token_logits[YES_TOKEN_ID].item()
    no_logit = first_token_logits[NO_TOKEN_ID].item()

    # Two-way softmax e^yes / (e^yes + e^no) equals sigmoid(yes - no).
    # Only exp() of a non-positive number is ever taken, so this neither
    # overflows nor divides by zero when both exponentials underflow.
    logit_gap = yes_logit - no_logit
    if math.isnan(logit_gap):
        raise ValueError(
            f"Cannot compare YES/NO logits: yes={yes_logit}, no={no_logit}"
        )
    if logit_gap >= 0:
        soft_score = 1.0 / (1.0 + math.exp(-logit_gap))
    else:
        exp_gap = math.exp(logit_gap)
        soft_score = exp_gap / (1.0 + exp_gap)

    return hard_score, soft_score, response

# Image generators under evaluation; each image is looked up at
# output_images/<prompt id>_<generator>.png.
models = ["sd_baseline", "sd_smart", "flux", "flux_smart"]

for prompt_entry in prompts:
    pid = prompt_entry['id']
    target = prompt_entry['simple_prompt']

    print(f"\nJudging {pid}...")

    for model_name in models:
        img_path = f"output_images/{pid}_{model_name}.png"

        if os.path.exists(img_path):
            try:
                hard_score, soft_score, reason = run_judge(img_path, target)
                print(f"  > {model_name}: Hard={hard_score} | Soft={soft_score:.4f}")
                results.append({
                    "id": pid,
                    "model": model_name,
                    "score": hard_score,       # binary 0/1
                    "soft_score": soft_score,  # continuous 0.0-1.0
                    "reason": reason,
                })
            except Exception as err:
                # Best effort: report the failure and move on to the next image.
                print(f"  > Error judging {img_path}: {err}")
        else:
            print(f"Skipping missing: {img_path}")

# Persist the raw per-image verdicts.
os.makedirs("output_scores", exist_ok=True)
with open('output_scores/final_results_soft.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

print("\nEvaluation Complete! Results saved.")

# ---------------------------
# Hard / soft performance report
# ---------------------------
print("\n=== MODEL PERFORMANCE REPORT ===")
metrics = {}
for model_name in models:
    metrics[model_name] = {"hard_correct": 0, "soft_total": 0.0, "count": 0}

# Accumulate per-generator counts, YES verdicts and soft scores.
for record in results:
    tally = metrics[record["model"]]
    tally["count"] += 1
    tally["hard_correct"] += record["score"]
    tally["soft_total"] += record["soft_score"]

print(f"{'Model':<15} | {'Hard Acc':<10} | {'Soft Score (Confidence)':<25}")
print("-" * 55)

for model_name in models:
    tally = metrics[model_name]
    count = tally["count"]
    if count <= 0:
        # Nothing was judged for this generator.
        print(f"{model_name:<15} | N/A        | N/A")
        continue
    hard_acc = (tally["hard_correct"] / count) * 100
    soft_acc = (tally["soft_total"] / count) * 100
    print(f"{model_name:<15} | {hard_acc:.2f}%     | {soft_acc:.2f}%")

# sd_baseline     | 23.33%     | 24.62%
# sd_smart        | 42.00%     | 38.58%
# flux            | 72.67%     | 69.70%
# flux_smart      | 73.33%     | 70.96%

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 5/5 [00:07<00:00,  1.45s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Soft VQA Logic Initialized: YES_ID=14004, NO_ID=8996

Judging Attribute_0...
  > sd_baseline: Hard=0 | Soft=0.0014
  > sd_smart: Hard=0 | Soft=0.0009
  > flux: Hard=1 | Soft=0.9947
  > flux_smart: Hard=1 | Soft=0.9828

Judging Attribute_1...
  > sd_baseline: Hard=0 | Soft=0.0172
  > sd_smart: Hard=0 | Soft=0.0014
  > flux: Hard=1 | Soft=0.9915
  > flux_smart: Hard=1 | Soft=0.9783

Judging Attribute_2...
  > sd_baseline: Hard=0 | Soft=0.0013
  > sd_smart: Hard=0 | Soft=0.0007
  > flux: Hard=1 | Soft=0.9828
  > flux_smart: Hard=0 | Soft=0.4408

Judging Attribute_3...
  > sd_baseline: Hard=0 | Soft=0.0006
  > sd_smart: Hard=0 | Soft=0.0059
  > flux: Hard=1 | Soft=0.9905
  > flux_smart: Hard=1 | Soft=0.6713

Judging Attribute_4...
  > sd_baseline: Hard=0 | Soft=0.0846
  > sd_smart: Hard=1 | Soft=0.8246
  > flux: Hard=1 | Soft=0.9893
  > flux_smart: Hard=1 | Soft=0.9864

Judging Attribute_5...
  > sd_baseline: Hard=0 | Soft=0.0307
  > sd_smart: Hard=0 | Soft=0.0033
  > flux: Hard=1 | Soft=0

In [None]:
import torch
import json
import os
import math
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# ---------------------------
# 1. SETUP MODEL
# ---------------------------
model_path = "./qwen2.5-vl-7b"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    dtype=torch.bfloat16,
    device_map="cuda"
)
processor = AutoProcessor.from_pretrained(model_path)

# Soft VQA Token IDs
tokenizer = processor.tokenizer


def _resolve_token_id(tok, candidates):
    """Return the vocabulary id of the first candidate token that exists.

    ``convert_tokens_to_ids`` reports an unknown token either as ``None`` or
    as the tokenizer's unk id, depending on the tokenizer, so both are
    treated as "not found".

    Args:
        tok: Tokenizer exposing ``convert_tokens_to_ids`` and ``unk_token_id``.
        candidates: Token strings to try, in order of preference.

    Raises:
        ValueError: if none of the candidates is a real vocabulary token.
    """
    for token in candidates:
        token_id = tok.convert_tokens_to_ids(token)
        if token_id is not None and token_id != tok.unk_token_id:
            return token_id
    # Fail here rather than let every later judgement error out on a bad index.
    raise ValueError(f"None of {list(candidates)} is a single token in the tokenizer vocabulary")


# The prompt asks for upper-case answers; the capitalised spelling is only a
# fallback for tokenizers that lack the upper-case token.
YES_TOKEN_ID = _resolve_token_id(tokenizer, ("YES", "Yes"))
NO_TOKEN_ID = _resolve_token_id(tokenizer, ("NO", "No"))
print(f"Soft VQA Logic Initialized: YES_ID={YES_TOKEN_ID}, NO_ID={NO_TOKEN_ID}")

# ---------------------------
# 2. LOAD PROMPTS & MAP CATEGORIES
# ---------------------------
with open('simple_prompts.json', 'r') as f:
    prompts = json.load(f)

# Map each prompt id to its category so results can be grouped later,
# e.g. {'Attribute_0': 'Attribute', 'Spatial_0': 'Spatial'}.
# Prompts without a 'category' key fall under 'Unknown'.
id_to_category = {item['id']: item.get('category', 'Unknown') for item in prompts}
all_categories = sorted(set(id_to_category.values()))

results = []

# ---------------------------
# 3. JUDGE FUNCTION
# ---------------------------
def run_judge(image_path, target_prompt):
    """Judge one image against a prompt, returning a hard and a soft score.

    Builds a single-turn chat message holding the image and a YES/NO
    question about ``target_prompt``, runs greedy generation with the
    notebook-level ``model`` and ``processor``, and derives two scores: one
    from the decoded text and one from the first generation step's scores
    for the YES and NO tokens.

    Args:
        image_path: Path of the image to judge; it is placed in the chat
            message and resolved by ``process_vision_info``.
        target_prompt: Text the image is supposed to depict.

    Returns:
        tuple: ``(hard_score, soft_score, response)``.
            hard_score is 1 when the standalone word "YES" (any letter
            case) ends within the first 15 characters of the reply, else 0.
            soft_score is the two-way softmax weight of ``YES_TOKEN_ID``
            against ``NO_TOKEN_ID`` at the first generated position.
            response is the decoded reply.

    Raises:
        ValueError: if the YES/NO scores cannot be compared (their
            difference is NaN, e.g. both are ``-inf``).
    """
    import re  # local import so this cell does not depend on an extra top-level import

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": f"""
                You are an objective logic checker. You should check colors, count and spatial positions of objects.
                The requested prompt: "{target_prompt}"
                Does the image perfectly match the text? Answer only "YES" or "NO".
                """},
            ],
        }
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            return_dict_in_generate=True,
            output_scores=True
        )

    # Hard score from the decoded text.
    # Slice off the prompt tokens so only the newly generated reply is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, outputs.sequences)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    # Whole-word match: a plain substring test would also accept "EYES" or
    # "YESTERDAY". The 15-character window of the original check is kept by
    # requiring the match to end inside it.
    yes_match = re.search(r"\byes\b", response, flags=re.IGNORECASE)
    hard_score = 1 if yes_match is not None and yes_match.end() <= 15 else 0

    # Soft score from the first generation step (index 0), batch item 0.
    # NOTE(review): `scores` are documented as the *processed* prediction
    # scores; if the checkpoint's generation config enables logits processors
    # (e.g. a repetition penalty) they are already applied here. Consider
    # output_logits=True if raw logits are wanted - TODO confirm.
    first_token_logits = outputs.scores[0][0]
    yes_logit = first_token_logits[YES_TOKEN_ID].item()
    no_logit = first_token_logits[NO_TOKEN_ID].item()

    # Two-way softmax e^yes / (e^yes + e^no) equals sigmoid(yes - no).
    # Only exp() of a non-positive number is ever taken, so this neither
    # overflows nor divides by zero when both exponentials underflow.
    logit_gap = yes_logit - no_logit
    if math.isnan(logit_gap):
        raise ValueError(
            f"Cannot compare YES/NO logits: yes={yes_logit}, no={no_logit}"
        )
    if logit_gap >= 0:
        soft_score = 1.0 / (1.0 + math.exp(-logit_gap))
    else:
        exp_gap = math.exp(logit_gap)
        soft_score = exp_gap / (1.0 + exp_gap)

    return hard_score, soft_score, response

# ---------------------------
# 4. EVALUATION LOOP
# ---------------------------
# Image generators under evaluation; each image is looked up at
# output_images/<prompt id>_<generator>.png.
models = ["sd_baseline", "sd_smart", "flux", "flux_smart"]

for prompt_entry in prompts:
    pid = prompt_entry['id']
    target = prompt_entry['simple_prompt']

    print(f"\nJudging {pid}...")

    for model_name in models:
        img_path = f"output_images/{pid}_{model_name}.png"

        if not os.path.exists(img_path):
            # flux_smart images may live in their own folder; try it before
            # giving up on this (prompt, generator) pair.
            alt_path = f"output_images_flux_smart/{pid}_flux_smart.png"
            if model_name != "flux_smart" or not os.path.exists(alt_path):
                print(f"Skipping missing: {img_path}")
                continue
            img_path = alt_path

        try:
            hard_score, soft_score, reason = run_judge(img_path, target)
            print(f"  > {model_name}: Hard={hard_score} | Soft={soft_score:.4f}")
            results.append({
                "id": pid,
                "model": model_name,
                "category": id_to_category.get(pid, "Unknown"),  # kept for per-category grouping
                "score": hard_score,
                "soft_score": soft_score,
                "reason": reason,
            })
        except Exception as err:
            # Best effort: report the failure and move on to the next image.
            print(f"  > Error judging {img_path}: {err}")

# Save raw results
os.makedirs("output_scores", exist_ok=True)
with open('output_scores/final_results_soft.json', 'w') as out_file:
    json.dump(results, out_file, indent=2)

print("\nEvaluation Complete! Results saved.")

# ---------------------------
# 5. DETAILED REPORTING
# ---------------------------
print("\n" + "=" * 60)
print("             DETAILED MODEL PERFORMANCE REPORT")
print("=" * 60)

# metrics[category][model] = {'correct': int, 'soft_sum': float, 'total': int},
# with an extra "OVERALL" pseudo-category spanning every result.
metrics = {
    category: {
        model_name: {'correct': 0, 'soft_sum': 0.0, 'total': 0}
        for model_name in models
    }
    for category in all_categories + ["OVERALL"]
}

for record in results:
    model_name = record['model']
    category = record['category']
    s_hard = record['score']
    s_soft = record['soft_score']

    # Each result feeds its own category table (when one exists) and then
    # the OVERALL table, in that order.
    tables = [metrics[category]] if category in metrics else []
    tables.append(metrics["OVERALL"])
    for table in tables:
        cell = table[model_name]
        cell['correct'] += s_hard
        cell['soft_sum'] += s_soft
        cell['total'] += 1

# Print Tables
def print_table(category_name, data_dict, model_names=None):
    """Print a per-model accuracy table for one category.

    Args:
        category_name: Label shown (upper-cased) in the table heading.
        data_dict: Mapping of model name to a stats dict with the keys
            'correct', 'soft_sum' and 'total'. Models missing from the
            mapping, or whose 'total' is 0, are printed as N/A.
        model_names: Optional iterable of model names giving the rows and
            their order. When omitted, the notebook-level ``models`` list is
            used, which is the original behaviour.
    """
    if model_names is None:
        # Backward-compatible default: fall back to the notebook-level list.
        model_names = models

    print(f"\n>>> Category: {category_name.upper()}")
    print(f"{'Model':<15} | {'Hard Accuracy (Count)':<25} | {'Soft Score':<15}")
    print("-" * 60)

    for m in model_names:
        stats = data_dict.get(m, {'total': 0})
        total = stats['total']
        if total == 0:
            # Nothing judged for this model in this category.
            print(f"{m:<15} | {'N/A':<25} | {'N/A':<15}")
            continue

        correct = stats['correct']
        soft_sum = stats['soft_sum']

        hard_acc = (correct / total) * 100
        soft_acc = (soft_sum / total) * 100

        # Format: "72.33% (217/300)"
        acc_str = f"{hard_acc:.2f}% ({correct}/{total})"
        print(f"{m:<15} | {acc_str:<25} | {soft_acc:.2f}%")

# Overall table first, then one table per real category.
print_table("OVERALL", metrics["OVERALL"])

for category in (name for name in all_categories if name != "OVERALL"):
    print_table(category, metrics[category])

print("\n" + "=" * 60)


# ============================================================
#              DETAILED MODEL PERFORMANCE REPORT
# ============================================================

# >>> Category: OVERALL
# Model           | Hard Accuracy (Count)     | Soft Score     
# ------------------------------------------------------------
# sd_baseline     | 23.33% (70/300)           | 24.62%
# sd_smart        | 42.00% (126/300)          | 38.58%
# flux            | 72.67% (218/300)          | 69.70%
# flux_smart      | 73.33% (220/300)          | 70.96%

# >>> Category: ATTRIBUTE
# Model           | Hard Accuracy (Count)     | Soft Score     
# ------------------------------------------------------------
# sd_baseline     | 12.00% (12/100)           | 13.96%
# sd_smart        | 27.00% (27/100)           | 25.70%
# flux            | 92.00% (92/100)           | 89.81%
# flux_smart      | 73.00% (73/100)           | 71.87%

# >>> Category: NUMERACY
# Model           | Hard Accuracy (Count)     | Soft Score     
# ------------------------------------------------------------
# sd_baseline     | 36.00% (36/100)           | 37.52%
# sd_smart        | 62.00% (62/100)           | 56.13%
# flux            | 79.00% (79/100)           | 72.27%
# flux_smart      | 73.00% (73/100)           | 70.69%

# >>> Category: SPATIAL
# Model           | Hard Accuracy (Count)     | Soft Score     
# ------------------------------------------------------------
# sd_baseline     | 22.00% (22/100)           | 22.39%
# sd_smart        | 37.00% (37/100)           | 33.92%
# flux            | 47.00% (47/100)           | 47.03%
# flux_smart      | 74.00% (74/100)           | 70.33%

# ============================================================



  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 5/5 [00:07<00:00,  1.44s/it]
The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Soft VQA Logic Initialized: YES_ID=14004, NO_ID=8996

Judging Attribute_0...
  > sd_baseline: Hard=0 | Soft=0.0014
  > sd_smart: Hard=0 | Soft=0.0009
  > flux: Hard=1 | Soft=0.9947
  > flux_smart: Hard=1 | Soft=0.9828

Judging Attribute_1...
  > sd_baseline: Hard=0 | Soft=0.0172
  > sd_smart: Hard=0 | Soft=0.0014
  > flux: Hard=1 | Soft=0.9915
  > flux_smart: Hard=1 | Soft=0.9783

Judging Attribute_2...
  > sd_baseline: Hard=0 | Soft=0.0013
  > sd_smart: Hard=0 | Soft=0.0007
  > flux: Hard=1 | Soft=0.9828
  > flux_smart: Hard=0 | Soft=0.4408

Judging Attribute_3...
  > sd_baseline: Hard=0 | Soft=0.0006
  > sd_smart: Hard=0 | Soft=0.0059
  > flux: Hard=1 | Soft=0.9905
  > flux_smart: Hard=1 | Soft=0.6713

Judging Attribute_4...
  > sd_baseline: Hard=0 | Soft=0.0846
  > sd_smart: Hard=1 | Soft=0.8246
  > flux: Hard=1 | Soft=0.9893
  > flux_smart: Hard=1 | Soft=0.9864

Judging Attribute_5...
  > sd_baseline: Hard=0 | Soft=0.0307
  > sd_smart: Hard=0 | Soft=0.0033
  > flux: Hard=1 | Soft=0

In [None]:
# Instruction text asking an LLM to rewrite a user's prompt for Stable
# Diffusion 1.5, favouring structural accuracy (counts, colours, layout)
# over aesthetics.
# NOTE(review): `prompt` is not referenced by any other cell visible here -
# presumably it was used to produce the "*_smart" prompts; confirm where it
# is consumed.
prompt = """
You are an expert prompt engineer for Stable Diffusion 1.5.
The model struggles with complex sentences and attribute bleeding.
Rewrite the user's prompt to maximize structural accuracy, NOT aesthetics.
Rules:
1. Use short, fragmented sentences separated by periods (e.g., 'A cat. A dog. Side by side.').
2. For specific numbers, ALWAYS add 'isolated on white background' and remove scene descriptions.
3. For colors, mention the object and color together in a distinct sentence (e.g., 'A red car.').
4. Avoid flowery adjectives (e.g., 'vibrant', 'plush', 'sun-drenched').
Return ONLY the final rewritten prompt.
"""
