In [1]:
import base64
import mimetypes
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

In [40]:
# Load the table of samples to evaluate. Later cells read its "name" column
# (image file name under the dataset directory) and "prompt" column (caption).
df_sample = pd.read_csv('gemma_detailed.csv')

In [3]:
def encode_image(image_path):
    """Read the file at `image_path` and return its bytes as a base64 string.

    The file is opened in binary mode; the base64 output is decoded as UTF-8
    so the result is a plain `str` that can be embedded in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [19]:
# Structured-output schema passed as `response_format` in get_scores, so the
# API reply is parsed into this model. Documented with comments rather than a
# class docstring so the generated schema is left exactly as it was.
class EvaluationOutput(BaseModel):
    # Image/caption match score; the system prompt asks for a float between 0 and 1.
    text_image_score: float

In [35]:
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# System prompt sent with every request in get_scores. It defines the evaluation
# checklist and asks for a single float score between 0 and 1.
# NOTE(review): the prompt asks for a bare "SCORE" text reply while get_scores
# requests structured output (EvaluationOutput) — confirm both are intended.
system_message = """
You are a multimodal evaluator. Given an image and a caption, your task is to assess how well the image matches the caption, considering both image quality and semantic alignment.

Follow this checklist strictly:

1. Check for visual artifacts or implausibility in the image:
   - Are human or animal bodies or faces distorted, merged, or missing extra parts?
   - Are objects malformed (e.g., strange furniture or cars)?
   - Is there any distorted, nonsensical, or unreadable text?
   - Is the image nonsensical or unrealistic?
   - Is there excessive blurriness or a lack of sharpness?
   - Any other visual problems not described in the caption?

2. Check if the image content semantically aligns with the caption:
   - Are all people, animals, or objects mentioned in the caption present?
   - Do they have the correct attributes (color, size, shape)?
   - Are the described actions depicted accurately?
   - Are the quantities (e.g., two dogs) correct?
   - Are spatial relations correct (e.g., “on top of the car”)?
   - Any other mismatch between image and text?

Scoring Rule: the float number between 0 and 1, where 0 means the image is completely unrelated to the caption and 1 means the image perfectly matches the caption.
Output only the SCORE using the following format without any explanations:
SCORE
"""

In [36]:
# A standalone number in [0, 1]. The look-arounds stop the pattern from biting a
# fragment out of a larger number: "10" must not yield "1" or "0", "2.0" must
# not yield "0", and "1.5" must not yield "1".
_UNIT_SCORE = r"(?<!\d)(?<!\d\.)(0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)"

# Tried in order; the first pattern that matches wins.
_SCORE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # 1. Explicit <score>...</score> tag.
        r"<score>\s*" + _UNIT_SCORE + r"\s*</score>",
        # 2. The word "score" followed within a few non-digit characters by the number.
        r"score[^\d]{0,5}" + _UNIT_SCORE,
        # 3. A number that ends the text or precedes "is the final score" / "/10".
        # NOTE(review): a value written as "x/10" is returned as x, not x/10.
        _UNIT_SCORE + r"(?=\s*(?:is the final score|/10|$))",
    )
)


def extract_score(text):
    """Extract a score in the range [0, 1] from free-form model output.

    Only complete numbers between 0 and 1 are accepted; digits that are part
    of a larger or out-of-range number (e.g. "10", "1.5") are ignored rather
    than truncated into a bogus score.

    Args:
        text: Raw text returned by the model.

    Returns:
        The first score found, as a float.

    Raises:
        ValueError: If no pattern yields a valid score.
    """
    for pattern in _SCORE_PATTERNS:
        match = pattern.search(text)
        if match:
            return float(match.group(1))
    raise ValueError(f"No valid score found in text: {text}")

In [37]:
def get_scores(image_path, prompt, n_t=5):
    """Ask the OpenAI model how well an image matches its text prompt.

    Args:
        image_path: Path of the image file to evaluate.
        prompt: Caption / generation prompt the image is compared against.
        n_t: Unused; kept so existing calls that pass it keep working.

    Returns:
        The `text_image_score` field of the parsed `EvaluationOutput`.

    Raises:
        RuntimeError: If the response contains no parsed output, for example
            because the model refused to answer.
    """
    base64_image = encode_image(image_path)

    # Label the data URL with the file's real image type; fall back to JPEG
    # (the previous hard-coded value) when the extension is unknown.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # User turn carrying both the caption and the image.
    user_message = [
        {"type": "text", "text": f"Text prompt: {prompt}"},
        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
    ]

    # Structured-output call: the reply is parsed into EvaluationOutput.
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        response_format=EvaluationOutput,
        max_tokens=30
    )

    message = completion.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # Without this check a refusal surfaces as an opaque AttributeError
        # on `None.text_image_score`.
        reason = getattr(message, "refusal", None) or "no parsed output returned"
        raise RuntimeError(f"No score for {image_path!r}: {reason}")
    return parsed.text_image_score

In [38]:
# Score the first `n` samples with the LLM, one API call per image.
res_arr = []
n = 500
root_dir='./dataset'
# Slice once so the progress-bar total reflects the real number of rows,
# which is smaller than `n` when the frame has fewer than `n` rows.
rows_to_score = df_sample.iloc[:n]
for _, sample in tqdm(rows_to_score.iterrows(), total=len(rows_to_score)):
    image_path = os.path.join(root_dir, sample['name'])
    res = get_scores(image_path=image_path, prompt=sample['prompt'])
    res_arr.append(res)

  0%|          | 0/500 [00:00<?, ?it/s]

In [41]:
# Attach the scores positionally to the first len(res_arr) rows. Rows that were
# not scored (the frame may be longer than the evaluated slice) get NaN instead
# of the whole assignment failing with a length-mismatch error.
if len(res_arr) > len(df_sample):
    raise ValueError(
        f"Got {len(res_arr)} scores for only {len(df_sample)} rows in df_sample"
    )
df_sample["gpt_al"] = (
    pd.Series(res_arr, dtype="float64").reindex(range(len(df_sample))).to_numpy()
)

In [42]:
# Persist the scored table. The index is not written: saving it adds another
# "Unnamed: 0"-style column every time the file is read back and saved again.
df_sample.to_csv('gpt_detailed_new.csv', index=False)

In [32]:
# Display the frame for a quick visual check of the score columns.
df_sample

Unnamed: 0.1,Unnamed: 0,name,prompt,adj1,adj2,style,mos_quality,std_quality,mos_align,std_align,gemma_al,gpt_al
0,2409,sd1.5_normal_027.jpg,the large cresting waves,,,,1.678770,0.674855,2.379343,0.963562,0.95,1.0
1,1547,sd1.5_highcorr_065.jpg,"an environmental concept art, anime style",,,anime style,2.994112,0.822402,3.158021,1.092774,0.95,1.0
2,881,glide_normal_291.jpg,oil painting portrait of anthropomorphic femal...,long-shot,hyper detail,sci-fi style,1.816769,0.617245,2.166015,0.787201,0.65,0.5
3,331,DALLE2_normal_032.jpg,an emo portrait painting,,,,2.478791,1.026343,2.777653,0.550425,0.75,0.8
4,2225,sd1.5_lowstep_143.jpg,"portrait of squids swarming the ocean, elevati...",elevation view,,,2.851759,0.894127,2.166015,0.550344,0.75,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...
495,2576,sd1.5_normal_194.jpg,"a cyborg jellyfish swimming in a vast ocean, c...",close-up,,baroque style,3.063451,0.550197,3.285945,0.479664,0.45,0.8
496,254,AttnGAN_normal_254.jpg,portrait of beautiful woman with magical nebul...,long-shot,soft lighting,,1.179543,0.346499,0.369090,0.685083,0.40,0.0
497,1902,sd1.5_lowcorr_120.jpg,"die cut sticker of a pair of lips , warm color",warm color,,,2.659179,0.846417,2.759039,0.770235,0.75,0.4
498,1073,midjourney_lowstep_185.jpg,"die cut sticker of chibi cute golf player , lo...",long-shot,,anime style,2.560586,0.642368,2.999705,0.863080,0.95,0.7


In [None]:


# Compare the LLM alignment score with the human alignment rating.
# Reads two columns of df_sample: "mos_align" (human rating) and "gpt_al"
# (LLM rating; the system prompt asks for a float between 0 and 1).
# Rows missing either value are dropped so the statistics never receive NaN.
df = df_sample.dropna(subset=["mos_align", "gpt_al"]).copy()

# Determine the minimum and maximum values for the human ratings
human_min = df["mos_align"].min()
human_max = df["mos_align"].max()
if not human_max > human_min:
    # Also true for an empty frame (NaN comparison); avoids dividing by zero.
    raise ValueError("mos_align has no spread; cannot min-max normalize it")

# Normalize human ratings to the 0–1 range, the same scale as the LLM score:
# normalized_rating = (rating - min) / (max - min)
df["human_norm"] = (df["mos_align"] - human_min) / (human_max - human_min)

# Visualization: Scatter plot
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df["human_norm"], df["gpt_al"], alpha=0.7, edgecolors='k')
ax.set_xlabel("Human Rating (normalized, 0-1)")
ax.set_ylabel("LLM Rating (0-1)")
ax.set_title("Comparison of LLM Ratings and Normalized Human Ratings (Alignment)")
ax.grid(True)
plt.show()

# Calculate the Pearson correlation coefficient
# (p-values use .3g so very small values are not printed as 0.000)
pearson_corr, pearson_p = pearsonr(df["human_norm"], df["gpt_al"])
print(f"Pearson correlation coefficient: {pearson_corr:.3f} (p-value: {pearson_p:.3g})")

# Calculate the Spearman correlation coefficient
spearman_corr, spearman_p = spearmanr(df["human_norm"], df["gpt_al"])
print(f"Spearman correlation coefficient: {spearman_corr:.3f} (p-value: {spearman_p:.3g})")

# Build a linear regression model (LLM rating as a function of human rating)
X = df["human_norm"].values.reshape(-1, 1)
y = df["gpt_al"].values
model = LinearRegression()
model.fit(X, y)
r2 = model.score(X, y)
print(f"Coefficient of determination (R²): {r2:.3f}")

# Visualization of the regression model
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df["human_norm"], df["gpt_al"], alpha=0.7, edgecolors='k', label="Data")
x_line = np.linspace(df["human_norm"].min(), df["human_norm"].max(), 100)
y_line = model.predict(x_line.reshape(-1, 1))
ax.plot(x_line, y_line, color='red', linewidth=2, label="Linear Regression")
ax.set_xlabel("Human Rating (normalized, 0-1)")
ax.set_ylabel("LLM Rating (0-1)")
ax.set_title("Linear Regression: LLM Rating vs. Normalized Human Rating (Alignment)")
ax.legend()
ax.grid(True)
plt.show()
