In [1]:
import requests
import json
import random
import base64
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from PIL import Image
from io import BytesIO

In [2]:
# Load the dataset
dataset = load_dataset("mozilla/flickr30k-transformed-captions-gpt4o", split='test')

# Load a pre-trained model for sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Randomly select 500 samples
random_samples = random.sample(range(len(dataset)), 1000)



In [3]:
# Function to encode image data to base64
def encode_image(image):
    buffered = BytesIO()
    image.save(buffered, format="JPEG")  # Save the image to a buffer
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def query_model(instruction, image):
    system_prompt = (
        "Generate a detailed description of the image in approximately 40 words. "
        "Include key elements and vivid details, and ensure it accurately reflects the content of the image."
    )
    
    full_instruction = f"{system_prompt}"
    
    # Encode the image data to base64
    base64_image = encode_image(image)
    
    url = "https://proxy.tune.app/chat/completions"
    headers = {
        "Authorization": "YOUR_TUNE_API_KEY",
        "Content-Type": "application/json"
    }
    payload = {
        "temperature": 0.4,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": full_instruction},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ],
        "model": "mistral/pixtral-12B-2409",
        "stream": False,
        "frequency_penalty": 0.2,
        "max_tokens": 300
    }
    try:
        response = requests.post(url, headers=headers, json=payload)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Error querying model: {e}")
        return {}

In [7]:
# Initialize lists
valid_samples = []
similarity_scores = []
gpt_scores_list = []

# Process dataset and accumulate results
for idx in random_samples:  # Use idx to get each sample
    sample = dataset[idx]  # Access the sample using the index
    image = sample['image']  # Directly use the image object
    filename = sample['filename']  # Ensure this key is correct for filename
    
    # Query the model
    response = query_model("Describe this image", image)
    model_output = response.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
    
    # Extract Real captions
    reference_outputs = sample['original_alt_text']  # Ensure this key is correct for captions
    
    # Compute embeddings for reference outputs and model output
    reference_embeddings = model.encode(reference_outputs, convert_to_tensor=True)
    model_embedding = model.encode(model_output, convert_to_tensor=True)
    
    # Calculate cosine similarity
    similarity_scores = [util.pytorch_cos_sim(model_embedding, ref_emb).item() for ref_emb in reference_embeddings]
    
    # Extract GPT-4 Caption
    gpt_output = sample['alt_text']  # Ensure this key is correct for captions
    
    # Compute embedding for GPT-4 output
    gpt_embedding = model.encode(gpt_output, convert_to_tensor=True)
    
    # Ensure the GPT embedding is a 2D tensor for batch operations
    if gpt_embedding.dim() == 1:
        gpt_embedding = gpt_embedding.unsqueeze(0)  # Add batch dimension if needed
    
    # Calculate cosine similarity for each reference caption
    gpt_scores = [util.pytorch_cos_sim(gpt_embedding, ref_emb.unsqueeze(0)).item() for ref_emb in reference_embeddings]
    
    # Calculate win rates based on a threshold
    model_win_rate = sum(score > 0.8 for score in similarity_scores) / len(similarity_scores) if len(similarity_scores) > 0 else 0
    gpt_win_rate = sum(score > 0.8 for score in gpt_scores) / len(gpt_scores) if len(gpt_scores) > 0 else 0
    
    # Collect the results
    valid_samples.append({
        "filename": filename,
        "model_output": model_output,
        "reference_outputs": reference_outputs,
        "similarity_scores": similarity_scores,
        "GPT similarity score": gpt_scores
    })
    gpt_scores_list.extend(gpt_scores)  # Store GPT-4 scores for final metrics calculations


In [8]:
# Calculate win rate for model
win_rate = sum(score > 0.8 for score in similarity_scores) / len(similarity_scores) if len(similarity_scores) > 0 else 0

# Calculate top-1, top-5, and top-10 average scores for the model
sorted_scores = sorted(similarity_scores, reverse=True)
top_1_score = sorted_scores[0] if len(sorted_scores) >= 1 else 0
top_5_average = sum(sorted_scores[:5]) / 5 if len(sorted_scores) >= 5 else 0
top_10_average = sum(sorted_scores[:10]) / 10 if len(sorted_scores) >= 10 else 0

# Calculate win rate for GPT-4
gpt_win_rate = sum(score > 0.8 for score in gpt_scores_list) / len(gpt_scores_list) if len(gpt_scores_list) > 0 else 0

# Calculate top-1, top-5, and top-10 average scores for GPT-4
gsorted_scores = sorted(gpt_scores_list, reverse=True)
gtop_1_score = gsorted_scores[0] if len(gsorted_scores) >= 1 else 0
gtop_5_average = sum(gsorted_scores[:5]) / 5 if len(gsorted_scores) >= 5 else 0
gtop_10_average = sum(gsorted_scores[:10]) / 10 if len(gsorted_scores) >= 10 else 0

# Save inference results to a file
results_df = pd.DataFrame(valid_samples)
results_df.to_csv('flickr30k_model_results.csv', index=False)

# Save metrics to a file
metrics_df = pd.DataFrame([{
    "win_rate": win_rate,
    "top_1_score": top_1_score,
    "top_5_average_score": top_5_average,
    "top_10_average_score": top_10_average
}])
metrics_df.to_csv('flickr30k_dataset_scores.csv', index=False)

# Save GPT-4 metrics to a file
gpt_metrics_df = pd.DataFrame([{
    "win_rate": gpt_win_rate,
    "top_1_score": gtop_1_score,
    "top_5_average_score": gtop_5_average,
    "top_10_average_score": gtop_10_average
}])
gpt_metrics_df.to_csv('flickr30k_gpt4_scores.csv', index=False)

print(metrics_df)
print(gpt_metrics_df)

   win_rate  top_1_score  top_5_average_score  top_10_average_score
0       0.0     0.336796             0.279737                     0
   win_rate  top_1_score  top_5_average_score  top_10_average_score
0    0.0506     0.987752             0.978322              0.966494
