In [1]:
import json
import os
from collections import defaultdict

import nltk
import pandas as pd
import requests
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util  # For cosine similarity

In [2]:
# VisIT-Bench test split from the Hugging Face Hub (provides the samples to evaluate)
dataset = load_dataset(path="mlfoundations/VisIT-Bench", split="test")

# Sentence-embedding model used later to compare model captions with references
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

Resolving data files:   0%|          | 0/575 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/575 [00:00<?, ?it/s]



In [3]:
def query_model(instruction, image_url, timeout=60):
    """Ask the hosted Pixtral model to describe the image at ``image_url``.

    Args:
        instruction: Accepted for interface compatibility with callers; it is
            NOT sent to the model. The prompt is always the fixed text
            "Describe this image".
        image_url: URL of the image, passed to the model as an ``image_url``
            content part.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response as a dict, or an empty dict when the request
        fails, returns an HTTP error status, or the body is not a JSON object.
    """
    # Fixed captioning prompt; the per-sample instruction is intentionally unused.
    prompt = "Describe this image"

    url = "https://proxy.tune.app/chat/completions"
    headers = {
        # Prefer a key from the environment so the secret never has to be
        # written into the notebook; fall back to the original placeholder.
        "Authorization": os.environ.get("TUNE_API_KEY", "YOUR_TUNE_API_KEY"),
        "Content-Type": "application/json"
    }
    payload = {
        "temperature": 0.9,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}}
                ]
            }
        ],
        "model": "mistral/pixtral-12B-2409",
        "stream": False,
        "frequency_penalty": 0.2,
        "max_tokens": 200
    }
    try:
        # Without a timeout a single stalled connection would block the whole
        # evaluation loop indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network/HTTP failures and undecodable bodies are expected, recoverable
        # conditions: report them and let the caller skip the sample.
        # Anything else (e.g. a programming error) is allowed to propagate.
        print(f"Error querying model: {e}")
        return {}

    # Callers use ``.get`` on the result, so only hand back JSON objects.
    return data if isinstance(data, dict) else {}


In [4]:
# Accumulators are (re)initialised here so re-running this cell starts clean.
valid_samples = []
win_count = 0
total_samples = 0

# A sample counts as a "win" when its cosine similarity exceeds this value.
SIMILARITY_THRESHOLD = 0.8

# Query the model for every usable sample and score its caption against the
# reference caption using sentence-embedding cosine similarity.
for sample in dataset:
    instruction = sample.get('instruction', 'No instruction available')

    # Validate the reference BEFORE calling the remote model: a sample without
    # a reference cannot be scored, so querying for it would waste an API call.
    # No placeholder default is used, otherwise a missing reference would be
    # scored against the placeholder text instead of being skipped.
    reference_output = sample.get('instruction_conditioned_caption')
    if not reference_output:
        continue

    # The metadata may arrive as a JSON string; decode it when necessary.
    public_images_metadata = sample.get('public_images_metadata') or {}
    if isinstance(public_images_metadata, str):
        try:
            public_images_metadata = json.loads(public_images_metadata)
        except json.JSONDecodeError:
            continue  # Skip the sample if metadata is invalid
    if not isinstance(public_images_metadata, dict):
        continue  # Decoded to something without keys (list, number, null, ...)

    image_url = public_images_metadata.get('OriginalURL')
    if not image_url:
        continue  # Skip if no image URL is found

    # Query the model; an empty/invalid response means the sample is skipped.
    response = query_model(instruction, image_url)
    if not response or not response.get("choices"):
        continue

    # Tolerate explicit nulls for "message"/"content" in the response.
    message = response["choices"][0].get("message") or {}
    model_output = (message.get("content") or "").strip()
    if not model_output:
        continue  # Skip if no model output

    # Encode both texts and compare them with cosine similarity.
    reference_embedding = model.encode(reference_output, convert_to_tensor=True)
    model_embedding = model.encode(model_output, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(model_embedding, reference_embedding).item()

    if similarity_score > SIMILARITY_THRESHOLD:
        win_count += 1

    total_samples += 1

    # Keep the per-sample record for the CSV export in the next cell.
    valid_samples.append({
        "instruction": instruction,
        "reference_output": reference_output,
        "model_output": model_output,
        "similarity_score": similarity_score
    })


In [5]:
# Summarise the evaluation and write both the per-sample results and the
# aggregate metrics to CSV files in the working directory.

# Fraction of scored samples whose similarity exceeded the win threshold.
win_rate = win_count / total_samples if total_samples > 0 else 0

similarity_scores = [sample['similarity_score'] for sample in valid_samples]

# Guarded like win_rate: with zero valid samples the unguarded division would
# raise ZeroDivisionError and nothing would be saved.
average_similarity = sum(similarity_scores) / total_samples if total_samples > 0 else 0

# Sort descending so the best scores come first.
sorted_scores = sorted(similarity_scores, reverse=True)

# Top-k averages are reported as 0 when fewer than k samples were scored.
top_1_score = sorted_scores[0] if total_samples >= 1 else 0
top_5_average = sum(sorted_scores[:5]) / 5 if total_samples >= 5 else 0
top_10_average = sum(sorted_scores[:10]) / 10 if total_samples >= 10 else 0

# Save results only for valid samples. Columns are given explicitly so the
# CSV still gets a header row when no sample was valid.
results_df = pd.DataFrame(
    valid_samples,
    columns=["instruction", "reference_output", "model_output", "similarity_score"],
)
results_df.to_csv('visit_caption.csv', index=False)

# Save metrics to a file
metrics_df = pd.DataFrame([{
    "win_rate": win_rate,
    "average_similarity": average_similarity,
    "top_1_score": top_1_score,
    "top_5_average_score": top_5_average,
    "top_10_average_score": top_10_average
}])

metrics_df.to_csv('visit_caption_score.csv', index=False)

print("Win rate, average similarity, and top-k scores saved successfully.")

Win rate, average similarity, and top-k scores saved successfully.
