In [12]:
import requests
import json
import random
import base64
import os
from sentence_transformers import SentenceTransformer, util
from PIL import Image
from io import BytesIO
import pandas as pd

In [13]:
# Paths
IMAGE_DIR = 'data/images'
JSON_FILE = 'data/arxivqa.jsonl'

# Sampling configuration: how many rows to evaluate and the seed that makes
# the selection reproducible across "Restart & Run All".
N_SAMPLES = 1000
SEED = 42

# Load the dataset from the JSON Lines file (one JSON object per line).
# Blank lines (e.g. a trailing newline at end of file) are skipped because
# json.loads('') raises; the encoding is pinned so the result does not depend
# on the platform default.
with open(JSON_FILE, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Convert dataset to DataFrame for easier handling
df = pd.DataFrame(dataset)

# Load a pre-trained model for sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Randomly select up to N_SAMPLES row positions. random.sample raises
# ValueError when asked for more items than the population holds, so the
# request is capped at the dataset size. A dedicated Random instance keeps the
# draw deterministic without reseeding the global generator.
random_samples = random.Random(SEED).sample(range(len(df)), min(N_SAMPLES, len(df)))

In [14]:
# Function to encode image data to base64
def encode_image(image_path):
    """Re-encode the image at ``image_path`` as JPEG and return it as base64 text.

    JPEG cannot store an alpha channel or a palette, so images in those modes
    are converted first: transparent images are flattened onto a white
    background (so transparent regions do not turn black), and any other mode
    besides RGB / grayscale is converted to RGB.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        OSError: If the file cannot be opened or is not a readable image.
    """
    with open(image_path, 'rb') as f:
        image = Image.open(f)

        has_alpha = image.mode in ("RGBA", "LA") or (
            image.mode == "P" and "transparency" in image.info
        )
        if has_alpha:
            # Composite over white using the alpha band as the paste mask.
            rgba = image.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.split()[3])
            image = flattened
        elif image.mode not in ("RGB", "L"):
            image = image.convert("RGB")

        buffered = BytesIO()
        image.save(buffered, format="JPEG")  # Save the image to a buffer
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

def query_model(instruction, base64_image, timeout=60):
    """Send one image + text query to the chat-completions endpoint.

    The system prompt is prepended to ``instruction`` and sent together with
    the image (as a base64 JPEG data URL) in a single user message.

    Args:
        instruction: Text of the question to ask about the image.
        base64_image: Base64-encoded JPEG bytes, as text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled connection
            would hang the whole evaluation.

    Returns:
        The decoded JSON response as returned by the API, or an empty dict if
        the request failed, returned an HTTP error status, or the body was not
        valid JSON.
    """
    system_prompt = (
        "Answer Query Based on Image"
    )

    full_instruction = f"{system_prompt}\n\n{instruction}"

    url = "https://proxy.tune.app/chat/completions"
    headers = {
        # Read the key from the environment so it is never committed with the
        # notebook; the original placeholder is kept as the fallback value.
        "Authorization": os.environ.get("TUNE_API_KEY", "YOUR_TUNE_API_KEY"),
        "Content-Type": "application/json"
    }
    payload = {
        "temperature": 0.4,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": full_instruction},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }
        ],
        "model": "mistral/pixtral-12B-2409",
        "stream": False,
        "frequency_penalty": 0.2,
        "max_tokens": 300
    }
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors, timeouts and HTTP error
        # statuses; ValueError covers a response body that is not valid JSON.
        print(f"Error querying model: {e}")
        return {}

In [15]:
# Initialize lists
valid_samples = []
similarity_scores = []
rationale_scores = []

# Process dataset and accumulate results.
# For every sampled row: load its image, ask the vision model the question,
# split the reply into a first-line "label" and remaining "rationale", and
# score both against the reference texts with embedding cosine similarity.
for idx in random_samples:  # idx is a positional row index into df
    sample = df.iloc[idx]
    filename = sample.get('image', 'No filename available')
    image_path = os.path.join(IMAGE_DIR, filename)

    if not os.path.isfile(image_path):
        print(f"Image file not found: {image_path}, skipping...")
        continue  # Skip if image file is not found

    # Encode image to base64; a corrupt or unsupported file should cost one
    # sample, not abort the whole run.
    try:
        base64_image = encode_image(image_path)
    except OSError as e:
        print(f"Could not encode image {image_path}: {e}, skipping...")
        continue

    # Prepare instruction. A row with no options comes back from pandas as a
    # non-list missing value, which str.join cannot iterate.
    options = sample.get('options', [])
    if not isinstance(options, (list, tuple)):
        options = []
    instruction = f"Question: {sample.get('question', '')}\nOptions: {', '.join(map(str, options))}"

    # Query the model. query_model returns {} when the request failed; such
    # samples are skipped so API outages are not scored as wrong answers.
    response = query_model(instruction, base64_image)
    if not isinstance(response, dict) or not response:
        print(f"No response for {filename}, skipping...")
        continue

    # Extract model outputs, tolerating an empty "choices" list and a null
    # "content" field instead of raising IndexError / AttributeError.
    first_choice = (response.get("choices") or [{}])[0] or {}
    content = (first_choice.get("message") or {}).get("content") or ""
    choices = content.strip().split("\n")
    model_label = choices[0]  # str.split always yields at least one element
    model_rationale = "\n".join(choices[1:])

    # Extract reference outputs
    reference_label = sample.get('label', 'No label available')
    reference_rationale = sample.get('rationale', 'No rationale available')

    # Compute embeddings for reference outputs and model outputs
    label_embedding = model.encode(reference_label, convert_to_tensor=True)
    model_label_embedding = model.encode(model_label, convert_to_tensor=True)
    rationale_embedding = model.encode(reference_rationale, convert_to_tensor=True)
    model_rationale_embedding = model.encode(model_rationale, convert_to_tensor=True)

    # Calculate cosine similarity
    label_similarity = util.pytorch_cos_sim(model_label_embedding, label_embedding).item()
    rationale_similarity = util.pytorch_cos_sim(model_rationale_embedding, rationale_embedding).item()

    # Collect the results
    valid_samples.append({
        "filename": filename,
        "instruction": instruction,
        "model_label": model_label,
        "model_rationale": model_rationale,
        "reference_label": reference_label,
        "reference_rationale": reference_rationale,
        "label_similarity": label_similarity,
        "rationale_similarity": rationale_similarity
    })

In [18]:
# A similarity strictly above this value counts as a "win" for the model.
WIN_THRESHOLD = 0.8


def _win_rate(scores, threshold=WIN_THRESHOLD):
    """Return the fraction of ``scores`` strictly above ``threshold`` (0 if empty)."""
    return sum(score > threshold for score in scores) / len(scores) if len(scores) > 0 else 0


def _top_k_average(sorted_scores, k):
    """Return the mean of the first ``k`` scores, or 0 when fewer than ``k`` exist.

    ``sorted_scores`` is expected in descending order so the first ``k`` are
    the best ones. The 0 fallback mirrors the reporting convention used for
    the saved metrics: a top-k average is only reported once k samples exist.
    """
    return sum(sorted_scores[:k]) / k if len(sorted_scores) >= k else 0


# Pull each similarity column out of the per-sample records once.
label_scores = [sample['label_similarity'] for sample in valid_samples]
rationale_scores_all = [sample['rationale_similarity'] for sample in valid_samples]

# Calculate win rate based on the similarity threshold
label_win_rate = _win_rate(label_scores)
rationale_win_rate = _win_rate(rationale_scores_all)

# Calculate top-1, top-5, and top-10 average scores
sorted_label_scores = sorted(label_scores, reverse=True)
top_1_label_score = sorted_label_scores[0] if len(sorted_label_scores) >= 1 else 0
top_5_label_average = _top_k_average(sorted_label_scores, 5)
top_10_label_average = _top_k_average(sorted_label_scores, 10)

sorted_rationale_scores = sorted(rationale_scores_all, reverse=True)
top_1_rationale_score = sorted_rationale_scores[0] if len(sorted_rationale_scores) >= 1 else 0
top_5_rationale_average = _top_k_average(sorted_rationale_scores, 5)
top_10_rationale_average = _top_k_average(sorted_rationale_scores, 10)

# Save inference results to a file
results_df = pd.DataFrame(valid_samples)
results_df.to_csv('arxivqa_model_results.csv', index=False)

# Save metrics to a file (a single-row table, one column per metric)
metrics_df = pd.DataFrame([{
    "label_win_rate": label_win_rate,
    "rationale_win_rate": rationale_win_rate,
    "top_1_label_score": top_1_label_score,
    "top_5_label_average_score": top_5_label_average,
    "top_10_label_average_score": top_10_label_average,
    "top_1_rationale_score": top_1_rationale_score,
    "top_5_rationale_average_score": top_5_rationale_average,
    "top_10_rationale_average_score": top_10_rationale_average
}])
metrics_df.to_csv('arxivqa_dataset_scores.csv', index=False)

print("Evaluation complete. Results and metrics saved.")

Evaluation complete. Results and metrics saved.
