# Hallucination Dataset Analysis

This notebook loads and analyzes hallucination datasets from three Vision-Language Models:
- Gemma 3 12B
- Qwen2.5-VL-7B
- Molmo-7B-O-0924

Each dataset contains 10,000 VQA samples with model-generated answers and ground truth.

## 1. Import Libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Silence every warning for the rest of the session so library notices do not
# clutter the cell outputs below.
warnings.filterwarnings('ignore')

# pandas display: never hide columns; cut cell text off at 100 characters.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

# Plot defaults: seaborn white-grid theme and a 12 x 6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## 2. Load Datasets

In [13]:
# Define paths.
# base_path is the single root of the project data; every file below is
# derived from it, so relocating the data only requires editing this line.
base_path = Path('/Users/saiakhil/Documents/Thesis/HALP_EACL')

gemma_path = base_path / 'Gemma_3' / 'gemma3_hallucination_dataset.csv'
qwen_path = base_path / 'Qwen25_VL' / 'qwen25vl_hallucination_dataset.csv'
molmo_path = base_path / 'Molmo_V1' / 'molmo_hallucination_dataset.csv'
# Built from base_path rather than a second hard-coded absolute string, so the
# two can never drift apart; it resolves to the same file as before.
original_path = base_path / 'Datasets' / 'final_data' / 'sampled_10k_relational_dataset.csv'

# Load datasets, reporting the row count of each as it is read
print("Loading datasets...\n")

gemma_df = pd.read_csv(gemma_path)
print(f"✓ Gemma 3 12B: {len(gemma_df):,} samples")

qwen_df = pd.read_csv(qwen_path)
print(f"✓ Qwen2.5-VL-7B: {len(qwen_df):,} samples")

molmo_df = pd.read_csv(molmo_path)
print(f"✓ Molmo-7B-O-0924: {len(molmo_df):,} samples")

# The sampled source dataset that the model outputs above were generated from
# (presumably; verify against the generation scripts)
ori_df = pd.read_csv(original_path)
print(f"✓ original_df: {len(ori_df):,} samples")

print("\n✅ All datasets loaded successfully!")

Loading datasets...

✓ Gemma 3 12B: 10,000 samples
✓ Qwen2.5-VL-7B: 10,000 samples
✓ Molmo-7B-O-0924: 10,000 samples
✓ original_df: 10,000 samples

✅ All datasets loaded successfully!


In [8]:
# Preview the first three Gemma rows to confirm the CSV parsed as expected
gemma_df.head(3)

Unnamed: 0,question_id,image_id,question,ground_truth_answer,model_answer
0,question_comb_1,haloquest_2082.png,How many sharks are present in the travel brochure?,There are no sharks in the brochure ; The travel brochure is without sharks ; There are zero sha...,"Based on the image, there are **no sharks** visible in the travel brochure. \n\nThe water appear..."
1,question_comb_10,AMBER_348.jpg,Is the forest withering in this image?,"n, o",Let's analyze the image to determine if the forest is withering.\n\n**Observations:**\n\n* **T...
2,question_comb_100,haloquest_1937.png,What does Donald Trump's hat say?,He does not have a hat; He is without a hat; A hat is missing from the image,"Based on the image, Donald Trump's hat appears to say ""**Make America Great Again**"". It's the i..."


In [9]:
# Show the column labels of the Gemma frame
gemma_df.columns

Index(['question_id', 'image_id', 'question', 'ground_truth_answer',
       'model_answer'],
      dtype='object')

## 3. Dataset Overview

In [None]:
# Print the Gemma frame's column labels, then a blank line and an
# 80-character rule.
print("Column Names:", gemma_df.columns.tolist(), "\n" + "=" * 80, sep="\n")

In [None]:
# Banner followed by a rich-rendered preview of the first three Gemma rows
print("\n📊 GEMMA 3 12B - First 3 Samples", "=" * 80, sep="\n")
display(gemma_df.iloc[:3])

In [None]:
# Banner followed by a rich-rendered preview of the first three Qwen rows
print("\n📊 QWEN2.5-VL-7B - First 3 Samples", "=" * 80, sep="\n")
display(qwen_df.iloc[:3])

In [None]:
# Banner followed by a rich-rendered preview of the first three Molmo rows
print("\n📊 MOLMO-7B - First 3 Samples", "=" * 80, sep="\n")
display(molmo_df.iloc[:3])

## 4. Basic Statistics

In [None]:
# Summary statistics.
# Pair each display name with its frame once, so every statistic below is
# computed in the same model order.
frames_by_model = {
    'Gemma 3 12B': gemma_df,
    'Qwen2.5-VL-7B': qwen_df,
    'Molmo-7B': molmo_df,
}


def _per_model(stat):
    """Apply `stat` to each model's frame and return the results in model order."""
    return [stat(frame) for frame in frames_by_model.values()]


stats_data = {
    'Model': list(frames_by_model),
    'Total Samples': _per_model(len),
    'Unique Images': _per_model(lambda frame: frame['image_id'].nunique()),
    'Unique Questions': _per_model(lambda frame: frame['question_id'].nunique()),
    # Mean number of characters in the model's answer text
    'Avg Answer Length': _per_model(lambda frame: frame['model_answer'].str.len().mean()),
}

stats_df = pd.DataFrame(stats_data)
print("\n📈 Dataset Statistics")
print("="*80)
display(stats_df)

## 5. Answer Length Analysis

In [None]:
# Character count of every model answer, stored on the frames themselves
# because the summary-statistics cell further down reads `answer_length`.
for frame in (gemma_df, qwen_df, molmo_df):
    frame['answer_length'] = frame['model_answer'].str.len()
# Ground-truth length is only computed for the Gemma frame.
gemma_df['gt_length'] = gemma_df['ground_truth_answer'].str.len()

# One histogram panel per model, each with a dashed red line at its mean
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

panel_specs = [
    ('Gemma 3 12B', gemma_df, '#1f77b4'),
    ('Qwen2.5-VL-7B', qwen_df, '#ff7f0e'),
    ('Molmo-7B', molmo_df, '#2ca02c'),
]
for panel, (model_name, frame, face_color) in zip(axes, panel_specs):
    lengths = frame['answer_length']
    mean_length = lengths.mean()

    panel.hist(lengths, bins=50, color=face_color, alpha=0.7, edgecolor='black')
    panel.set_title(f'{model_name} - Answer Length Distribution', fontsize=14, fontweight='bold')
    panel.set_xlabel('Answer Length (characters)')
    panel.set_ylabel('Frequency')
    panel.axvline(mean_length, color='red', linestyle='--',
                  label=f'Mean: {mean_length:.0f}')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plot comparing the answer-length distribution of the three models.
# Requires the `answer_length` column added by the histogram cell above.
fig, ax = plt.subplots(figsize=(10, 6))

model_labels = ['Gemma 3 12B', 'Qwen2.5-VL-7B', 'Molmo-7B']

# Missing answers give NaN lengths; drop them so a single NaN cannot turn a
# model's quartiles into NaN and blank out its box. Plain NumPy arrays are
# passed so boxplot never depends on the (now gappy) pandas index.
data_to_plot = [
    gemma_df['answer_length'].dropna().to_numpy(),
    qwen_df['answer_length'].dropna().to_numpy(),
    molmo_df['answer_length'].dropna().to_numpy()
]

# Tick labels are set on the axis instead of through boxplot's `labels=`
# keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels=`;
# set_xticklabels works on both older and newer releases.
bp = ax.boxplot(data_to_plot, patch_artist=True, showmeans=True)
ax.set_xticklabels(model_labels)

# Colour each box to match the histogram colours used for the same model
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_title('Answer Length Comparison Across Models', fontsize=16, fontweight='bold')
ax.set_ylabel('Answer Length (characters)', fontsize=12)
ax.set_xlabel('Model', fontsize=12)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Compare Answers for Same Questions

In [None]:
# Find the question IDs that all three models answered
common_ids = set(gemma_df['question_id']) & set(qwen_df['question_id']) & set(molmo_df['question_id'])
print(f"Common questions across all models: {len(common_ids):,}")

# Create merged dataset: the Gemma frame supplies the shared question columns,
# the other two frames contribute only their answer column.
merged_df = gemma_df[['question_id', 'image_id', 'question', 'ground_truth_answer', 'model_answer']].copy()
merged_df = merged_df.rename(columns={'model_answer': 'gemma_answer'})

qwen_temp = qwen_df[['question_id', 'model_answer']].copy()
qwen_temp = qwen_temp.rename(columns={'model_answer': 'qwen_answer'})

molmo_temp = molmo_df[['question_id', 'model_answer']].copy()
molmo_temp = molmo_temp.rename(columns={'model_answer': 'molmo_answer'})

# Inner joins keep only questions present in every frame
merged_df = merged_df.merge(qwen_temp, on='question_id', how='inner')
merged_df = merged_df.merge(molmo_temp, on='question_id', how='inner')

print(f"\n✓ Merged dataset created with {len(merged_df):,} samples")
print("\nColumns:", merged_df.columns.tolist())

# The joins produce exactly one row per common ID only when question_id is
# unique in every frame; a repeated ID multiplies rows silently. Surface that
# here instead of letting inflated counts flow into the comparisons and export.
if len(merged_df) != len(common_ids):
    print(
        f"\n⚠️ Merged row count ({len(merged_df):,}) differs from the number of "
        f"common question IDs ({len(common_ids):,}); check for duplicate question_id values"
    )

## 7. Sample Comparisons

In [None]:
# Function to display a comparison
def display_comparison(idx, df=None):
    """Print one question with its ground truth and all three model answers.

    Args:
        idx: Zero-based row position (as used by ``DataFrame.iloc``) of the
            sample to show. The printed header numbers samples from 1.
        df: Frame to read the row from. It must provide the columns
            ``image_id``, ``question_id``, ``question``,
            ``ground_truth_answer``, ``gemma_answer``, ``qwen_answer`` and
            ``molmo_answer``. Defaults to the notebook-level ``merged_df``.

    Returns:
        None. The comparison is written to standard output.

    Raises:
        IndexError: If ``idx`` is outside the frame (raised by ``iloc``).
    """
    # Resolve the default at call time so the function always reads the
    # current merged_df, even after the merge cell has been re-run.
    source = merged_df if df is None else df
    row = source.iloc[idx]

    print("="*100)
    print(f"SAMPLE {idx + 1}")
    print("="*100)
    print(f"\n📷 Image: {row['image_id']}")
    print(f"📝 Question ID: {row['question_id']}")
    print(f"\n❓ Question:\n   {row['question']}")
    print(f"\n✅ Ground Truth:\n   {row['ground_truth_answer']}")
    print("\n" + "-"*100)
    print("\n🤖 Gemma 3 12B:")
    print(f"   {row['gemma_answer']}")
    print("\n🤖 Qwen2.5-VL-7B:")
    print(f"   {row['qwen_answer']}")
    print("\n🤖 Molmo-7B:")
    print(f"   {row['molmo_answer']}")
    print("\n")

# Display the first 5 samples, or fewer when the merged frame is smaller
# (a fixed range(5) would raise IndexError on a short frame)
for i in range(min(5, len(merged_df))):
    display_comparison(i)

## 8. Random Sample Comparison

In [None]:
# Display a random sample (run this cell multiple times to see different samples)
import random

# random.randint(0, -1) raises ValueError, so only draw when the merge
# actually produced rows.
if len(merged_df) > 0:
    random_idx = random.randint(0, len(merged_df) - 1)
    display_comparison(random_idx)
else:
    print("No samples available: merged_df is empty")

## 9. Search for Specific Questions

In [None]:
# Search for questions containing specific keywords
search_term = "shark"  # Change this to search for different terms

# regex=False makes this a literal, case-insensitive substring match, so
# keywords containing regex metacharacters such as "?" or "(" are searched as
# typed instead of being compiled as a pattern. Rows with a missing question
# count as non-matches (na=False).
matches = merged_df[merged_df['question'].str.contains(search_term, case=False, na=False, regex=False)]
print(f"Found {len(matches)} questions containing '{search_term}'\n")

if len(matches) > 0:
    # Display first match: translate its index label into the row position
    # that display_comparison expects
    idx = matches.index[0]
    row_idx = merged_df.index.get_loc(idx)
    display_comparison(row_idx)

## 10. Export Merged Dataset

In [None]:
# Write the merged comparison table to the project root as CSV, leaving out
# the row index, then report where it went and how large it is.
output_path = base_path / 'merged_hallucination_dataset.csv'
merged_df.to_csv(output_path, index=False)

print(
    f"✅ Merged dataset saved to: {output_path}",
    f"   Total samples: {merged_df.shape[0]:,}",
    f"   Columns: {merged_df.shape[1]}",
    sep="\n",
)

## 11. Summary Statistics Table

In [None]:
# Create comprehensive summary.
# Requires the `answer_length` column added in the answer-length section.
def _length_summary(frame):
    """Return one model's summary cells, in the same order as the 'Metric' column."""
    lengths = frame['answer_length']
    return [
        len(frame),
        f"{lengths.mean():.1f}",
        f"{lengths.median():.1f}",
        f"{lengths.min():.0f}",
        f"{lengths.max():.0f}",
        f"{lengths.std():.1f}",
    ]


summary_stats = pd.DataFrame({
    'Metric': [
        'Total Samples',
        'Mean Answer Length',
        'Median Answer Length',
        'Min Answer Length',
        'Max Answer Length',
        'Std Answer Length',
    ],
    'Gemma 3 12B': _length_summary(gemma_df),
    'Qwen2.5-VL-7B': _length_summary(qwen_df),
    'Molmo-7B': _length_summary(molmo_df),
})

print("\n📊 Comprehensive Summary Statistics")
print("="*80)
display(summary_stats)

## 12. Data Quality Checks

In [None]:
# Per-column count of missing values for each model's frame.
# The leading "\n" on the later headings separates the reports with a blank line.
print("Missing Values:\n")
for heading, frame in (
    ("Gemma 3:", gemma_df),
    ("\nQwen2.5-VL:", qwen_df),
    ("\nMolmo-7B:", molmo_df),
):
    print(heading)
    print(frame.isnull().sum())

In [None]:
# Count rows whose question_id repeats an earlier row, per model
print("Duplicate Question IDs:\n")
for model_name, frame in (
    ("Gemma 3", gemma_df),
    ("Qwen2.5-VL", qwen_df),
    ("Molmo-7B", molmo_df),
):
    print(f"{model_name}: {frame['question_id'].duplicated().sum()}")

## 13. Quick Access to Individual Datasets

In [None]:
# Reminder of the frames this notebook leaves in memory and how to query them
print(
    "Available DataFrames:",
    "  - gemma_df: Gemma 3 12B dataset",
    "  - qwen_df: Qwen2.5-VL-7B dataset",
    "  - molmo_df: Molmo-7B dataset",
    "  - merged_df: Merged dataset with all three models",
    "\nExample usage:",
    "  gemma_df.head()",
    "  merged_df[merged_df['question'].str.contains('color')]",
    sep="\n",
)