# Dataset Exploration — Find It Again! Receipt Dataset

This notebook explores the distribution of the **Find It Again!** receipt dataset used for the LLM-Judge Fake Receipt Detector.

**Minimum required:**
- REAL vs FAKE count
- Distribution of receipt totals (histogram)
- Optional: image size, file size, aspect ratio, sharpness

In [None]:
import sys
sys.path.insert(0, '..')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import cv2

from pipeline.dataset import DatasetManager

# Apply the seaborn theme first, then set the default figure size (inches)
# so the size is configured after the theme has been applied.
sns.set_theme(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})

## 1. Load Dataset Labels

In [None]:
# Load the receipt labels through the project's DatasetManager.
dm = DatasetManager()
labels = dm.load_labels()

# One row per (receipt id, label) pair. Naming the columns explicitly keeps
# 'id' and 'label' present even when no labels were loaded, so the
# value_counts() call below does not fail with a KeyError on an empty frame.
df = pd.DataFrame(list(labels.items()), columns=["id", "label"])

print(f"Total receipts: {len(df)}")
print(df['label'].value_counts().to_string())

## 2. REAL vs FAKE Distribution

In [None]:
# Bar chart of how many receipts carry each label.
counts = df['label'].value_counts()

# value_counts() orders labels by frequency, not by name, so colours are
# looked up per label instead of being assigned by bar position. Labels
# other than REAL/FAKE fall back to a neutral grey.
label_colors = {'REAL': '#2ecc71', 'FAKE': '#e74c3c'}
bar_colors = [label_colors.get(name, '#95a5a6') for name in counts.index]

fig, ax = plt.subplots()
bars = ax.bar(counts.index, counts.values, color=bar_colors, edgecolor='black')
ax.bar_label(bars, fmt='%d', padding=3)
ax.set_title('REAL vs FAKE Receipt Counts')
ax.set_ylabel('Count')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/eda_label_distribution.png', dpi=150)
plt.show()

## 3. Image Metadata Collection
Collect width, height, aspect ratio, megapixels, file size, and sharpness (Laplacian variance) for every image that can be located.

In [None]:
# Build one metadata record per receipt image that can be located and read.
records = []
missing_ids = []  # receipts for which dm.find_image() returned None

for receipt_id, label in zip(df['id'], df['label']):
    img_path = dm.find_image(receipt_id)
    if img_path is None:
        missing_ids.append(receipt_id)
        continue
    try:
        # The context manager closes the underlying file handle once the
        # pixel data has been copied into a NumPy array.
        with Image.open(img_path) as img:
            w, h = img.size
            gray = cv2.cvtColor(np.array(img.convert('RGB')), cv2.COLOR_RGB2GRAY)

        file_kb = img_path.stat().st_size / 1024

        # Sharpness via Laplacian variance
        sharpness = cv2.Laplacian(gray, cv2.CV_64F).var()

        records.append({
            'id': receipt_id,
            'label': label,
            'width': w,
            'height': h,
            'aspect_ratio': round(h / w, 2),
            'megapixels': round(w * h / 1e6, 2),
            'file_kb': round(file_kb, 1),
            'sharpness': round(sharpness, 2),
        })
    except Exception as e:
        # Deliberately broad: one unreadable image should not abort the scan.
        print(f"Error processing {receipt_id}: {e}")

# Explicit columns keep the frame's schema intact even when nothing was read.
meta_columns = [
    'id', 'label', 'width', 'height',
    'aspect_ratio', 'megapixels', 'file_kb', 'sharpness',
]
meta_df = pd.DataFrame(records, columns=meta_columns)

if missing_ids:
    print(f"Skipped {len(missing_ids)} receipts with no image found")

if meta_df.empty:
    print("No image metadata collected")
else:
    print(meta_df.groupby('label')[['width', 'height', 'file_kb', 'sharpness']].describe().round(1))

## 4. Image Size Distribution

In [None]:
# Left panel: width/height scatter per label. Right panel: file-size boxplot.
fig, (dims_ax, size_ax) = plt.subplots(1, 2, figsize=(14, 5))

for label, color in [('REAL', '#2ecc71'), ('FAKE', '#e74c3c')]:
    subset = meta_df[meta_df['label'] == label]
    dims_ax.scatter(subset['width'], subset['height'], alpha=0.5, label=label, color=color, s=10)

dims_ax.set_xlabel('Width (px)')
dims_ax.set_ylabel('Height (px)')
dims_ax.set_title('Image Dimensions by Label')
dims_ax.legend()

meta_df.boxplot(column='file_kb', by='label', ax=size_ax)
size_ax.set_title('File Size Distribution (KB)')
size_ax.set_xlabel('Label')
size_ax.set_ylabel('KB')
# Clear the figure-level title that the grouped boxplot adds.
plt.suptitle('')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/eda_image_size.png', dpi=150)
plt.show()

## 5. Aspect Ratio & Sharpness

In [None]:
# Overlaid per-label histograms of aspect ratio (left) and sharpness (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Compute the bin edges once over all images so the REAL and FAKE histograms
# share the same bins and their bars can be compared directly.
aspect_bins = np.histogram_bin_edges(meta_df['aspect_ratio'], bins=20)
sharpness_bins = np.histogram_bin_edges(meta_df['sharpness'], bins=30)

for label, color in [('REAL', '#2ecc71'), ('FAKE', '#e74c3c')]:
    subset = meta_df[meta_df['label'] == label]
    axes[0].hist(subset['aspect_ratio'], bins=aspect_bins, alpha=0.6, label=label, color=color)
    axes[1].hist(subset['sharpness'], bins=sharpness_bins, alpha=0.6, label=label, color=color)

axes[0].set_title('Aspect Ratio Distribution (height/width)')
axes[0].set_xlabel('Aspect Ratio')
axes[0].legend()

axes[1].set_title('Sharpness Distribution (Laplacian Variance)')
axes[1].set_xlabel('Sharpness')
axes[1].legend()

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
Path('../outputs').mkdir(parents=True, exist_ok=True)
plt.savefig('../outputs/eda_aspect_sharpness.png', dpi=150)
plt.show()

## 6. Summary Statistics

In [None]:
# Print overall and per-label image counts, followed by a per-label table of
# mean/std/min/max for the numeric metadata columns.
stat_columns = ['width', 'height', 'file_kb', 'sharpness', 'aspect_ratio']
real_count = int((meta_df['label'] == 'REAL').sum())
fake_count = int((meta_df['label'] == 'FAKE').sum())

print("=== DATASET SUMMARY ===")
print(f"Total images: {len(meta_df)}")
print(f"REAL: {real_count}")
print(f"FAKE: {fake_count}")
print()

per_label_stats = meta_df.groupby('label')[stat_columns].agg(['mean', 'std', 'min', 'max'])
print(per_label_stats.round(2).to_string())