# 04 - LLM Classification Analysis

Analysis of the LLM-based market classification results.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Directory holding the processed data files, relative to the notebook.
DATA_DIR = Path('../data/processed')

# Read the LLM classification results into the notebook-wide frame `df`.
classifications_csv = DATA_DIR / 'llm_classifications.csv'
df = pd.read_csv(classifications_csv)
print(f'Loaded {len(df)} markets')

# Preview the first ten rows, restricted to the classification columns.
preview_columns = ['event_title', 'primary_theme', 'secondary_theme', 'confidence', 'reasoning']
df[preview_columns].head(10)

## Theme Distribution

In [None]:
# Two side-by-side panels: theme sizes on the left, confidence on the right.
fig, axes = plt.subplots(1, 2, figsize=(18, 6))
theme_ax, conf_ax = axes

# Left panel: horizontal bars with the number of markets per primary theme.
counts = df['primary_theme'].value_counts()
counts.plot(kind='barh', ax=theme_ax, color='steelblue')
theme_ax.set_title('Markets per Theme')
theme_ax.set_xlabel('Count')

# Right panel: histogram of confidence values with the 0.7 cut-off marked.
df['confidence'].hist(bins=20, ax=conf_ax, color='steelblue', edgecolor='black')
conf_ax.set_title('Confidence Distribution')
conf_ax.set_xlabel('Confidence')
conf_ax.axvline(0.7, color='red', linestyle='--', label='0.7 threshold')
conf_ax.legend()

# Persist the figure next to the processed data, then render it inline.
plt.tight_layout()
plt.savefig(str(DATA_DIR / 'classification_plots.png'), dpi=150)
plt.show()

# Summarise how many markets fall on each side of the 0.7 threshold.
meets_threshold = df['confidence'] >= 0.7
n_at_or_above = meets_threshold.sum()
pct_at_or_above = meets_threshold.mean() * 100
n_below = (df['confidence'] < 0.7).sum()
print(f'\nMarkets with confidence >= 0.7: {n_at_or_above} ({pct_at_or_above:.1f}%)')
print(f'Markets with confidence < 0.7: {n_below}')

## Confidence by Theme

In [None]:
def _count_present(values):
    """Return the number of non-missing entries in *values*."""
    return values.notna().sum()


# One row per primary theme: group size, mean and minimum confidence, and the
# number of markets in the group that also carry a secondary theme.
by_theme = df.groupby('primary_theme')
theme_stats = by_theme.agg(
    count=('confidence', 'size'),
    mean_confidence=('confidence', 'mean'),
    min_confidence=('confidence', 'min'),
    has_secondary=('secondary_theme', _count_present),
)

# Largest themes first; the bare expression below renders the table.
theme_stats = theme_stats.sort_values('count', ascending=False)
theme_stats

## Secondary Theme Analysis

In [None]:
# Keep only the markets that were given a secondary theme.
secondary_mask = df['secondary_theme'].notna()
secondary = df[secondary_mask]
n_secondary = len(secondary)

print(f'Markets with secondary theme: {n_secondary} ({n_secondary/len(df)*100:.1f}%)')
print('\nMost common secondary themes:')
top_secondary = secondary['secondary_theme'].value_counts().head(10)
print(top_secondary)

# Cross-tabulate primary against secondary themes; skipped when no market
# has a secondary theme.
if n_secondary > 0:
    ct = pd.crosstab(secondary['primary_theme'], secondary['secondary_theme'])
    print('\nPrimary → Secondary theme overlap:')
    display(ct)

## Uncategorized Markets

In [None]:
# Markets whose primary theme is 'uncategorized', highest confidence first.
is_uncategorized = df['primary_theme'] == 'uncategorized'
uncat = df[is_uncategorized].sort_values('confidence', ascending=False)
print(f'Uncategorized: {len(uncat)} markets')
print('\nSample uncategorized markets:')

# Show up to 20 examples: confidence, truncated title, and (when present)
# the truncated reasoning on an indented second line.
for _, market in uncat.head(20).iterrows():
    title = str(market["event_title"])[:80]
    print(f'  [{market["confidence"]:.2f}] {title}')
    rationale = market.get('reasoning')
    if pd.isna(rationale):
        continue
    print(f'         {str(rationale)[:100]}')

## Comparison with Original Basket Labels

In [None]:
# Cross-tabulate the original 'basket' label against the LLM primary theme.
# The whole comparison is skipped when the data has no 'basket' column.
if 'basket' in df.columns:
    comparison = pd.crosstab(df['basket'], df['primary_theme'])
    print('Original basket → LLM theme mapping:')
    display(comparison)

    # Compare how many distinct labels each scheme uses.
    n_baskets = df["basket"].nunique()
    n_themes = df["primary_theme"].nunique()
    print(f'\nOriginal baskets: {n_baskets} unique')
    print(f'LLM themes: {n_themes} unique')

## Validation Checks

In [None]:
# Sanity checks on the classification output. Each check prints a ✓ (pass)
# or ✗ (fail) mark next to the measured value.
print('VALIDATION CHECKS')
print('='*50)

# 1. Coverage: every market should have a primary theme. The mark is derived
#    from the counts (it used to be printed unconditionally), and an empty
#    frame is reported as a failure rather than a vacuous pass.
n_classified = df["primary_theme"].notna().sum()
coverage_ok = len(df) > 0 and n_classified == len(df)
print(f'1. Coverage: {n_classified}/{len(df)} markets classified {"✓" if coverage_ok else "✗"}')

# 2. Confidence distribution: passes when more than 70% of the markets have
#    confidence >= 0.7.
pct_above_07 = (df['confidence'] >= 0.7).mean() * 100
print(f'2. Confidence ≥ 0.7: {pct_above_07:.1f}% {"✓" if pct_above_07 > 70 else "✗"}')

# 3. Theme balance, computed over every theme except 'uncategorized'.
theme_counts = df[df['primary_theme'] != 'uncategorized']['primary_theme'].value_counts()
print('3. Theme balance:')
if theme_counts.empty:
    # idxmin()/idxmax() raise ValueError on an empty Series, so report the
    # failure explicitly instead of crashing the cell.
    print('   No categorized markets found ✗')
else:
    print(f'   Min theme size: {theme_counts.min()} ({theme_counts.idxmin()})')
    print(f'   Max theme size: {theme_counts.max()} ({theme_counts.idxmax()})')
    print(f'   No theme > 500: {"✓" if theme_counts.max() <= 500 else "✗"}')
    # NOTE: value_counts() lists only observed values, so this check can only
    # fail for categorical data; check 4 below covers themes with no markets.
    print(f'   All themes > 0: {"✓" if theme_counts.min() > 0 else "✗"}')

# 4. No empty themes: compare the themes returned by list_themes() with the
#    themes observed in the data.
import sys

# Make the project root importable; guarded so that re-running the cell does
# not keep prepending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.insert(0, '..')
from src.classification.taxonomy import list_themes

all_themes = list_themes()
missing = set(all_themes) - set(theme_counts.index)
print(f'4. Themes with 0 markets: {missing if missing else "none ✓"}')