# IMDb Show Vote Distribution Analysis

This notebook analyzes the vote distribution of TV shows in the IMDb dataset to help determine optimal cutoff thresholds for the heatmap application.

**Prerequisites:**
- `pip install -r requirements-analysis.txt`
- Run `python scripts/create_dataset.py -n <large_number>` first to generate `data/titleId.json` with all shows
- Alternatively, use the existing `data/titleId.json` (the analysis will then cover only the shows currently in the catalog)

In [None]:
# =============================================================================
# PARAMETERS - Tune these values to customize the analysis
# =============================================================================

# --- Input locations ---
DATA_DIR = "../data"  # directory that holds titleId.json
VOTES_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# --- Minimum-vote thresholds to compare ---
VOTE_THRESHOLDS = [10_000, 5_000, 2_500, 1_000, 500, 250, 100, 50]

# --- Top-N catalog sizes to compare ---
TOP_N_VALUES = [1_000, 2_500, 5_000, 7_500, 10_000, 15_000, 20_000, 25_000, 50_000]

# --- Ranks around which sample shows are printed ---
SAMPLE_RANKS = [2_500, 5_000, 10_000, 15_000, 25_000]

# --- Cutoff markers drawn on the charts, as (value, matplotlib color) pairs ---
CHART_CUTOFFS = list(zip(
    (2_500, 5_000, 10_000, 15_000, 25_000),
    ("red", "orange", "green", "purple", "brown"),
))

# --- Chart display limits ---
MAX_RANK_DISPLAY = 30_000  # highest rank shown on the x-axis
RATE_OF_CHANGE_WINDOW = 100  # moving-average width for the rate-of-change chart

# --- Size estimation constants (used to project data sizes) ---
EST_KB_PER_SHOW = 1.5  # average per-show JSON file size, in KB
EST_CATALOG_BYTES_PER_SHOW = 44  # bytes each entry occupies in titleId.json
EST_GZIP_RATIO = 0.32  # typical gzip compression ratio for the catalog

# --- Custom analysis ---
CUSTOM_THRESHOLD = 250  # threshold examined by the custom lookup cell
SEARCH_TERM = ""  # text for the show search cell (empty string = skip)

In [None]:
import json
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

## 1. Load Data

In [None]:
# Load the show catalog from titleId.json; each record is read for its
# "id" and "title" keys below.
print("Loading titleId.json...")
# Decode explicitly as UTF-8: without an encoding argument open() falls back to
# the platform default, which can corrupt or reject non-ASCII show titles.
with open(Path(DATA_DIR) / "titleId.json", encoding="utf-8") as f:
    shows = json.load(f)

# show_ids: set used to filter the ratings table down to catalog shows.
# titles:   id -> title lookup used when printing sample shows.
show_ids = {s["id"] for s in shows}
titles = {s["id"]: s["title"] for s in shows}
print(f"Found {len(show_ids):,} shows in titleId.json")

In [None]:
# Fetch the IMDb ratings table, keeping only the ID and vote-count columns.
print("Downloading IMDb ratings data (this may take a minute)...")
ratings_read_options = {
    "separator": "\t",
    "columns": ["tconst", "numVotes"],
    "null_values": ["\\N"],  # the literal \N is parsed as null
    "quote_char": None,  # no quote handling while parsing
}
votes_df = pl.read_csv(VOTES_URL, **ratings_read_options)
print(f"Downloaded {len(votes_df):,} ratings")

# Restrict to catalog shows and order them from most-voted to least-voted.
is_catalog_show = pl.col("tconst").is_in(show_ids)
show_votes = votes_df.filter(is_catalog_show).sort("numVotes", descending=True)
print(f"Matched {len(show_votes):,} shows with vote data")

## 2. Basic Statistics

In [None]:
votes_col = show_votes["numVotes"]

# Each entry: (label, value, format spec). Labels are padded to 14 characters
# so the numbers line up in a single column.
vote_stats = [
    ("Min votes:", votes_col.min(), ">10,"),
    ("Max votes:", votes_col.max(), ">10,"),
    ("Median votes:", votes_col.median(), ">10,.0f"),
    ("Mean votes:", votes_col.mean(), ">10,.0f"),
    ("Std dev:", votes_col.std(), ">10,.0f"),
]

print("=== Vote Distribution Stats ===")
for stat_label, stat_value, stat_spec in vote_stats:
    print(f"{stat_label:<14}{stat_value:{stat_spec}}")

In [None]:
# For each percentage, report the vote count of the last show that still falls
# inside the top pct% (show_votes is sorted by votes, highest first).
print("=== Vote Percentiles (position from top) ===")
n_shows = len(show_votes)
if n_shows == 0:
    # Nothing matched the catalog; indexing the first row would raise.
    print("No shows with vote data - nothing to report.")
else:
    ranked_votes = show_votes["numVotes"]  # read by name, not tuple position
    for pct in [1, 5, 10, 25, 50, 75, 90, 95, 99]:
        # Clamp to at least one show so small datasets never report
        # "0 shows" next to the top show's vote count.
        n = max(1, n_shows * pct // 100)
        votes_at_n = ranked_votes[n - 1]
        print(f"Top {pct:>2}% ({n:>6,} shows): >= {votes_at_n:>7,} votes")

## 3. Threshold Analysis

In [None]:
def analyze_threshold(min_votes: int) -> dict:
    """Summarize the catalog that results from a minimum-vote cutoff.

    Args:
        min_votes: Inclusive lower bound on ``numVotes`` for a show to be kept.

    Returns:
        A dict with keys:
        ``threshold`` (the input), ``num_shows`` (shows at or above the
        threshold), ``vote_coverage_pct`` (share of all votes in ``show_votes``
        held by those shows, 0-100), ``est_data_mb`` (projected per-show data
        size in MB) and ``est_catalog_kb_gzip`` (projected gzipped catalog
        size in KB).
    """
    above = show_votes.filter(pl.col("numVotes") >= min_votes)
    num_shows = len(above)
    total_votes = show_votes["numVotes"].sum()
    covered_votes = above["numVotes"].sum()

    # With no vote data the total is 0; report 0% coverage rather than
    # raising ZeroDivisionError.
    vote_coverage_pct = covered_votes / total_votes * 100 if total_votes else 0.0

    return {
        "threshold": min_votes,
        "num_shows": num_shows,
        "vote_coverage_pct": vote_coverage_pct,
        # KB per show -> MB
        "est_data_mb": num_shows * EST_KB_PER_SHOW / 1024,
        # bytes per entry -> KB, then scaled by the gzip ratio
        "est_catalog_kb_gzip": num_shows * EST_CATALOG_BYTES_PER_SHOW / 1024 * EST_GZIP_RATIO,
    }

# Evaluate every configured threshold, then print a fixed-width comparison table.
analysis = list(map(analyze_threshold, VOTE_THRESHOLDS))

threshold_columns = [("Threshold", 12), ("Shows", 10), ("Coverage", 10), ("Data", 10), ("Catalog", 12)]

print("=== Threshold Comparison ===")
print(" ".join(col_name.rjust(col_width) for col_name, col_width in threshold_columns))
print("-" * 56)
for row in analysis:
    threshold_cells = [
        f">={row['threshold']:<10,}",
        f"{row['num_shows']:>10,}",
        f"{row['vote_coverage_pct']:>9.1f}%",
        f"{row['est_data_mb']:>8.1f} MB",
        f"{row['est_catalog_kb_gzip']:>10.0f} KB",
    ]
    print(" ".join(threshold_cells))

In [None]:
def analyze_top_n(n: int) -> dict:
    """Summarize the catalog made of the ``n`` most-voted shows.

    Args:
        n: Number of shows to keep; capped at the number of shows available.

    Returns:
        A dict with keys:
        ``top_n`` (the capped count), ``min_votes`` (votes of the last show
        kept, or 0 when none are kept), ``vote_coverage_pct`` (share of all
        votes in ``show_votes`` held by the kept shows, 0-100),
        ``est_data_mb`` (projected per-show data size in MB) and
        ``est_catalog_kb_gzip`` (projected gzipped catalog size in KB).
    """
    n = min(n, len(show_votes))

    top_n = show_votes.head(n)
    top_votes = top_n["numVotes"]  # read by name, not tuple position
    total_votes = show_votes["numVotes"].sum()

    # show_votes is sorted by votes descending, so the last kept row has the
    # lowest vote count in the selection.
    min_votes = top_votes[-1] if len(top_n) > 0 else 0

    # With no vote data the total is 0; report 0% coverage rather than
    # raising ZeroDivisionError (the empty case is already handled above).
    vote_coverage_pct = top_votes.sum() / total_votes * 100 if total_votes else 0.0

    return {
        "top_n": n,
        "min_votes": min_votes,
        "vote_coverage_pct": vote_coverage_pct,
        # KB per show -> MB
        "est_data_mb": n * EST_KB_PER_SHOW / 1024,
        # bytes per entry -> KB, then scaled by the gzip ratio
        "est_catalog_kb_gzip": n * EST_CATALOG_BYTES_PER_SHOW / 1024 * EST_GZIP_RATIO,
    }

# Evaluate every configured catalog size, then print a fixed-width comparison table.
top_n_analysis = list(map(analyze_top_n, TOP_N_VALUES))

top_n_columns = [("Top N", 10), ("Min Votes", 12), ("Coverage", 10), ("Data", 10), ("Catalog", 12)]

print("=== Top-N Comparison ===")
print(" ".join(col_name.rjust(col_width) for col_name, col_width in top_n_columns))
print("-" * 56)
for row in top_n_analysis:
    top_n_cells = [
        f"{row['top_n']:>10,}",
        f"{row['min_votes']:>12,}",
        f"{row['vote_coverage_pct']:>9.1f}%",
        f"{row['est_data_mb']:>8.1f} MB",
        f"{row['est_catalog_kb_gzip']:>10.0f} KB",
    ]
    print(" ".join(top_n_cells))

## 4. Sample Shows at Boundaries

See what shows exist at different cutoff points to assess quality.

In [None]:
def show_boundary_samples(threshold: int, n_samples: int = 10, window: int = 20):
    """Print sample shows whose vote count lies near a vote threshold.

    Args:
        threshold: Vote count at the center of the window.
        n_samples: Maximum number of shows to print.
        window: Half-width of the vote window; shows with
            ``threshold - window <= numVotes <= threshold + window`` qualify.
            Defaults to 20.

    Note:
        ``show_votes`` is sorted by votes descending and the first
        ``n_samples`` matches are taken, so when many shows qualify the
        samples come from the upper end of the window.
    """
    boundary = show_votes.filter(
        (pl.col("numVotes") >= threshold - window) &
        (pl.col("numVotes") <= threshold + window)
    ).head(n_samples)

    print(f"\n=== Shows around {threshold:,} votes ===")
    if len(boundary) == 0:
        # Say so explicitly instead of leaving a bare header.
        print(f"  (no shows within +/-{window:,} votes)")
        return

    for row in boundary.iter_rows(named=True):
        # Rank = 1 + number of shows with strictly more votes (ties share a rank).
        rank = show_votes.filter(pl.col("numVotes") > row["numVotes"]).height + 1
        title = titles.get(row["tconst"], "Unknown")[:45]
        print(f"  {rank:>6}: {title:<45} ({row['numVotes']:>5,} votes)")

# Show samples at key thresholds
for threshold in VOTE_THRESHOLDS[:6]:  # First 6 thresholds
    show_boundary_samples(threshold)

In [None]:
def show_rank_samples(rank: int):
    """Print the shows at fixed offsets (-5, -2, 0, +2, +5) from a 1-based rank.

    Positions that fall outside the ranked list are skipped.
    """
    print(f"\n=== Shows around rank {rank:,} ===")
    last_position = len(show_votes)
    for position in (rank + delta for delta in (-5, -2, 0, 2, 5)):
        if not 1 <= position <= last_position:
            continue
        # Ranks are 1-based; rows are 0-based.
        entry = show_votes.row(position - 1, named=True)
        name = titles.get(entry["tconst"], "Unknown")[:45]
        print(f"  {position:>6}: {name:<45} ({entry['numVotes']:>6,} votes)")

for rank in SAMPLE_RANKS:
    show_rank_samples(rank)

## 5. Visualizations

In [None]:
# Convert to numpy for matplotlib.
# show_votes is sorted by votes descending, so index i corresponds to rank i + 1.
votes_array = show_votes["numVotes"].to_numpy()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Rate of Vote Decline by Rank (smoothed derivative)
ax1 = axes[0, 0]
max_rank_for_rate = min(20000, len(votes_array) - 1)
# np.diff on a descending array is <= 0; negate to get the drop per rank.
vote_drops = np.diff(votes_array[:max_rank_for_rate + 1]) * -1  # Make positive
# Moving average over RATE_OF_CHANGE_WINDOW consecutive drops.
smoothed_drops = np.convolve(vote_drops, np.ones(RATE_OF_CHANGE_WINDOW) / RATE_OF_CHANGE_WINDOW, mode='valid')
ax1.plot(range(1, len(smoothed_drops) + 1), smoothed_drops, color='steelblue', linewidth=1)
ax1.set_xlabel('Show Rank')
ax1.set_ylabel(f'Avg Vote Drop per Rank ({RATE_OF_CHANGE_WINDOW}-rank window)')
ax1.set_title('Rate of Vote Decline by Rank')
ax1.set_yscale('log')
for cutoff, color in CHART_CUTOFFS[:4]:
    ax1.axvline(x=cutoff, color=color, linestyle='--', alpha=0.7, label=f'{cutoff:,}')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Cumulative vote coverage
ax2 = axes[0, 1]
cumulative_votes = np.cumsum(votes_array)
total_votes = votes_array.sum()
pct_coverage = (cumulative_votes / total_votes) * 100
ax2.plot(range(1, len(pct_coverage) + 1), pct_coverage, color='steelblue', linewidth=1.5)
ax2.set_xlabel('Number of Shows Included')
ax2.set_ylabel('% of Total Votes Covered')
ax2.set_title('Cumulative Vote Coverage')
ax2.set_xlim(0, MAX_RANK_DISPLAY)
ax2.grid(True, alpha=0.3)
for cutoff, color in CHART_CUTOFFS:
    # Inclusive upper bound: a cutoff equal to the number of shows is still a
    # valid rank (index cutoff - 1). The lower bound stops a cutoff of 0 from
    # wrapping around to the last element.
    if 1 <= cutoff <= len(pct_coverage):
        pct = pct_coverage[cutoff - 1]
        ax2.axvline(x=cutoff, color=color, linestyle='--', alpha=0.7)
        ax2.annotate(f'{cutoff:,}: {pct:.1f}%', xy=(cutoff, pct),
                    xytext=(cutoff + 500, pct - 3), fontsize=9)

# 3. Log-scale histogram of vote counts
ax3 = axes[1, 0]
bins = np.logspace(np.log10(votes_array.min()), np.log10(votes_array.max()), 50)
ax3.hist(votes_array, bins=bins, edgecolor='black', alpha=0.7, color='steelblue')
ax3.set_xscale('log')
ax3.set_xlabel('Number of Votes (log scale)')
ax3.set_ylabel('Number of Shows')
ax3.set_title('Distribution of Show Vote Counts')
# NOTE: on this panel the CHART_CUTOFFS values are drawn as vote counts
# (the x-axis is votes), whereas the other panels use them as ranks.
for cutoff, color in CHART_CUTOFFS[:4]:
    ax3.axvline(x=cutoff, color=color, linestyle='--', linewidth=2, label=f'{cutoff:,} votes')
ax3.legend()

# 4. Votes vs Rank
ax4 = axes[1, 1]
max_rank = min(MAX_RANK_DISPLAY, len(show_votes))
ax4.plot(range(1, max_rank + 1), votes_array[:max_rank], color='steelblue', linewidth=1)
ax4.set_xlabel('Show Rank')
ax4.set_ylabel('Number of Votes')
ax4.set_title(f'Top {max_rank:,} Shows by Vote Count')
ax4.set_yscale('log')
ax4.grid(True, alpha=0.3)
for cutoff, color in CHART_CUTOFFS:
    # Inclusive upper bound so a cutoff exactly at the last displayed rank is
    # drawn; the lower bound avoids negative-index wrap-around.
    if 1 <= cutoff <= max_rank:
        votes_at = votes_array[cutoff - 1]
        ax4.axhline(y=votes_at, color=color, linestyle=':', alpha=0.5)
        ax4.axvline(x=cutoff, color=color, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## 6. Custom Threshold Lookup

Use this cell to analyze any specific threshold (set `CUSTOM_THRESHOLD` in parameters).

In [None]:
# Summarize the configured custom threshold, then list shows near that boundary.
result = analyze_threshold(CUSTOM_THRESHOLD)
custom_summary = [
    f"=== Analysis for >= {CUSTOM_THRESHOLD:,} votes ===",
    f"Number of shows:     {result['num_shows']:,}",
    f"Vote coverage:       {result['vote_coverage_pct']:.1f}%",
    f"Est. data size:      {result['est_data_mb']:.1f} MB",
    f"Est. catalog (gzip): {result['est_catalog_kb_gzip']:.0f} KB",
]
print(*custom_summary, sep="\n")

print("\nSample shows at this boundary:")
show_boundary_samples(CUSTOM_THRESHOLD, n_samples=15)

In [None]:
# Search for a specific show (set SEARCH_TERM in parameters).
# Matching is a case-insensitive substring test against catalog titles.
if SEARCH_TERM:
    search_needle = SEARCH_TERM.lower()  # lower-cased once, not once per title
    matches = [(tid, title) for tid, title in titles.items()
               if search_needle in title.lower()]
    max_matches_shown = 20

    print(f"Found {len(matches)} matches for '{SEARCH_TERM}':\n")
    for tid, title in matches[:max_matches_shown]:
        votes_row = show_votes.filter(pl.col("tconst") == tid)
        if len(votes_row) > 0:
            votes = votes_row.row(0, named=True)["numVotes"]
            # Rank = 1 + number of shows with strictly more votes.
            rank = show_votes.filter(pl.col("numVotes") > votes).height + 1
            print(f"  {title:<50} ({votes:>6,} votes, rank {rank:,})")
        else:
            print(f"  {title:<50} (no vote data)")

    # Make the truncation visible; otherwise the count printed above
    # disagrees with the number of rows listed.
    hidden_matches = len(matches) - max_matches_shown
    if hidden_matches > 0:
        print(f"  ... and {hidden_matches:,} more not shown")
else:
    print("Set SEARCH_TERM in the parameters cell to search for a show.")