# Confidence-Adjusted Ratings: Empirical Tuning

This notebook analyzes IMDb episode data to empirically tune the confidence adjustment formula.

**Current formula:**
```
adjusted = raw - (raw - baseline) × halvingFactor^(-log_logBase(votes + 1))
```

With `halvingFactor=2`, `logBase=10`, `baseline=7.4`

**Problem:** With median episode votes of ~31, this gives ~35% pull to baseline, which may be too aggressive.

**Goals:**
1. Understand vote/rating distributions
2. Find where ratings stabilize (variance analysis)
3. Validate the baseline (7.4)
4. Grid search to find optimal constants
5. Compare current vs. optimal formula

## 1. Data Loading

In [None]:
import glob
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr

# Set style: dark figure background plus seaborn's "husl" colour palette for all later plots
plt.style.use('dark_background')
sns.set_palette("husl")

# Progress message only; the actual loading happens in the next cell
print("Loading episode data...")

In [None]:
# Load all episode data from JSON files.
# Each ../data/tt*.json file is named after its show id and is read as a list of
# seasons; each season is a list of episode entries indexed as
# [rating, votes, (optional) episode_id]. Null entries are skipped.
episodes = []

# Sort the paths: glob order is filesystem-dependent, and row order decides
# tie-breaking in the later nlargest() tables.
for filepath in sorted(glob.glob('../data/tt*.json')):
    # Path.stem drops directory and extension on any OS
    # (splitting on '/' breaks on Windows-style paths).
    show_id = Path(filepath).stem
    with open(filepath, encoding='utf-8') as f:
        seasons = json.load(f)
    for season_idx, season in enumerate(seasons):
        for ep_idx, ep in enumerate(season):
            if ep is not None:
                episodes.append({
                    'show_id': show_id,
                    # Seasons/episodes are stored positionally; convert to 1-based numbers
                    'season': season_idx + 1,
                    'episode': ep_idx + 1,
                    'rating': ep[0],
                    'votes': ep[1],
                    'episode_id': ep[2] if len(ep) > 2 else None,
                })

df = pd.DataFrame(episodes)
print(f"Loaded {len(df):,} episodes from {df['show_id'].nunique():,} shows")

In [None]:
# Basic data overview: sizes, value ranges and missing-value counts of the raw frame
rating_col = df['rating']
votes_col = df['votes']

print("=== Data Overview ===")
print(f"Total episodes: {len(df):,}")
print(f"Unique shows: {df['show_id'].nunique():,}")
print(f"\nRating range: {rating_col.min():.1f} - {rating_col.max():.1f}")
print(f"Vote range: {votes_col.min():,} - {votes_col.max():,}")
print(f"\nNull ratings: {rating_col.isna().sum():,}")
print(f"Null votes: {votes_col.isna().sum():,}")

# Keep only episodes with a positive rating and at least one vote
# (NaN compares False, so missing values are dropped too)
is_valid = (rating_col > 0) & (votes_col > 0)
df_valid = df[is_valid].copy()
print(f"\nValid episodes (rating > 0, votes > 0): {len(df_valid):,}")

## 2. Vote Distribution Analysis

In [None]:
# Vote distribution statistics
def _format_stat(x):
    """Thousands-separated integer for large values, two decimals otherwise."""
    return f"{x:,.0f}" if x > 100 else f"{x:.2f}"

print("=== Vote Distribution ===")
print(df_valid['votes'].describe().apply(_format_stat))

print("\n=== Vote Percentiles ===")
percentiles = [10, 25, 50, 75, 90, 95, 99, 99.9]
percentile_values = [(p, df_valid['votes'].quantile(p / 100)) for p in percentiles]
for p, val in percentile_values:
    print(f"  P{p:4}: {val:>10,.0f} votes")

In [None]:
# Vote distribution histograms: linear scale (left panel) and log10 scale (right panel)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Linear scale (truncated): votes above 5,000 are clipped into the last bin so the
# long tail does not squash the bulk of the distribution
ax1 = axes[0]
df_valid['votes'].clip(upper=5000).hist(bins=100, ax=ax1, alpha=0.7, color='cyan')
ax1.set_xlabel('Votes (capped at 5,000)')
ax1.set_ylabel('Count')
ax1.set_title('Vote Distribution (Linear Scale)')
ax1.axvline(df_valid['votes'].median(), color='red', linestyle='--', label=f"Median: {df_valid['votes'].median():,.0f}")
ax1.legend()

# Log scale: histogram of log10(votes); df_valid only contains votes > 0, so the log is defined
ax2 = axes[1]
ax2.hist(np.log10(df_valid['votes']), bins=50, alpha=0.7, color='cyan')
ax2.set_xlabel('Log10(Votes)')
ax2.set_ylabel('Count')
ax2.set_title('Vote Distribution (Log Scale)')

# Add reference lines at round vote counts.
# The label height uses get_ylim(), so this must run after the histogram is drawn.
for votes, label in [(10, '10'), (100, '100'), (1000, '1K'), (10000, '10K')]:
    ax2.axvline(np.log10(votes), color='yellow', linestyle='--', alpha=0.5)
    ax2.text(np.log10(votes), ax2.get_ylim()[1] * 0.9, label, ha='center', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Cumulative distribution - what share of episodes has at least N votes?
thresholds = [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000]
n_valid = len(df_valid)

print("=== Cumulative Vote Distribution ===")
print("Threshold | Episodes ≥ | % of Total")
print("-" * 40)
for t in thresholds:
    count = df_valid['votes'].ge(t).sum()
    pct = count / n_valid * 100
    print(f"{t:>9,} | {count:>10,} | {pct:>6.2f}%")

## 3. Rating Distribution Analysis

In [None]:
# Overall rating distribution: summary statistics followed by selected percentiles
print("=== Rating Distribution ===")
print(df_valid['rating'].describe())

print("\n=== Rating Percentiles ===")
rating_percentiles = [(p, df_valid['rating'].quantile(p / 100)) for p in [10, 25, 50, 75, 90, 95, 99]]
for p, val in rating_percentiles:
    print(f"  P{p:4}: {val:.2f}")

In [None]:
# Rating histogram with mean and median marked
rating_mean = df_valid['rating'].mean()
rating_median = df_valid['rating'].median()

fig, ax = plt.subplots(figsize=(10, 5))
df_valid['rating'].hist(bins=50, ax=ax, alpha=0.7, color='cyan')
ax.set(xlabel='Rating', ylabel='Count', title='Rating Distribution (All Episodes)')

# Reference lines, drawn in the order they should appear in the legend
reference_lines = [
    (rating_mean, 'red', f"Mean: {rating_mean:.2f}"),
    (rating_median, 'yellow', f"Median: {rating_median:.2f}"),
]
for value, line_color, line_label in reference_lines:
    ax.axvline(value, color=line_color, linestyle='--', label=line_label)

ax.legend()
plt.show()

In [None]:
# Rating distribution by vote bucket: overlaid histograms, one per half-open range [low, high)
vote_buckets = [
    (0, 50, '<50'),
    (50, 100, '50-100'),
    (100, 500, '100-500'),
    (500, 1000, '500-1K'),
    (1000, 5000, '1K-5K'),
    (5000, float('inf'), '5K+'),
]

fig, ax = plt.subplots(figsize=(12, 5))
colors = plt.cm.viridis(np.linspace(0.2, 0.9, len(vote_buckets)))

for (low, high, label), color in zip(vote_buckets, colors):
    in_bucket = (df_valid['votes'] >= low) & (df_valid['votes'] < high)
    bucket = df_valid[in_bucket]
    if bucket.empty:
        # Nothing to draw for an empty vote range
        continue
    ax.hist(bucket['rating'], bins=30, alpha=0.4, label=f"{label} (n={len(bucket):,})", color=color)

ax.set(xlabel='Rating', ylabel='Count', title='Rating Distribution by Vote Count')
ax.legend()
plt.show()

## 4. Variance Analysis

Key question: At what vote count do ratings stabilize? If variance drops significantly from 10→100 but barely changes from 1000→10000, that tells us where confidence should peak.

In [None]:
# Create log-scale vote buckets (bin edges are in log10(votes) units).
# - include_lowest=True keeps 1-vote episodes (log10(1) == 0 sits exactly on the
#   lowest edge and would otherwise be excluded from the right-closed first bin).
# - The last edge is +inf so the '10K+' bucket really is open-ended; a finite
#   upper edge of 5 silently dropped every episode with more than 100,000 votes.
df_valid['log_votes'] = np.log10(df_valid['votes'])
df_valid['vote_bucket'] = pd.cut(
    df_valid['log_votes'],
    bins=[0, 1, 1.5, 2, 2.5, 3, 3.5, 4, np.inf],
    labels=['1-10', '10-32', '32-100', '100-316', '316-1K', '1K-3.2K', '3.2K-10K', '10K+'],
    include_lowest=True,
)

# Calculate the spread of ratings within each bucket
variance_by_bucket = df_valid.groupby('vote_bucket', observed=True).agg(
    count=('rating', 'count'),
    mean_rating=('rating', 'mean'),
    std_rating=('rating', 'std'),
    var_rating=('rating', 'var'),
    median_votes=('votes', 'median'),
).reset_index()

print("=== Variance by Vote Bucket ===")
print(variance_by_bucket.to_string(index=False))

In [None]:
# Variance vs. votes plot: one bar chart per spread measure, sharing the same bucket axis
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_positions = range(len(variance_by_bucket))

# (column, bar colour, y-axis label, title) for the left and right panel respectively
panels = [
    ('std_rating', 'cyan', 'Standard Deviation of Ratings', 'Rating Standard Deviation by Vote Count'),
    ('var_rating', 'orange', 'Variance of Ratings', 'Rating Variance by Vote Count'),
]

for panel_ax, (column, bar_color, y_label, panel_title) in zip(axes, panels):
    panel_ax.bar(bar_positions, variance_by_bucket[column], color=bar_color, alpha=0.7)
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(variance_by_bucket['vote_bucket'], rotation=45)
    panel_ax.set_xlabel('Vote Bucket')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Finer-grained variance analysis using continuous vote values
# Split episodes into up to 20 equal-count (quantile) buckets of raw vote counts;
# duplicates='drop' merges buckets whose edges coincide, so fewer than 20 may result.
n_buckets = 20
df_valid['vote_percentile'] = pd.qcut(df_valid['votes'], q=n_buckets, labels=False, duplicates='drop')

# Spread of ratings plus the vote range covered by each quantile bucket
variance_continuous = df_valid.groupby('vote_percentile', observed=True).agg(
    count=('rating', 'count'),
    std_rating=('rating', 'std'),
    var_rating=('rating', 'var'),
    median_votes=('votes', 'median'),
    min_votes=('votes', 'min'),
    max_votes=('votes', 'max'),
).reset_index()

# Plot rating std against each bucket's median vote count on a log x-axis
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(variance_continuous['median_votes'], variance_continuous['std_rating'], 'o-', color='cyan', markersize=8)
ax.set_xscale('log')
ax.set_xlabel('Median Votes in Bucket (log scale)')
ax.set_ylabel('Standard Deviation of Ratings')
ax.set_title('Rating Standard Deviation vs. Vote Count')
ax.grid(True, alpha=0.3)

# Mark potential confidence thresholds.
# Label height uses get_ylim(), so this must run after the curve is plotted.
for votes in [100, 500, 1000, 5000]:
    ax.axvline(votes, color='yellow', linestyle='--', alpha=0.5)
    ax.text(votes, ax.get_ylim()[1] * 0.95, f'{votes}', ha='center', fontsize=9)

plt.show()

## 5. Midpoint (Baseline) Validation

The current formula uses 7.4 as the baseline. Let's verify this is appropriate.

In [None]:
# Calculate different baseline candidates from the valid episodes
rating_series = df_valid['rating']
unweighted_mean = rating_series.mean()
weighted_mean = np.average(rating_series, weights=df_valid['votes'])
median_rating = rating_series.median()

# Per-show view: average each show first so long-running shows do not dominate
show_means = df_valid.groupby('show_id')['rating'].mean()
mean_of_show_means = show_means.mean()

print("=== Baseline Candidates ===")
print(f"Unweighted Mean: {unweighted_mean:.3f}")
print(f"Vote-Weighted Mean: {weighted_mean:.3f}")
print(f"Median Rating: {median_rating:.3f}")
print(f"\nCurrent Baseline: 7.4")
print(f"\nMean of Show Averages: {mean_of_show_means:.3f}")

In [None]:
# Visualize where ratings cluster relative to each baseline candidate
fig, ax = plt.subplots(figsize=(10, 5))
df_valid['rating'].hist(bins=50, ax=ax, alpha=0.7, color='cyan', density=True)

# (x position, colour, line style, legend label) in legend order
baseline_markers = [
    (unweighted_mean, 'red', '-', f'Unweighted Mean: {unweighted_mean:.2f}'),
    (weighted_mean, 'yellow', '-', f'Weighted Mean: {weighted_mean:.2f}'),
    (median_rating, 'lime', '-', f'Median: {median_rating:.2f}'),
    (7.4, 'magenta', '--', 'Current Baseline: 7.4'),
]
for x_pos, line_color, line_style, line_label in baseline_markers:
    ax.axvline(x_pos, color=line_color, linestyle=line_style, linewidth=2, label=line_label)

ax.set(xlabel='Rating', ylabel='Density', title='Rating Distribution with Baseline Candidates')
ax.legend()
plt.show()

## 6. Adjustment Formula

Define the adjustment formula with configurable parameters.

In [None]:
def calculate_adjusted_rating(rating, votes, baseline=7.4, log_base=10, halving_factor=2):
    """
    Pull a single raw rating toward the baseline according to how few votes it has.

    Formula: adjusted = raw - (raw - baseline) × halvingFactor^(-log_logBase(votes + 1))

    Args:
        rating: Raw rating; must be > 0 to be considered valid
        votes: Number of votes; must be > 0 to be considered valid
        baseline: Rating that low-vote episodes are pulled toward (default 7.4)
        log_base: Base of the logarithm applied to (votes + 1) (default 10)
        halving_factor: The remaining pull is divided by this factor each time
            (votes + 1) grows by a factor of log_base (default 2)

    Returns:
        The adjusted rating clipped to [0, 10], or NaN when rating/votes is
        missing or not positive
    """
    inputs_missing = pd.isna(rating) or pd.isna(votes)
    if inputs_missing or rating <= 0 or votes <= 0:
        return np.nan

    # Number of "log_base-folds" of votes collected so far
    magnitude = np.log(votes + 1) / np.log(log_base)
    # Fraction of the distance to the baseline that is still applied; shrinks toward 0 as votes grow
    uncertainty = halving_factor ** (-magnitude)
    distance_from_baseline = rating - baseline
    return np.clip(rating - distance_from_baseline * uncertainty, 0, 10)

# Vectorized version for efficiency
def calculate_adjusted_rating_vec(ratings, votes, baseline=7.4, log_base=10, halving_factor=2):
    """
    Vectorized confidence-adjusted rating, consistent with calculate_adjusted_rating.

    Formula: adjusted = raw - (raw - baseline) × halvingFactor^(-log_logBase(votes + 1))

    Args:
        ratings: Array-like of raw ratings (lists, NumPy arrays, pandas Series)
        votes: Array-like of vote counts, broadcastable against ratings
        baseline: Rating that low-vote episodes are pulled toward (default 7.4)
        log_base: Base of the logarithm applied to (votes + 1) (default 10)
        halving_factor: Divisor applied to the remaining pull per log_base-fold
            increase in (votes + 1) (default 2)

    Returns:
        NumPy float array of adjusted ratings clipped to [0, 10]. Entries whose
        rating or votes is missing or not positive are NaN, matching the scalar
        function instead of producing an arbitrary number.
    """
    ratings = np.asarray(ratings, dtype=float)
    votes = np.asarray(votes, dtype=float)

    # NaN compares False, so missing inputs are treated as invalid as well
    valid = (ratings > 0) & (votes > 0)
    # Substitute a harmless vote count for invalid entries so np.log never sees <= 0
    safe_votes = np.where(valid, votes, 1.0)

    uncertainty = halving_factor ** (-np.log(safe_votes + 1) / np.log(log_base))
    adjusted = np.clip(ratings - (ratings - baseline) * uncertainty, 0, 10)
    return np.where(valid, adjusted, np.nan)

In [None]:
# Show adjustment amount at different vote counts with current formula
test_votes = [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
test_rating = 9.0
# Distance a rating would move if it were pulled all the way to the 7.4 baseline
full_pull = test_rating - 7.4

print(f"=== Adjustment for {test_rating} Rating at Different Vote Counts ===")
print("Current formula: log_base=10, halving_factor=2, baseline=7.4")
print("\nVotes     | Adjusted | Pull to Baseline")
print("-" * 45)
for v in test_votes:
    adj = calculate_adjusted_rating(test_rating, v)
    pull = test_rating - adj
    pct = pull / full_pull * 100
    print(f"{v:>9,} | {adj:>8.2f} | {pull:>5.2f} ({pct:>5.1f}%)")

## 7. Grid Search

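Test different parameter combinations on high-vote episodes and score each one by the Spearman correlation between the raw ranking and the adjusted ranking. Because this metric measures how well the raw ranking is preserved, it favours the parameters that adjust the least (Section 11 revisits the evaluation criterion).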

In [None]:
# Define high-confidence threshold (adjust based on variance analysis above)
CONFIDENCE_THRESHOLD = 1000  # Episodes with this many votes are "ground truth"

# Keep only episodes at or above the threshold; copy so new columns do not touch df_valid
is_high_vote = df_valid['votes'] >= CONFIDENCE_THRESHOLD
df_high_vote = df_valid[is_high_vote].copy()

high_vote_share = len(df_high_vote) / len(df_valid) * 100
print(f"High-confidence episodes (≥{CONFIDENCE_THRESHOLD} votes): {len(df_high_vote):,}")
print(f"This is {high_vote_share:.1f}% of all episodes")

# Rank by raw rating (1 = best); tied ratings share their average rank
df_high_vote['raw_rank'] = df_high_vote['rating'].rank(ascending=False, method='average')

In [None]:
# Grid search parameters
log_bases = [5, 10, 20, 50]
halving_factors = [1.5, 2, 2.5, 3]
baselines = [7.0, 7.2, 7.4, 7.6]

# Inputs are the same for every parameter combination, so extract them once
high_vote_ratings = df_high_vote['rating'].values
high_vote_counts = df_high_vote['votes'].values

results = []

for baseline in baselines:
    for log_base in log_bases:
        for halving in halving_factors:
            # Adjusted ratings of the high-vote episodes under this parameter set
            adjusted = calculate_adjusted_rating_vec(
                high_vote_ratings,
                high_vote_counts,
                baseline=baseline,
                log_base=log_base,
                halving_factor=halving,
            )

            # Rank the adjusted ratings the same way as raw_rank (1 = best, ties averaged)
            adjusted_rank = pd.Series(adjusted).rank(ascending=False, method='average')

            # How well does the adjusted ranking preserve the raw ranking?
            spearman, _ = spearmanr(df_high_vote['raw_rank'], adjusted_rank)

            # Average size of the rating change, regardless of direction
            mean_adjustment = np.abs(adjusted - high_vote_ratings).mean()

            results.append({
                'baseline': baseline,
                'log_base': log_base,
                'halving_factor': halving,
                'spearman': spearman,
                'mean_adjustment': mean_adjustment,
            })

# Best ranking preservation first
results_df = pd.DataFrame(results).sort_values('spearman', ascending=False)
print("=== Top 10 Parameter Combinations (by Spearman) ===")
print(results_df.head(10).to_string(index=False))

In [None]:
# Heatmap of Spearman correlation for each baseline.
# Take the baselines from results_df itself rather than from the global `baselines`
# list: that list is reassigned later in the notebook, and re-running this cell
# afterwards would otherwise try to plot baselines that are not in results_df.
heatmap_baselines = sorted(results_df['baseline'].unique())

# squeeze=False keeps `axes` two-dimensional even when there is a single baseline
fig, axes = plt.subplots(1, len(heatmap_baselines), figsize=(16, 4), squeeze=False)

# Shared colour scale so the panels are directly comparable
spearman_min = results_df['spearman'].min()
spearman_max = results_df['spearman'].max()

for ax, baseline in zip(axes[0], heatmap_baselines):
    subset = results_df[results_df['baseline'] == baseline]
    pivot = subset.pivot(index='halving_factor', columns='log_base', values='spearman')

    sns.heatmap(pivot, annot=True, fmt='.4f', ax=ax, cmap='YlGnBu',
                vmin=spearman_min, vmax=spearman_max)
    ax.set_title(f'Baseline = {baseline}')
    ax.set_xlabel('Log Base')
    ax.set_ylabel('Halving Factor')

plt.suptitle('Spearman Correlation by Parameters (Higher = Better Ranking Preservation)', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of mean adjustment amount for each baseline.
# Take the baselines from results_df itself rather than from the global `baselines`
# list: that list is reassigned later in the notebook, and re-running this cell
# afterwards would otherwise try to plot baselines that are not in results_df.
heatmap_baselines = sorted(results_df['baseline'].unique())

# squeeze=False keeps `axes` two-dimensional even when there is a single baseline
fig, axes = plt.subplots(1, len(heatmap_baselines), figsize=(16, 4), squeeze=False)

for ax, baseline in zip(axes[0], heatmap_baselines):
    subset = results_df[results_df['baseline'] == baseline]
    pivot = subset.pivot(index='halving_factor', columns='log_base', values='mean_adjustment')

    # Each panel keeps its own colour scale, as before
    sns.heatmap(pivot, annot=True, fmt='.3f', ax=ax, cmap='YlOrRd')
    ax.set_title(f'Baseline = {baseline}')
    ax.set_xlabel('Log Base')
    ax.set_ylabel('Halving Factor')

plt.suptitle('Mean Absolute Adjustment (Lower = Less Aggressive)', y=1.02)
plt.tight_layout()
plt.show()

## 8. Compare Current vs. Optimal Formula

In [None]:
# Get best parameters: results_df is sorted by Spearman (descending), so the first row wins
best = results_df.iloc[0]

# Row of the grid that corresponds to the formula currently in use
is_current = (
    results_df['baseline'].eq(7.4)
    & results_df['log_base'].eq(10)
    & results_df['halving_factor'].eq(2)
)
current = results_df[is_current].iloc[0]

print("=== Best Parameters ===")
print(f"Baseline: {best['baseline']}")
print(f"Log Base: {best['log_base']}")
print(f"Halving Factor: {best['halving_factor']}")
print(f"Spearman Correlation: {best['spearman']:.4f}")
print(f"Mean Adjustment: {best['mean_adjustment']:.4f}")

print("\n=== Current Formula ===")
print(f"Baseline: 7.4, Log Base: 10, Halving Factor: 2")
print(f"Spearman Correlation: {current['spearman']:.4f}")
print(f"Mean Adjustment: {current['mean_adjustment']:.4f}")

In [None]:
# Side-by-side comparison of adjustment amounts at different vote counts
test_votes = [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
test_rating = 9.0

# Positional arguments are (baseline, log_base, halving_factor)
current_params = (7.4, 10, 2)
optimal_params = (best['baseline'], best['log_base'], best['halving_factor'])

print(f"=== Adjustment Comparison for Rating {test_rating} ===")
print(f"Current: baseline=7.4, log_base=10, halving=2")
print(f"Optimal: baseline={best['baseline']}, log_base={best['log_base']}, halving={best['halving_factor']}")
print("\nVotes     | Current  | Optimal  | Difference")
print("-" * 50)
for v in test_votes:
    current_adj = calculate_adjusted_rating(test_rating, v, *current_params)
    optimal_adj = calculate_adjusted_rating(test_rating, v, *optimal_params)
    diff = optimal_adj - current_adj
    print(f"{v:>9,} | {current_adj:>8.3f} | {optimal_adj:>8.3f} | {diff:>+.3f}")

## 9. Sanity Check: Top Episodes

In [None]:
# Load show catalog for titles.
# Read explicitly as UTF-8: titles can contain non-ASCII characters, and the
# platform default encoding is not UTF-8 everywhere.
with open('../data/titleId-expanded.json', encoding='utf-8') as f:
    catalog = json.load(f)

# Map show id -> title (each catalog entry is read via its 'id' and 'title' keys)
catalog_dict = {show['id']: show['title'] for show in catalog}

# Add show titles; show ids missing from the catalog become NaN
df_valid['show_title'] = df_valid['show_id'].map(catalog_dict)

In [None]:
# Calculate adjusted ratings with both formulas
rating_values = df_valid['rating'].values
vote_values = df_valid['votes'].values

# Output column -> formula parameters
formula_params = {
    'adj_current': dict(baseline=7.4, log_base=10, halving_factor=2),
    'adj_optimal': dict(
        baseline=best['baseline'],
        log_base=best['log_base'],
        halving_factor=best['halving_factor'],
    ),
}

for column_name, params in formula_params.items():
    df_valid[column_name] = calculate_adjusted_rating_vec(rating_values, vote_values, **params)

In [None]:
# Top 20 by raw rating (ties keep their original row order)
top_columns = ['show_title', 'season', 'episode', 'rating', 'votes', 'adj_current', 'adj_optimal']

print("=== Top 20 Episodes by RAW Rating ===")
top_raw = df_valid.nlargest(20, 'rating')
top_raw = top_raw[top_columns]
print(top_raw.to_string(index=False))

In [None]:
# Top 20 by CURRENT adjusted rating
top_columns = ['show_title', 'season', 'episode', 'rating', 'votes', 'adj_current', 'adj_optimal']

print("=== Top 20 Episodes by CURRENT Adjusted Rating ===")
top_current = df_valid.nlargest(20, 'adj_current')
top_current = top_current[top_columns]
print(top_current.to_string(index=False))

In [None]:
# Top 20 by OPTIMAL adjusted rating
top_columns = ['show_title', 'season', 'episode', 'rating', 'votes', 'adj_current', 'adj_optimal']

print("=== Top 20 Episodes by OPTIMAL Adjusted Rating ===")
top_optimal = df_valid.nlargest(20, 'adj_optimal')
top_optimal = top_optimal[top_columns]
print(top_optimal.to_string(index=False))

## 10. Conclusions

Add your conclusions here based on the analysis above.

In [None]:
# Summary of findings: dataset size, rating statistics, and grid-search outcome
banner = "=" * 60
valid_votes = df_valid['votes']
valid_ratings = df_valid['rating']

print(banner)
print("ANALYSIS SUMMARY")
print(banner)
print(f"\nTotal episodes analyzed: {len(df_valid):,}")
print(f"Median votes: {valid_votes.median():.0f}")
print(f"Mean votes: {valid_votes.mean():.0f}")
print(f"\nRating statistics:")
print(f"  Unweighted mean: {valid_ratings.mean():.3f}")
print(f"  Vote-weighted mean: {weighted_mean:.3f}")
print(f"  Median: {valid_ratings.median():.3f}")
print(f"\nOptimal parameters:")
print(f"  Baseline: {best['baseline']}")
print(f"  Log base: {best['log_base']}")
print(f"  Halving factor: {best['halving_factor']}")
print(f"\nImprovement over current formula:")
# Each line reads "current -> best" for the metric
print(f"  Spearman correlation: {current['spearman']:.4f} -> {best['spearman']:.4f}")
print(f"  Mean adjustment: {current['mean_adjustment']:.4f} -> {best['mean_adjustment']:.4f}")

## 11. Refined Parameter Analysis

The previous grid search optimized for Spearman correlation on high-vote episodes, but that's not quite what we want. We need parameters that:
1. **Preserve separation** between well-established classics (high votes) and obscure episodes (low votes)
2. **Don't over-flatten** median-vote episodes

Let's analyze with a focus on the "gap" between obscure 10.0 and established 9.9 episodes.

In [None]:
# Define reference episodes for sanity checking
# We want: established classics should rank ABOVE obscure 10.0 episodes

# Find some key reference episodes by (show title, season, episode)
# NOTE(review): the name says "finale" but the filter selects S5E14 — confirm which episode is intended
is_bb_reference = (
    (df_valid['show_title'] == 'Breaking Bad')
    & (df_valid['season'] == 5)
    & (df_valid['episode'] == 14)
)
breaking_bad_finale = df_valid[is_bb_reference]

is_got_reference = (
    (df_valid['show_title'] == 'Game of Thrones')
    & (df_valid['season'] == 6)
    & (df_valid['episode'] == 9)
)
got_battle_bastards = df_valid[is_got_reference]

# Find obscure 10.0 episodes: perfect rating with 100 <= votes < 2000, most-voted first
is_obscure_10 = (
    (df_valid['rating'] == 10.0)
    & (df_valid['votes'] >= 100)
    & (df_valid['votes'] < 2000)
)
obscure_10s = df_valid[is_obscure_10].nlargest(10, 'votes')

print("=== Reference Episodes ===")
print("\nEstablished Classics:")
# Either reference may be absent from the dataset; print only what was found
if not breaking_bad_finale.empty:
    bb = breaking_bad_finale.iloc[0]
    print(f"  Breaking Bad S5E14: rating={bb['rating']}, votes={bb['votes']:,}")
if not got_battle_bastards.empty:
    got = got_battle_bastards.iloc[0]
    print(f"  Game of Thrones S6E9: rating={got['rating']}, votes={got['votes']:,}")

print("\nObscure 10.0 Episodes (100-2000 votes):")
for _, row in obscure_10s.iterrows():
    print(f"  {row['show_title']} S{row['season']}E{row['episode']}: votes={row['votes']:,}")

In [None]:
# Refined grid search with better evaluation metric
# Goal: maximize gap between established classics and obscure 10.0s

# NOTE: these reuse (and overwrite) the grid names from the first grid search
baselines = [6.8, 7.0, 7.2, 7.4]
log_bases = [6, 8, 10, 12]
halving_factors = [2.0, 2.2, 2.4, 2.6]

# Reference values (hard-coded rating / vote pairs)
bb_rating, bb_votes = 10.0, 276754  # Breaking Bad S5E14
got_rating, got_votes = 9.9, 249535  # Game of Thrones S6E9
obscure_rating, obscure_votes = 10.0, 715  # "Made" S9E14 (example obscure 10.0)

refined_results = []

for baseline in baselines:
    for log_base in log_bases:
        for halving in halving_factors:
            params = (baseline, log_base, halving)

            # Adjusted ratings of the three reference episodes
            bb_adj = calculate_adjusted_rating(bb_rating, bb_votes, *params)
            got_adj = calculate_adjusted_rating(got_rating, got_votes, *params)
            obscure_adj = calculate_adjusted_rating(obscure_rating, obscure_votes, *params)

            # Gap: how much higher is BB than the obscure 10.0?
            gap_bb_obscure = bb_adj - obscure_adj

            # Pull applied to a 9.0 episode at the median vote count (31, hard-coded)
            median_adj_9 = calculate_adjusted_rating(9.0, 31, *params)
            median_pull = 9.0 - median_adj_9

            row_result = {
                'baseline': baseline,
                'log_base': log_base,
                'halving': halving,
                'bb_adj': bb_adj,
                'got_adj': got_adj,
                'obscure_adj': obscure_adj,
                'gap_bb_obscure': gap_bb_obscure,
                # Pull expressed as a percentage of the full distance to the baseline
                'median_pull_pct': median_pull / (9.0 - baseline) * 100,
            }
            refined_results.append(row_result)

# Largest separation first
refined_df = pd.DataFrame(refined_results).sort_values('gap_bb_obscure', ascending=False)

print("=== Top 15 Parameter Combinations (by Gap: BB vs Obscure 10.0) ===")
print("Larger gap = better separation between established and obscure episodes")
print(refined_df.head(15).to_string(index=False))

In [None]:
# Visualize the tradeoff: Gap vs Median Pull
fig, ax = plt.subplots(figsize=(12, 8))

# Color by baseline: one scatter series (and legend entry) per baseline value
colors = {6.8: 'red', 7.0: 'orange', 7.2: 'yellow', 7.4: 'cyan'}
for baseline in baselines:
    subset = refined_df[refined_df['baseline'] == baseline]
    ax.scatter(subset['median_pull_pct'], subset['gap_bb_obscure'],
               c=colors[baseline], label=f'baseline={baseline}', alpha=0.7, s=100)

ax.set_xlabel('Pull at Median Votes (31) for 9.0 Rating (%)')
ax.set_ylabel('Gap: Breaking Bad vs Obscure 10.0')
ax.set_title('Tradeoff: Separation vs Aggression')
ax.grid(True, alpha=0.3)

# Mark current formula (baseline=7.4, log_base=10, halving=2.0) with a star, if it is in the grid
current_row = refined_df[(refined_df['baseline'] == 7.4) &
                          (refined_df['log_base'] == 10) &
                          (refined_df['halving'] == 2.0)]
if len(current_row) > 0:
    row = current_row.iloc[0]
    ax.scatter([row['median_pull_pct']], [row['gap_bb_obscure']],
               c='white', s=300, marker='*', edgecolors='black', linewidths=2,
               label='Current', zorder=10)

# Build the legend only after every labelled artist exists; calling it before the
# star was plotted left the 'Current' entry out of the legend.
ax.legend()

plt.show()

In [None]:
# Compare specific candidate formulas: (name, baseline, log_base, halving_factor)
candidates = [
    ('Current', 7.4, 10, 2.0),
    ('Proposed', 7.0, 8, 2.2),
    ('Alt A: Lower baseline only', 7.0, 10, 2.0),
    ('Alt B: Faster decay only', 7.4, 8, 2.2),
    ('Alt C: Aggressive', 6.8, 10, 2.0),
    ('Alt D: Gentle', 7.4, 6, 2.4),
]

print("=== Candidate Formula Comparison ===\n")

# Test at different vote counts: (rating, votes, row label)
test_cases = [
    (10.0, 276754, "Breaking Bad S5E14"),
    (9.9, 249535, "Game of Thrones S6E9"),
    (10.0, 715, "Made S9E14 (obscure)"),
    (10.0, 150, "Very low vote 10.0"),
    (9.0, 31, "Median votes (9.0)"),
    (9.0, 100, "100 votes (9.0)"),
    (9.0, 500, "500 votes (9.0)"),
    (9.0, 1000, "1000 votes (9.0)"),
]

# Size the value columns to the longest candidate name (never below the previous
# width of 12); a fixed 12 characters overflowed for the longer names and pushed
# the header out of line with the numbers below it.
label_width = 25
col_width = max(12, max(len(name) for name, _, _, _ in candidates) + 2)
rule = "-" * (label_width + col_width * len(candidates))


def _print_table_row(label, values):
    """Print one row: left-aligned label followed by right-aligned 3-decimal values."""
    cells = "".join(f"{value:>{col_width}.3f}" for value in values)
    print(f"{label:<{label_width}}{cells}")


# Print header
header_cells = "".join(f"{name:>{col_width}}" for name, _, _, _ in candidates)
print(f"{'Episode':<{label_width}}{header_cells}")
print(rule)

for rating, votes, desc in test_cases:
    row_values = [
        calculate_adjusted_rating(rating, votes, baseline, log_base, halving)
        for _, baseline, log_base, halving in candidates
    ]
    _print_table_row(desc, row_values)

print("\n" + rule)

# Show the gaps between the Breaking Bad reference (10.0 at 276,754 votes)
# and a 10.0 episode with few votes
for low_votes in (715, 150):
    gap_values = [
        calculate_adjusted_rating(10.0, 276754, baseline, log_base, halving)
        - calculate_adjusted_rating(10.0, low_votes, baseline, log_base, halving)
        for _, baseline, log_base, halving in candidates
    ]
    _print_table_row(f"Gap: BB - Obscure {low_votes}", gap_values)

In [None]:
# Find "sweet spot" formulas: good gap, but not too aggressive at median
# Define criteria:
# - Gap (BB vs 715-vote 10.0) should be >= 0.30 (enough separation)
# - Pull at median (31 votes) should be <= 35% (not too aggressive)

print("=== Sweet Spot Analysis ===")
print("Criteria: Gap >= 0.30, Median Pull <= 35%\n")

meets_strict = (refined_df['gap_bb_obscure'] >= 0.30) & (refined_df['median_pull_pct'] <= 35)
sweet_spot = refined_df[meets_strict].copy().sort_values(['gap_bb_obscure'], ascending=False)

if sweet_spot.empty:
    print("No formulas meet both criteria. Let's relax them...")

    # Fall back to looser thresholds and show the ten best by gap
    meets_relaxed = (refined_df['gap_bb_obscure'] >= 0.25) & (refined_df['median_pull_pct'] <= 40)
    relaxed = refined_df[meets_relaxed].copy().sort_values(['gap_bb_obscure'], ascending=False)
    print(f"\nRelaxed criteria (Gap >= 0.25, Pull <= 40%): {len(relaxed)} formulas")
    print(relaxed.head(10).to_string(index=False))
else:
    sweet_spot_columns = ['baseline', 'log_base', 'halving', 'gap_bb_obscure', 'median_pull_pct', 'bb_adj', 'obscure_adj']
    print(f"Found {len(sweet_spot)} formulas meeting criteria:\n")
    print(sweet_spot[sweet_spot_columns].to_string(index=False))

In [None]:
# Full sanity check: top 30 episodes with the proposed formula
proposed_baseline = 7.0
proposed_log_base = 8
proposed_halving = 2.2

df_valid['adj_proposed'] = calculate_adjusted_rating_vec(
    df_valid['rating'].values,
    df_valid['votes'].values,
    baseline=proposed_baseline,
    log_base=proposed_log_base,
    halving_factor=proposed_halving,
)

print(f"=== Top 30 Episodes by PROPOSED Formula (baseline={proposed_baseline}, log_base={proposed_log_base}, halving={proposed_halving}) ===")
proposed_columns = ['show_title', 'season', 'episode', 'rating', 'votes', 'adj_current', 'adj_proposed']
top_proposed = df_valid.nlargest(30, 'adj_proposed')[proposed_columns]
print(top_proposed.to_string(index=False))

In [None]:
# Also try Alt A: Lower baseline only (keeps current decay rate)
alt_a_baseline = 7.0
alt_a_log_base = 10
alt_a_halving = 2.0

df_valid['adj_alt_a'] = calculate_adjusted_rating_vec(
    df_valid['rating'].values,
    df_valid['votes'].values,
    baseline=alt_a_baseline,
    log_base=alt_a_log_base,
    halving_factor=alt_a_halving,
)

print(f"=== Top 30 Episodes by ALT A (baseline={alt_a_baseline}, log_base={alt_a_log_base}, halving={alt_a_halving}) ===")
print("(Lower baseline only - keeps current decay rate)")
alt_a_columns = ['show_title', 'season', 'episode', 'rating', 'votes', 'adj_current', 'adj_alt_a']
top_alt_a = df_valid.nlargest(30, 'adj_alt_a')[alt_a_columns]
print(top_alt_a.to_string(index=False))

## 12. Key Insights

**The tradeoff:**
- Lower baseline → more separation between high and low vote episodes
- Faster decay (lower log_base, higher halving) → less aggressive at median votes, but ALSO less separation

**The tension:**
Your proposed formula (7.0, 8, 2.2) combines:
1. Lower baseline (punishes low-vote episodes more)
2. Faster decay (less aggressive overall)

These two effects partially cancel out. The net result depends on where in the vote distribution you look.

**Key question:** Which sanity check looks better to you?
- Current formula: More separation, but more aggressive at median
- Proposed formula: Less aggressive at median, but less separation
- Alt A (7.0, 10, 2.0): Lower baseline with current decay - MOST separation, but also most aggressive