# Problem C EDA

数据文件：`2026_MCM_Problem_C_Data.csv`


In [None]:
import re
import numpy as np
import pandas as pd

# Let pandas render up to 200 rows and 200 columns before truncating output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 200)

# Load the raw dataset and preview its first rows.
df = pd.read_csv('2026_MCM_Problem_C_Data.csv')
df.head()


In [None]:
# Quick structural overview: dimensions, dtype mix and fully duplicated rows.
print('shape', df.shape)
print('columns', df.shape[1])
print('dtypes counts:', df.dtypes.value_counts(), sep='\n')
print('duplicate rows:', df.duplicated().sum())


In [None]:
# Missing-value count per column, then the share of rows it represents,
# ordered from the most to the least incomplete column.
miss = df.isna().sum()
miss_rate = miss.div(len(df)).sort_values(ascending=False)
miss_rate.iloc[:20]


In [None]:
# Names of the text-like columns. Comparing `dtype == object` alone misses
# pandas' dedicated string dtype (the default for text columns in pandas 3),
# which would leave this list empty there, so accept either object or string
# dtype. Iterating `df.dtypes` keeps the original column order.
cat_cols = [
    col
    for col, dtype in df.dtypes.items()
    if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
]
cat_cols


In [None]:
# For each text-like column print its number of distinct non-missing values
# and its five most frequent values.
for c in cat_cols:
    column_values = df[c]
    nunq = column_values.nunique()
    top = column_values.value_counts().head(5)
    print(f"\n{c}: uniques={nunq}")
    print(top)


In [None]:
# Summary statistics for the age, season and placement columns.
df.loc[:, ['celebrity_age_during_season', 'season', 'placement']].describe()


In [None]:
# Every per-week score column (name starts with 'week' and ends with '_score').
score_cols = [c for c in df.columns if c.startswith('week') and c.endswith('_score')]

# Flatten all score cells into one Series and discard the missing ones.
# The `dropna=` argument of `stack` is not accepted by pandas' newer stack
# implementation (the default in pandas 3), which keeps missing values in its
# output, so the NaNs are dropped explicitly; on older pandas the extra
# `dropna()` is a no-op and the result is the same.
all_scores = df[score_cols].stack().dropna()

print('score columns:', len(score_cols))
print('score non-null count', all_scores.shape[0])
print(all_scores.describe(percentiles=[.01, .05, .5, .95, .99]))
# Sanity checks on the value range of the scores.
print('score zeros count:', (all_scores == 0).sum())
print('score negatives count:', (all_scores < 0).sum())
print('score >10 count:', (all_scores > 10).sum())


In [None]:
# Number of distinct contestant names in each season, ordered by season.
season_counts = (
    df.groupby('season')['celebrity_name']
    .nunique()
    .sort_index()
)
season_counts.iloc[:10]


In [None]:
# Smallest, median and largest per-season contestant count.
print(
    'min/median/max contestants:',
    *(stat() for stat in (season_counts.min, season_counts.median, season_counts.max)),
)


In [None]:
# Map each week number 1..11 to the judge-score columns of that week,
# i.e. columns named `week<w>_judge..._score`; a week with no such column
# maps to an empty list.
week_cols = {
    w: [
        name
        for name in df.columns
        if name.endswith('_score') and name.startswith(f'week{w}_judge')
    ]
    for w in range(1, 12)
}

def season_max_week(sdf):
    """Return the last week that has any recorded judge score in ``sdf``.

    A week counts as active when at least one cell in that week's
    judge-score columns (looked up in the module-level ``week_cols``) is
    non-missing.

    Args:
        sdf: Frame containing the judge-score columns listed in ``week_cols``.

    Returns:
        The largest active week number, or ``None`` when no week is active.
    """
    weeks_with_scores = (
        week
        for week, judge_cols in week_cols.items()
        if sdf[judge_cols].notna().any().any()
    )
    return max(weeks_with_scores, default=None)

# Last scored week of every season. Only the judge-score columns are handed to
# season_max_week, which is all it reads. Selecting them explicitly keeps the
# grouping column ('season') out of the per-group frames, so groupby.apply no
# longer operates on the grouping column (deprecated in pandas 2.2).
judge_score_cols = [c for cols in week_cols.values() for c in cols]
max_week = df.groupby('season')[judge_score_cols].apply(season_max_week)
# How many seasons end at each week number.
max_week.value_counts().sort_index()


In [None]:
# Matches result texts such as "Eliminated Week 3" and captures the week number.
pat = re.compile(r"Eliminated Week\s*(\d+)")


def parse_elim(x):
    """Extract the elimination week from a result string.

    Args:
        x: A result value; anything that is not a ``str`` is treated as
            having no elimination week.

    Returns:
        The week number as ``int`` when ``x`` contains
        ``"Eliminated Week <n>"``, otherwise ``np.nan``.
    """
    if not isinstance(x, str):
        return np.nan
    found = pat.search(x)
    return int(found.group(1)) if found else np.nan

# Derive the elimination week from each row's result text via parse_elim,
# then show the ten most common raw result values.
df['elim_week'] = df['results'].apply(parse_elim)
df['results'].value_counts().iloc[:10]


In [None]:
print('rows with elim_week:', df['elim_week'].notna().sum())

# For every row, record the latest week (1..11) in which at least one judge
# score is strictly positive; rows without any positive score stay NaN.
last_pos = pd.Series(np.nan, index=df.index)
for w in range(1, 12):
    scored = df[week_cols[w]].fillna(0).astype(float) > 0
    # Weeks are visited in ascending order, so a later week overwrites an
    # earlier one.
    last_pos = last_pos.mask(scored.any(axis=1), w)

# Distribution of (last positive-score week - parsed elimination week) over
# the rows where both values are present.
cmp = pd.DataFrame({'elim_week': df['elim_week'], 'last_pos_week': last_pos}).dropna()
cmp['last_pos_week'].sub(cmp['elim_week']).value_counts().sort_index()


In [None]:
# Non-missing fourth-judge scores per week, for those weeks (1..11) whose
# judge-4 score column exists in the frame.
{
    w: int(df[f'week{w}_judge4_score'].count())
    for w in range(1, 12)
    if f'week{w}_judge4_score' in df.columns
}
