In [25]:
import pandas as pd
import numpy as np

In [26]:
# ===================== Shared file-name constants =====================
# Input: file name of the already-cleaned base dataset (change it here, in one place)
INPUT_CLEANED_DATA = '1_cleaned_data.csv'
# Output: file name under which the percentage-mechanism modelling dataset is saved (change it here, in one place)
OUTPUT_MODELING_DATA = '2_PERCENT_modeling_data.csv'
# =======================================================================

In [27]:
# Load the cleaned dataset and take a first look at its columns, shape and leading rows
df = pd.read_csv(INPUT_CLEANED_DATA)
print("数据集列名:", df.columns.tolist())
print("数据集形状:", df.shape)
print("\n数据集前几行:")
print(df.head())

# Confirm that the columns used by the following cells are present;
# a missing column is only reported here, not treated as fatal
required_columns = ['season', 'week', 'celebrity_name', 'weekly_score_percentage', 'was_eliminated']
missing_columns = [name for name in required_columns if name not in df.columns]
if not missing_columns:
    print("所有必需列都存在")
else:
    print(f"警告: 缺失列: {missing_columns}")

数据集列名: ['celebrity_name', 'season', 'week', 'weekly_avg_score', 'weekly_total_score', 'num_judges', 'was_eliminated', 'weekly_rank_by_avg', 'weekly_rank_by_total', 'weekly_score_percentage', 'results', 'placement']
数据集形状: (4631, 12)

数据集前几行:
  celebrity_name  season  week  weekly_avg_score  weekly_total_score  \
0      AJ McLean      29   1.0          6.000000                18.0   
1      AJ McLean      29   2.0          6.333333                19.0   
2      AJ McLean      29   3.0          7.000000                21.0   
3      AJ McLean      29   4.0          8.000000                24.0   
4      AJ McLean      29   5.0          8.000000                24.0   

   num_judges  was_eliminated  weekly_rank_by_avg  weekly_rank_by_total  \
0           3           False                 6.0                   6.0   
1           3           False                 5.0                   5.0   
2           3           False                 7.0                   7.0   
3           3           F

In [28]:
# Re-examine the elimination mechanism from what the data actually contains
print("=== 重新分析淘汰机制 ===")

# Distribution of the was_eliminated flag over all rows
print("was_eliminated列的值分布:")
print(df['was_eliminated'].value_counts())

# Print the first rows of a few contestants' trajectories to see how the flag evolves
sample_players = df['celebrity_name'].unique()[:3]
for player in sample_players:
    player_data = df.loc[df['celebrity_name'].eq(player)].sort_values(by=['season', 'week'])
    print(f"\n{player}的参赛轨迹:")
    print(player_data.loc[:, ['season', 'week', 'weekly_score_percentage', 'was_eliminated']].head())

=== 重新分析淘汰机制 ===
was_eliminated列的值分布:
was_eliminated
False    2777
True     1854
Name: count, dtype: int64

AJ McLean的参赛轨迹:
   season  week  weekly_score_percentage  was_eliminated
0      29   1.0                 6.923077           False
1      29   2.0                 6.959707           False
2      29   3.0                 7.473310           False
3      29   4.0                 7.973422           False
4      29   5.0                 8.362369           False

Aaron Carter的参赛轨迹:
    season  week  weekly_score_percentage  was_eliminated
11       9   1.0                 7.881780           False
12       9   2.0                 9.183673           False
13       9   3.0                 8.139535           False
14       9   4.0                 7.438017           False
15       9   5.0                 9.917355           False

Adam Carolla的参赛轨迹:
    season  week  weekly_score_percentage  was_eliminated
22       6   1.0                 6.172840           False
23       6   2.0              

In [29]:
# Re-derive the elimination picture directly from the data
def correct_elimination_understanding(df, percentage_seasons=None):
    """Summarise, per (season, week), how many contestants competed, were flagged eliminated, or were safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``season``, ``week``,
        ``weekly_score_percentage`` and ``was_eliminated``.
    percentage_seasons : iterable of int, optional
        Seasons to keep. Defaults to seasons 3 through 27 inclusive, the
        seasons this notebook treats as using the percentage method.

    Returns
    -------
    analysis_df : pandas.DataFrame
        One row per (season, week) group with the columns ``season``,
        ``week``, ``n_competing`` (rows with a percentage > 0),
        ``n_eliminated`` (rows whose ``was_eliminated`` equals True),
        ``n_safe`` (rows whose ``was_eliminated`` equals False) and
        ``has_percentage_data`` (``n_competing > 0``). The columns are
        present even when no row matches the selected seasons.
    season_data : pandas.DataFrame
        Copy of the rows of ``df`` that belong to the selected seasons.
    """
    if percentage_seasons is None:
        percentage_seasons = range(3, 28)
    season_data = df[df['season'].isin(list(percentage_seasons))].copy()

    print(f"百分比赛季数据: {len(season_data)} 行")

    summary_columns = ['season', 'week', 'n_competing', 'n_eliminated',
                       'n_safe', 'has_percentage_data']
    analysis_results = []

    for (season, week), group in season_data.groupby(['season', 'week']):
        # Contestants who actually received a score share this week
        n_competing = int((group['weekly_score_percentage'] > 0).sum())
        # Explicit comparisons (rather than truthiness) so missing flags count as neither
        n_eliminated = int((group['was_eliminated'] == True).sum())
        n_safe = int((group['was_eliminated'] == False).sum())

        analysis_results.append({
            'season': season,
            'week': week,
            'n_competing': n_competing,
            'n_eliminated': n_eliminated,
            'n_safe': n_safe,
            'has_percentage_data': n_competing > 0
        })

    # Passing the column list keeps the schema stable when there are no groups,
    # so callers can still index the summary columns without a KeyError.
    analysis_df = pd.DataFrame(analysis_results, columns=summary_columns)
    return analysis_df, season_data

# Run the analysis; percentage_data (the rows of the selected seasons) is reused by later cells
elimination_analysis, percentage_data = correct_elimination_understanding(df)

# The summary has one row per (season, week) group, so its length is the number of season-weeks
print("\n淘汰模式分析摘要:")
print(f"总周次: {len(elimination_analysis)}")
print(f"有百分比数据的周次: {elimination_analysis['has_percentage_data'].sum()}")
print(f"平均每周淘汰人数: {elimination_analysis['n_eliminated'].mean():.2f}")

百分比赛季数据: 3366 行

淘汰模式分析摘要:
总周次: 275
有百分比数据的周次: 248
平均每周淘汰人数: 4.98


In [30]:
# Build a robust modelling dataset
def create_robust_modeling_dataset(data):
    """Build the modelling dataset for the percentage-based elimination mechanism.

    Only rows with a non-missing, strictly positive ``weekly_score_percentage``
    are kept. Two columns are added:

    * ``eliminated_this_week`` - True on a contestant's last week *with a valid
      score* within a season, except for rows whose ``placement`` is 1 (when a
      ``placement`` column exists).
    * ``percentage_rank`` - dense rank of ``weekly_score_percentage`` within each
      (season, week), highest percentage ranked 1.

    The last-week flag is derived after the score filter on purpose: if the
    input also carries score-less rows for weeks after a contestant left,
    computing the flag first would put it on one of those rows, which is then
    dropped, leaving mid-season eliminations unmarked.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``celebrity_name``, ``season``, ``week`` and
        ``weekly_score_percentage``; ``placement`` is optional.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, sorted by contestant, season and week, with the two
        extra columns. An empty DataFrame (no columns) when no row has a valid
        percentage.
    """
    data = data.copy()
    data = data.sort_values(['celebrity_name', 'season', 'week'])

    # Keep only weeks in which the contestant actually has a score share
    has_score = data['weekly_score_percentage'].notna() & (data['weekly_score_percentage'] > 0)
    valid_data = data[has_score].copy()

    if valid_data.empty:
        print("没有有效的百分比数据")
        return pd.DataFrame()

    # A contestant's elimination week is the last scored week of their season:
    # the row that has no following scored week within (contestant, season).
    next_week = valid_data.groupby(['celebrity_name', 'season'])['week'].shift(-1)
    valid_data['eliminated_this_week'] = next_week.isna()
    if 'placement' in valid_data.columns:
        # The winner's final week is not an elimination
        valid_data.loc[valid_data['placement'] == 1, 'eliminated_this_week'] = False

    print(f"有效数据记录数: {len(valid_data)}")
    print(f"包含赛季: {sorted(valid_data['season'].unique())}")
    print(f"包含周次: {valid_data[['season', 'week']].drop_duplicates().shape[0]}个")

    # Rank contestants within each week by their score share (1 = highest)
    weekly_pct = valid_data.groupby(['season', 'week'])['weekly_score_percentage']
    valid_data['percentage_rank'] = weekly_pct.rank(method='dense', ascending=False)

    # Lenient quality check: the shares of one week should sum to 100 within +/-10
    quality_df = weekly_pct.agg(n_players='size', percentage_sum='sum').reset_index()
    quality_df['quality'] = [
        'good' if 90 <= pct_sum <= 110 else f'check_needed (sum: {pct_sum:.1f}%)'
        for pct_sum in quality_df['percentage_sum']
    ]
    print(f"\n数据质量分布:")
    print(quality_df['quality'].value_counts())

    return valid_data

# Build the modelling dataset from the rows of the selected seasons
modeling_data = create_robust_modeling_dataset(percentage_data)

有效数据记录数: 1997
包含赛季: [np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27)]
包含周次: 248个

数据质量分布:
quality
good    248
Name: count, dtype: int64


In [31]:
# Summarise the modelling dataset and write it to disk
if modeling_data.empty:
    print("无法创建建模数据集")
else:
    print(f"\n=== 最终建模数据集 ===")
    print(f"数据形状: {modeling_data.shape}")
    print(f"季节数量: {modeling_data['season'].nunique()}")
    print(f"选手数量: {modeling_data['celebrity_name'].nunique()}")

    # Seasons that made it into the dataset
    seasons = modeling_data['season'].unique()
    print(f"包含的赛季: {sorted(seasons)}")

    # Persist the dataset without the index column
    modeling_data.to_csv(OUTPUT_MODELING_DATA, index=False)
    print(f"数据集已保存为: {OUTPUT_MODELING_DATA}")

    # Preview the key columns
    print("\n数据样例:")
    sample_cols = ['season', 'week', 'celebrity_name', 'weekly_score_percentage',
                   'percentage_rank', 'eliminated_this_week']
    print(modeling_data.loc[:, sample_cols].head(10))


=== 最终建模数据集 ===
数据形状: (1997, 14)
季节数量: 25
选手数量: 295
包含的赛季: [np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27)]
数据集已保存为: 2_PERCENT_modeling_data.csv

数据样例:
    season  week celebrity_name  weekly_score_percentage  percentage_rank  \
11       9   1.0   Aaron Carter                 7.881780              2.0   
12       9   2.0   Aaron Carter                 9.183673              1.0   
13       9   3.0   Aaron Carter                 8.139535              4.0   
14       9   4.0   Aaron Carter                 7.438017              6.0   
15       9   5.0   Aaron Carter                 9.917355              4.0   
16       9   6.0   Aaron Carter                12.692275              3.0   
17       9   

In [32]:
# Additional data validation
def validate_elimination_patterns(data):
    """Flag (season, week) groups whose number of eliminations looks implausible.

    The last two weeks of each season (by the maximum ``week`` present in
    ``data``) and any week with at most four rows are exempt. Of the remaining
    weeks, one is reported when more than half of its rows are marked in
    ``eliminated_this_week``, or otherwise when more than three are.

    Prints a short report (at most the first five findings) and returns the
    full list of finding messages. Returns an empty list when the
    ``eliminated_this_week`` column is missing.
    """
    print("=== 淘汰模式验证 ===")

    if 'eliminated_this_week' not in data.columns:
        print("缺少 eliminated_this_week 列，无法验证淘汰模式")
        return []

    # Last week seen per season, used to recognise the semi-final / final stage
    last_week_by_season = data.groupby('season')['week'].max()

    issues = []
    for (season, week), weekly_rows in data.groupby(['season', 'week']):
        n_total = len(weekly_rows)
        n_eliminated = int(weekly_rows['eliminated_this_week'].sum())

        # Multiple exits are expected in the last two weeks and in tiny fields
        in_final_stage = week >= (last_week_by_season.loc[season] - 1)
        if in_final_stage or n_total <= 4:
            continue

        # Normally one or two contestants leave per week
        if n_eliminated > n_total * 0.5:
            issues.append(f"赛季{season}第{week}周: 淘汰{n_eliminated}/{n_total}人 (比例异常)")
        elif n_eliminated > 3:
            issues.append(f"赛季{season}第{week}周: 淘汰{n_eliminated}人 (可能过多)")

    if not issues:
        print("淘汰模式验证通过")
        return issues

    print(f"发现{len(issues)}个潜在问题:")
    for issue in issues[:5]:  # show only the first five
        print(f"  {issue}")
    if len(issues) > 5:
        print(f"  ... 以及{len(issues)-5}个其他问题")
    return issues

# Run the validation (skipped when no modelling dataset was produced)
if not modeling_data.empty:
    validation_issues = validate_elimination_patterns(modeling_data)

print("\n=== 数据处理完成 ===")

=== 淘汰模式验证 ===
淘汰模式验证通过

=== 数据处理完成 ===
