In [1]:
import pandas as pd
import os
import random

In [2]:
import pandas as pd
import os

# Category names. Each name is used as a column of the content-level
# category file, where a value of 1 marks a test_id as belonging to it,
# and as the prefix of the per-category output file name.
CATEGORIES = [
    'Social Issues',
    'Politics & International',
    'Culture & Lifestyle',
    'Environment & Safety',
    'Science & Technology',
    'Economy & Business'
]

def split_datasets_by_category(input_file, content_file, output_dir):
    """
    Split the classified dataset into one CSV file per category.

    The full dataset is read from ``input_file`` and the test_id-level
    category flags from ``content_file``. For every name in ``CATEGORIES``
    the test_ids whose flag column equals 1 are collected, the matching rows
    of the full dataset are selected, and the columns test_id, headline, CTR
    and pred_CTR_1 .. pred_CTR_63 are written to
    ``<output_dir>/<category>_dataset.csv``.

    Categories with no flagged test_id, or with no matching rows in the full
    dataset, are reported and skipped. ``output_dir`` is created if needed.

    Args:
        input_file: Path of the CSV holding the full dataset.
        content_file: Path of the CSV holding one flag column per category.
        output_dir: Directory that receives the per-category CSV files.
    """
    # Make sure the destination directory exists before any file is written.
    os.makedirs(output_dir, exist_ok=True)

    full_df = pd.read_csv(input_file)
    flags_df = pd.read_csv(content_file)

    # Columns kept in every per-category file.
    wanted_columns = ['test_id', 'headline', 'CTR']
    wanted_columns.extend(f'pred_CTR_{k}' for k in range(1, 64))

    for category in CATEGORIES:
        # test_ids flagged (value 1) as belonging to this category.
        is_flagged = flags_df[category] == 1
        member_ids = flags_df.loc[is_flagged, 'test_id'].tolist()
        if len(member_ids) == 0:
            print(f"警告：类别 '{category}' 没有test_id，跳过。")
            continue

        # All rows of the full dataset that belong to those test_ids.
        in_category = full_df['test_id'].isin(member_ids)
        subset = full_df.loc[in_category, wanted_columns]
        if subset.empty:
            print(f"警告：类别 '{category}' 没有数据，跳过。")
            continue

        target_path = os.path.join(output_dir, f"{category}_dataset.csv")
        subset.to_csv(target_path, index=False)
        print(f"保存类别 '{category}' 的数据集到 {target_path}，行数：{len(subset)}")


# All paths are resolved relative to the current working directory.
# NOTE(review): the __file__-based variant below is disabled, presumably
# because __file__ is not defined when this runs as a notebook cell - confirm.
# script_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.getcwd()
print("当前脚本的文件夹位置为: ", script_dir)
input_file = os.path.join(script_dir, "classified_dataset.csv")
content_file = os.path.join(script_dir, "content_level_categories.csv")
output_dir = os.path.join(script_dir, "category_datasets")

# Run the split: writes one "<category>_dataset.csv" per category into output_dir.
split_datasets_by_category(input_file = input_file, content_file = content_file, output_dir = output_dir)

当前脚本的文件夹位置为:  e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy
保存类别 'Social Issues' 的数据集到 e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy\category_datasets\Social Issues_dataset.csv，行数：9935
保存类别 'Politics & International' 的数据集到 e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy\category_datasets\Politics & International_dataset.csv，行数：2759
保存类别 'Culture & Lifestyle' 的数据集到 e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy\category_datasets\Culture & Lifestyle_dataset.csv，行数：10253
保存类别 'Environment & Safety' 的数据集到 e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy\category_datasets\Environment & Safety_dataset.csv，行数：2154
保存类别 'Science & Technology' 的数据集到 e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy\category_datasets\Science & Technology_dataset.csv，行数：2180
保存类别 'Economy & Business' 的数据集到 e:\VS code project\Cross Attention for Combo\Classif

In [None]:
# Category names. Each name selects the "<category>_dataset.csv" input file
# to evaluate and names the resulting per-category accuracy CSV.
CATEGORIES = [
    'Social Issues',
    'Politics & International',
    'Culture & Lifestyle',
    'Environment & Safety',
    'Science & Technology',
    'Economy & Business'
]

def evaluate_accuracy(df):
    """
    Measure how often the top-predicted row of a test is a true best row.

    Rows are grouped by ``test_id``. Within a group, the "true best" rows are
    all rows whose ``CTR`` equals the group maximum (ties all count). A group
    is a hit when the row with the highest ``pred_CTR`` is one of them. As a
    baseline, one row per group is drawn with ``random.choice`` and scored
    the same way, so the baseline figures vary between runs unless the
    ``random`` module is seeded by the caller.

    Args:
        df: DataFrame with the columns ``test_id``, ``CTR`` and ``pred_CTR``.

    Returns:
        Tuple ``(accuracy, random_accuracy, correct, random_correct, total)``
        where ``total`` is the number of groups evaluated. All zeros are
        returned when ``df`` is empty or has no ``pred_CTR`` column.
    """
    if df.empty or "pred_CTR" not in df.columns:
        print("错误：数据集为空或缺少 pred_CTR 列，跳过准确率计算")
        return 0.0, 0.0, 0, 0, 0

    hits = 0
    baseline_hits = 0
    n_tests = 0

    for _, rows in df.groupby("test_id"):
        if not len(rows):
            continue

        # Index labels of every row that ties for the highest true CTR.
        is_best = (rows["CTR"] == rows["CTR"].max()).to_numpy()
        winners = rows.index[is_best].tolist()

        # Index label of the row the model ranks first.
        predicted_label = rows["pred_CTR"].idxmax()
        candidate_labels = rows.index.tolist()

        n_tests += 1
        hits += int(predicted_label in winners)

        # Random baseline: pick any row of the group uniformly.
        drawn_label = random.choice(candidate_labels)
        baseline_hits += int(drawn_label in winners)

    if n_tests > 0:
        accuracy = hits / n_tests
        random_accuracy = baseline_hits / n_tests
    else:
        accuracy = 0.0
        random_accuracy = 0.0

    print(f"准确率计算：正确 {hits}/{n_tests}，准确率 {accuracy:.3f}")
    print(f"随机准确率：正确 {baseline_hits}/{n_tests}，随机准确率 {random_accuracy:.3f}")
    return accuracy, random_accuracy, hits, baseline_hits, n_tests

def compute_accuracy_for_category(input_dir='category_datasets', output_dir='accuracy_results', combination_file='all_combinations.csv', num_models=63):
    """
    Compute the accuracy of every model for each category and save it as CSV.

    For each name in ``CATEGORIES`` the file
    ``<input_dir>/<category>_dataset.csv`` is loaded and every column
    ``pred_CTR_1`` .. ``pred_CTR_<num_models>`` is scored with
    ``evaluate_accuracy``. Each model is labelled with the ``Combination``
    value that ``combination_file`` maps to its column name (via the
    ``New_Column_Name`` column), or ``Unknown_<i>`` when no mapping exists.
    One result file ``<category>_accuracy_different_model.csv`` per category
    is written to ``output_dir``, which is created if needed.

    Missing dataset files, datasets lacking ``test_id``/``CTR``, and missing
    prediction columns are reported and skipped rather than aborting the run.
    If the combination file is missing or lacks a required column, an error
    is printed and the function returns without writing anything.

    Args:
        input_dir: Directory holding the per-category dataset CSV files.
        output_dir: Directory that receives the accuracy CSV files.
        combination_file: CSV with ``New_Column_Name`` and ``Combination``.
        num_models: Number of prediction columns to evaluate (default 63).

    Returns:
        None.
    """
    # Load the pred_CTR_{i} -> Combination mapping.
    try:
        combination_df = pd.read_csv(combination_file)
        pred_to_combination = dict(zip(combination_df['New_Column_Name'], combination_df['Combination']))
    except FileNotFoundError:
        print(f"错误：未找到 {combination_file}，无法映射Combination")
        return
    except KeyError as e:
        print(f"错误：{combination_file} 缺少必要列 {e}")
        return

    os.makedirs(output_dir, exist_ok=True)

    # Columns evaluate_accuracy needs besides the prediction itself.
    key_columns = ['test_id', 'CTR']

    for category in CATEGORIES:
        input_file = os.path.join(input_dir, f"{category}_dataset.csv")
        if not os.path.exists(input_file):
            print(f"警告：类别 '{category}' 的数据集不存在，跳过。")
            continue

        df = pd.read_csv(input_file)

        # Without the key columns no model can be scored; skip this category
        # instead of failing inside the evaluation and losing later categories.
        missing = [col for col in key_columns if col not in df.columns]
        if missing:
            print(f"警告：类别 '{category}' 的数据集缺少必要列 {missing}，跳过。")
            continue

        # Only the key columns are needed per model, so avoid copying the
        # whole (wide) dataset once for every prediction column.
        key_frame = df[key_columns]

        results = []
        for i in range(1, num_models + 1):
            pred_col = f'pred_CTR_{i}'
            if pred_col not in df.columns:
                print(f"警告：缺少列 {pred_col}，跳过模型 {i}")
                continue

            combination = pred_to_combination.get(pred_col, f"Unknown_{i}")

            # Present this model's predictions under the generic 'pred_CTR'
            # name expected by evaluate_accuracy; assign returns a new frame.
            df_temp = key_frame.assign(pred_CTR=df[pred_col])

            accuracy, random_accuracy, correct, random_correct, total = evaluate_accuracy(df_temp)

            results.append({
                'Combination': combination,
                'accuracy': accuracy,
                'random_accuracy': random_accuracy,
                'correct': correct,
                'random_correct': random_correct,
                'total': total
            })

        results_df = pd.DataFrame(results)
        output_file = os.path.join(output_dir, f"{category}_accuracy_different_model.csv")
        results_df.to_csv(output_file, index=False)
        print(f"保存类别 '{category}' 的准确率结果到 {output_file}")

# All paths are resolved relative to the current working directory.
# NOTE(review): the __file__-based variant below is disabled, presumably
# because __file__ is not defined when this runs as a notebook cell - confirm.
# script_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.getcwd()
print("当前脚本的文件夹位置为: ", script_dir)
# The combination mapping is read from a folder next to the working directory.
parent_dir = os.path.dirname(script_dir)
    
input_dir = os.path.join(script_dir, "category_datasets")
combination_file = os.path.join(parent_dir, "Merge_Combo_RESULT", "all_combinations.csv")
output_dir = os.path.join(script_dir, "accuracy_results")
# Run the evaluation: writes one accuracy CSV per category into output_dir.
compute_accuracy_for_category(input_dir = input_dir, output_dir = output_dir, combination_file = combination_file)

当前脚本的文件夹位置为:  e:\VS code project\Cross Attention for Combo\Classification_ClassAccuracy
准确率计算：正确 1161/2616，准确率 0.444
随机准确率：正确 799/2616，随机准确率 0.305
准确率计算：正确 1166/2616，准确率 0.446
随机准确率：正确 839/2616，随机准确率 0.321
准确率计算：正确 1166/2616，准确率 0.446
随机准确率：正确 827/2616，随机准确率 0.316
准确率计算：正确 1151/2616，准确率 0.440
随机准确率：正确 813/2616，随机准确率 0.311
准确率计算：正确 1156/2616，准确率 0.442
随机准确率：正确 793/2616，随机准确率 0.303
准确率计算：正确 1175/2616，准确率 0.449
随机准确率：正确 817/2616，随机准确率 0.312
准确率计算：正确 1140/2616，准确率 0.436
随机准确率：正确 809/2616，随机准确率 0.309
准确率计算：正确 1145/2616，准确率 0.438
随机准确率：正确 851/2616，随机准确率 0.325
准确率计算：正确 1135/2616，准确率 0.434
随机准确率：正确 785/2616，随机准确率 0.300
准确率计算：正确 1131/2616，准确率 0.432
随机准确率：正确 810/2616，随机准确率 0.310
准确率计算：正确 1135/2616，准确率 0.434
随机准确率：正确 820/2616，随机准确率 0.313
准确率计算：正确 1161/2616，准确率 0.444
随机准确率：正确 787/2616，随机准确率 0.301
准确率计算：正确 1141/2616，准确率 0.436
随机准确率：正确 815/2616，随机准确率 0.312
准确率计算：正确 1189/2616，准确率 0.455
随机准确率：正确 822/2616，随机准确率 0.314
准确率计算：正确 1177/2616，准确率 0.450
随机准确率：正确 816/2616，随机准确率 0.312
准确率计算：正确 1160/2616，准确率 0.44