In [2]:
import pandas as pd
import numpy as np
from typing import Tuple, List

def classify_student_type(row: pd.Series) -> str:
    """
    Classify one record as "偏科型" (lopsided) or "均衡型" (balanced).

    Procedure:
    1. Parse the comma-separated ``concepts`` and ``responses`` strings and
       drop every position where either value is the padding marker -1.
    2. Compute the per-concept accuracy (``calculate_concept_accuracy``) and
       the number of answered questions per concept.
    3. Significance filter - only concepts with at least 5 questions count:
       - strong concept: accuracy > 0.8
       - weak concept:   accuracy < 0.4
    4. If at least one strong and one weak concept exist, pass both groups
       to ``calculate_cohens_d`` and return "偏科型" when the absolute value
       of its result is greater than 0.8.

    NOTE: the score compared against 0.8 is whatever ``calculate_cohens_d``
    returns; as implemented in this file that is the unstandardized accuracy
    gap between the two groups, not a standardized effect size.

    Args:
        row: Record exposing string fields ``concepts`` and ``responses``
            (comma-separated integers, -1 used as padding).

    Returns:
        "偏科型" or "均衡型". Records with no valid (concept, response) pair,
        or lacking either a strong or a weak concept, are "均衡型".
    """
    # Function-scope import so this block does not depend on file-level imports.
    from collections import Counter

    all_concepts = [int(x) for x in row['concepts'].split(',')]
    all_responses = [int(x) for x in row['responses'].split(',')]

    # Keep only positions where neither the concept nor the response is padding.
    valid_data = [(concept, response)
                  for concept, response in zip(all_concepts, all_responses)
                  if concept != -1 and response != -1]

    if not valid_data:
        return "均衡型"  # nothing to analyse -> balanced by default

    concepts, responses = (list(column) for column in zip(*valid_data))

    concept_accuracy = calculate_concept_accuracy(concepts, responses)
    # Count questions per concept in one pass instead of calling
    # list.count() once per concept (which rescans the whole list each time).
    concept_counts = Counter(concepts)

    strong_concepts = {}
    weak_concepts = {}

    for concept, accuracy in concept_accuracy.items():
        concept_count = concept_counts[concept]
        if concept_count < 5:
            continue  # too few questions for the accuracy to be meaningful

        stats = {'accuracy': accuracy, 'count': concept_count}
        if accuracy > 0.8:
            strong_concepts[concept] = stats
        elif accuracy < 0.4:
            weak_concepts[concept] = stats

    # Lopsidedness needs both extremes to be present.
    if not strong_concepts or not weak_concepts:
        return "均衡型"

    cohens_d = calculate_cohens_d(strong_concepts, weak_concepts)

    return "偏科型" if abs(cohens_d) > 0.8 else "均衡型"

def parse_student_data(row: pd.Series) -> dict:
    """
    Convert one raw record into plain Python lists with padding removed.

    The ``questions``, ``concepts``, ``responses`` and ``is_repeat`` fields
    are comma-separated integer strings. A position is kept only when both
    its concept and its response differ from the padding marker -1; the same
    positions are then selected from the other two lists (positions beyond
    the end of ``questions`` / ``is_repeat`` are silently skipped).

    Args:
        row: Raw record with the fields named above plus ``fold`` and ``uid``.

    Returns:
        dict with keys ``fold``, ``uid``, ``questions``, ``concepts``,
        ``responses`` and ``is_repeat``.
    """
    def _split_ints(field):
        # Parse a comma-separated integer string taken from the record.
        return [int(token) for token in row[field].split(',')]

    question_ids = _split_ints('questions')
    concept_ids = _split_ints('concepts')
    answers = _split_ints('responses')
    repeat_flags = _split_ints('is_repeat')

    # Positions whose concept AND response are both real (non-padding) values.
    keep = []
    for position, (concept, answer) in enumerate(zip(concept_ids, answers)):
        if concept == -1 or answer == -1:
            continue
        keep.append(position)

    parsed = {'fold': row['fold'], 'uid': row['uid']}
    parsed['questions'] = [question_ids[p] for p in keep if p < len(question_ids)]
    parsed['concepts'] = [concept_ids[p] for p in keep]
    parsed['responses'] = [answers[p] for p in keep]
    parsed['is_repeat'] = [repeat_flags[p] for p in keep if p < len(repeat_flags)]
    return parsed

def calculate_cohens_d(strong_concepts: dict, weak_concepts: dict,
                       standardize: bool = False) -> float:
    """
    Score the separation between a student's strong and weak concepts.

    For each group the question-count-weighted mean accuracy ``p`` and the
    question-count-weighted variance ``s^2`` of the per-concept accuracies
    are computed. The pooled variance is

        ((N_strong-1)*s_strong^2 + (N_weak-1)*s_weak^2) / (N_strong + N_weak - 2)

    where ``N`` is the number of concepts in each group.

    By default (``standardize=False``) the function returns the raw gap
    ``p_strong - p_weak``; despite the function name this is NOT divided by
    the pooled standard deviation. Pass ``standardize=True`` to obtain
    ``(p_strong - p_weak) / sqrt(pooled_variance)``.

    In both modes 0.0 is returned when the two groups together contain at
    most two concepts, or when the pooled variance is exactly zero.
    NOTE(review): in the default raw-gap mode these guards still apply, so a
    student with exactly one strong and one weak concept always scores 0.0 -
    confirm this is intended.

    Args:
        strong_concepts: {concept_id: {'accuracy': float, 'count': int}}
        weak_concepts: {concept_id: {'accuracy': float, 'count': int}}
        standardize: When True, divide the gap by the pooled standard
            deviation. Defaults to False (raw gap).

    Returns:
        float: the raw or standardized gap, or 0.0 under the guards above.

    Raises:
        ZeroDivisionError: if either group is empty (total count is zero).
    """
    def _weighted_mean_and_variance(group: dict):
        # Question-count-weighted mean and variance of per-concept accuracy.
        accuracies = [data['accuracy'] for data in group.values()]
        counts = [data['count'] for data in group.values()]
        total_questions = sum(counts)
        mean = sum(acc * count for acc, count in zip(accuracies, counts)) / total_questions
        variance = sum(count * (acc - mean) ** 2
                       for acc, count in zip(accuracies, counts)) / total_questions
        return mean, variance

    p_strong, strong_variance = _weighted_mean_and_variance(strong_concepts)
    p_weak, weak_variance = _weighted_mean_and_variance(weak_concepts)

    n_strong = len(strong_concepts)
    n_weak = len(weak_concepts)

    # The pooled variance has zero degrees of freedom with <= 2 concepts.
    if n_strong + n_weak <= 2:
        return 0.0

    pooled_variance = ((n_strong - 1) * strong_variance
                       + (n_weak - 1) * weak_variance) / (n_strong + n_weak - 2)

    # No spread within the groups: the standardized value would be undefined.
    if pooled_variance == 0:
        return 0.0

    gap = p_strong - p_weak
    if standardize:
        return float(gap / np.sqrt(pooled_variance))
    return gap

def main(input_csv_path: str, output_dir: str = './'):
    """
    Read the CSV, label every row with ``classify_student_type`` and write
    the two resulting groups to separate CSV files.

    Rows labelled "偏科型" go to ``pk.csv`` and rows labelled "均衡型" go to
    ``ph.csv`` inside ``output_dir``. The temporary ``student_type`` column
    is dropped before saving. Progress and counts are printed to stdout.

    NOTE: a later definition in this file reuses the name ``main``.

    Args:
        input_csv_path: Path of the input CSV file.
        output_dir: Directory that receives the output files.

    Returns:
        tuple: (specialized_students, balanced_students) DataFrames, without
        the ``student_type`` column.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import os

    # 1. Load the input
    print("正在读取CSV文件...")
    df = pd.read_csv(input_csv_path)
    print(f"共读取到 {len(df)} 条学生记录")

    # 2. Label each row
    print("正在分类学生...")
    df['student_type'] = df.apply(classify_student_type, axis=1)

    # 3. Report how many rows fell into each class
    type_counts = df['student_type'].value_counts()
    print("分类结果统计:")
    for student_type, count in type_counts.items():
        print(f"  {student_type}: {count} 人")

    # 4. Split by label and drop the helper column
    specialized_students = df[df['student_type'] == '偏科型'].copy()
    balanced_students = df[df['student_type'] == '均衡型'].copy()

    specialized_students = specialized_students.drop('student_type', axis=1)
    balanced_students = balanced_students.drop('student_type', axis=1)

    # 5. Save; os.path.join avoids a doubled separator when output_dir
    #    already ends with '/', and a root-relative path when it is empty.
    specialized_output_path = os.path.join(output_dir, "pk.csv")
    balanced_output_path = os.path.join(output_dir, "ph.csv")

    specialized_students.to_csv(specialized_output_path, index=False)
    balanced_students.to_csv(balanced_output_path, index=False)

    print(f"偏科型学生数据已保存到: {specialized_output_path} ({len(specialized_students)} 人)")
    print(f"均衡型学生数据已保存到: {balanced_output_path} ({len(balanced_students)} 人)")

    return specialized_students, balanced_students
def calculate_concept_accuracy(concepts: List[int], responses: List[int]) -> dict:
    """
    Compute the fraction of correct answers for every concept.

    ``concepts`` and ``responses`` are read in lockstep; a response counts
    as correct when it equals 1.

    Args:
        concepts: Concept id of each answered question.
        responses: Answer outcome of each question, aligned with ``concepts``.

    Returns:
        dict: concept id -> accuracy (correct / total), with keys in order of
        first appearance.
    """
    attempts = {}  # concept id -> number of questions seen
    hits = {}      # concept id -> number of correct answers

    for concept_id, answer in zip(concepts, responses):
        attempts[concept_id] = attempts.get(concept_id, 0) + 1
        hits[concept_id] = hits.get(concept_id, 0) + (1 if answer == 1 else 0)

    return {concept_id: hits[concept_id] / attempts[concept_id]
            for concept_id in attempts}
# Usage example
if __name__ == "__main__":
    # Input CSV path
    input_file = "/root/autodl-tmp/pykt_self_version/data/assist2009/old_test_question_window_sequences.csv"  # replace with your actual file path
    output_directory = "/root/autodl-tmp/pykt_self_version/data/assist2009/"  # output directory
    
    # Run the classification and write the result files
    specialized_df, balanced_df = main(input_file, output_directory)
    
    print("处理完成！")

正在读取CSV文件...
共读取到 39807 条学生记录
正在分类学生...
分类结果统计:
  均衡型: 38408 人
  偏科型: 1399 人
偏科型学生数据已保存到: /root/autodl-tmp/pykt_self_version/data/assist2009//sp.csv (1399 人)
均衡型学生数据已保存到: /root/autodl-tmp/pykt_self_version/data/assist2009//ba.csv (38408 人)
处理完成！


Traceback (most recent call last):
  File "/root/autodl-tmp/pykt_self_version/examples/wandb_predict.py", line 14, in <module>
    with open("../configs/wandb.json") as fin:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '../configs/wandb.json'


In [None]:
!python wandb_predict.py --save_dir "/root/autodl-tmp/pykt_self_version/examples/saved_model/assist2009_dkt_qid_saved_model_3407_0_0.5_256_0.001_0_1_0"

In [3]:
import pandas as pd
import numpy as np
from typing import Tuple, List, Dict

def calculate_student_variance(uid_data: pd.DataFrame) -> float:
    """
    Measure how uneven one student's per-concept accuracy is.

    All rows of ``uid_data`` are pooled. Positions where the concept or the
    response is the padding marker -1 are ignored. Only concepts answered
    more than 5 times are considered. The result is the mean accuracy of the
    top 20% of those concepts minus the mean accuracy of the bottom 20%
    (each slice contains at least one concept). Despite the function name,
    this is a difference of means, not a statistical variance.

    Args:
        uid_data: DataFrame holding every row of a single uid, with string
            columns ``concepts`` and ``responses`` (comma-separated ints).

    Returns:
        float: top-slice mean minus bottom-slice mean, or 0.0 when fewer
        than two concepts qualify.
    """
    # concept id -> [correct answers, total answers]
    tally = {}
    for _, record in uid_data.iterrows():
        concept_ids = [int(token) for token in record['concepts'].split(',')]
        answers = [int(token) for token in record['responses'].split(',')]
        for concept_id, answer in zip(concept_ids, answers):
            if concept_id == -1 or answer == -1:
                continue  # padding
            counters = tally.setdefault(concept_id, [0, 0])
            counters[1] += 1
            if answer == 1:
                counters[0] += 1

    # Accuracies of sufficiently practised concepts, best first.
    ranked = sorted(
        (correct / total for correct, total in tally.values() if total > 5),
        reverse=True,
    )

    # With zero or one qualifying concept there is no gap to measure.
    if len(ranked) < 2:
        return 0.0

    slice_size = max(1, int(len(ranked) * 0.2))
    best = ranked[:slice_size]
    worst = ranked[-slice_size:]

    return sum(best) / len(best) - sum(worst) / len(worst)

def classify_students_by_variance(df: pd.DataFrame) -> pd.DataFrame:
    """
    Split students into three groups by accuracy gap, balancing the groups by
    record (row) count rather than by number of students.

    Students are ranked by the value returned by calculate_student_variance,
    largest first. Walking down that ranking, students are labelled '偏科型'
    until about one third of all records is covered, then '普通型' until about
    two thirds is covered; every remaining student is labelled '均衡型'.

    Args:
        df: Input DataFrame; must contain a 'uid' column (each uid group is
            passed to calculate_student_variance).

    Returns:
        pd.DataFrame: the same DataFrame object that was passed in, with
        'variance' and 'student_type' columns added in place.
    """
    # Per-student gap value and number of records, keyed by uid
    student_info = {}
    
    for uid, group in df.groupby('uid'):
        variance = calculate_student_variance(group)
        record_count = len(group)
        student_info[uid] = {
            'variance': variance,
            'record_count': record_count
        }
    
    # Build a student-level DataFrame (one row per uid)
    student_df = pd.DataFrame.from_dict(student_info, orient='index')
    student_df.reset_index(inplace=True)
    student_df.columns = ['uid', 'variance', 'record_count']
    
    # Sort by gap, descending (largest gap first)
    student_df = student_df.sort_values('variance', ascending=False)
    
    # Total number of records and the per-group target (one third)
    total_records = student_df['record_count'].sum()
    target_per_group = total_records / 3
    
    # Greedy assignment of students to the three groups;
    # an empty string marks "not assigned yet"
    student_df['student_type'] = ''
    cumulative_records = 0
    
    # Phase 1: assign '偏科型' until the first third of records is covered
    for idx, row in student_df.iterrows():
        if cumulative_records < target_per_group:
            student_df.at[idx, 'student_type'] = '偏科型'
            cumulative_records += row['record_count']
        else:
            # Target reached: take this student only if doing so lands closer
            # to the target. NOTE(review): with positive record counts this
            # cannot be true once the target is met, so the loop stops here.
            if abs(cumulative_records - target_per_group) > abs(cumulative_records + row['record_count'] - target_per_group):
                student_df.at[idx, 'student_type'] = '偏科型'
                cumulative_records += row['record_count']
            else:
                break
    
    # Phase 2: assign '普通型' to still-unlabelled students until two thirds
    # of the records are covered (cumulative_records carries over from phase 1)
    for idx, row in student_df.iterrows():
        if student_df.at[idx, 'student_type'] == '':
            if cumulative_records < 2 * target_per_group:
                student_df.at[idx, 'student_type'] = '普通型'
                cumulative_records += row['record_count']
            else:
                # Same "closer to target" check as in phase 1
                if abs(cumulative_records - 2 * target_per_group) > abs(cumulative_records + row['record_count'] - 2 * target_per_group):
                    student_df.at[idx, 'student_type'] = '普通型'
                    cumulative_records += row['record_count']
                else:
                    break
    
    # Phase 3: everyone still unlabelled is '均衡型'
    student_df.loc[student_df['student_type'] == '', 'student_type'] = '均衡型'
    
    # Map the student-level results back onto the original rows by uid
    type_mapping = dict(zip(student_df['uid'], student_df['student_type']))
    variance_mapping = dict(zip(student_df['uid'], student_df['variance']))
    
    # These assignments modify the caller's DataFrame in place
    df['variance'] = df['uid'].map(variance_mapping)
    df['student_type'] = df['uid'].map(type_mapping)
    
    return df

def main(input_csv_path: str, output_dir: str = './'):
    """
    Read the CSV, split students into three groups with
    ``classify_students_by_variance``, print a report and save each group.

    Rows of '偏科型' students are written to ``pk.csv``, '普通型' to ``pt.csv``
    and '均衡型' to ``ph.csv`` inside ``output_dir``. The temporary
    ``student_type`` and ``variance`` columns are removed before saving.

    Args:
        input_csv_path: Path of the input CSV file (must contain a ``uid``
            column).
        output_dir: Directory that receives the output files.

    Returns:
        tuple: (specialized_students, normal_students, balanced_students)
        DataFrames without the temporary columns.
    """
    # Function-scope import so this block does not depend on file-level imports.
    import os

    # 1. Load the input
    print("正在读取CSV文件...")
    df = pd.read_csv(input_csv_path)
    print(f"共读取到 {len(df)} 条记录")
    print(f"共有 {df['uid'].nunique()} 个不同的学生")

    # 2. Label every row with its student's group
    print("\n正在计算学生知识点掌握差异度...")
    df_classified = classify_students_by_variance(df)

    # 3. Summary statistics: one row per student, plus per-group counts
    student_summary = df_classified[['uid', 'student_type', 'variance']].drop_duplicates()

    record_counts = df_classified['student_type'].value_counts()
    total_records = len(df_classified)

    student_counts = student_summary['student_type'].value_counts()

    print("\n分类结果统计:")
    print(f"总记录数: {total_records}")
    print(f"总学生数: {len(student_summary)}")

    for student_type in ['偏科型', '普通型', '均衡型']:
        if student_type not in record_counts.index:
            continue  # group is empty, nothing to report

        record_count = record_counts[student_type]
        record_percentage = (record_count / total_records) * 100

        student_count = student_counts[student_type]
        student_percentage = (student_count / len(student_summary)) * 100

        print(f"\n  {student_type}:")
        print(f"    记录数: {record_count} ({record_percentage:.1f}%)")
        print(f"    学生数: {student_count} ({student_percentage:.1f}%)")

        # Range and mean of the gap value inside this group
        type_variances = student_summary[student_summary['student_type'] == student_type]['variance']
        if len(type_variances) > 0:
            print(f"    差值范围: {type_variances.min():.3f} - {type_variances.max():.3f}")
            print(f"    平均差值: {type_variances.mean():.3f}")

    # 4. Split the rows by group
    specialized_students = df_classified[df_classified['student_type'] == '偏科型'].copy()
    normal_students = df_classified[df_classified['student_type'] == '普通型'].copy()
    balanced_students = df_classified[df_classified['student_type'] == '均衡型'].copy()

    # Remove the temporary columns (the subsets are copies, so in-place is safe)
    for df_subset in [specialized_students, normal_students, balanced_students]:
        df_subset.drop(['student_type', 'variance'], axis=1, inplace=True)

    # 5. Save; os.path.join avoids a doubled separator when output_dir
    #    already ends with '/', and a root-relative path when it is empty.
    specialized_output_path = os.path.join(output_dir, "pk.csv")
    normal_output_path = os.path.join(output_dir, "pt.csv")
    balanced_output_path = os.path.join(output_dir, "ph.csv")

    specialized_students.to_csv(specialized_output_path, index=False)
    normal_students.to_csv(normal_output_path, index=False)
    balanced_students.to_csv(balanced_output_path, index=False)

    print(f"\n数据已保存:")
    print(f"偏科型学生数据: {specialized_output_path} ({len(specialized_students)} 条记录, {specialized_students['uid'].nunique()} 个学生)")
    print(f"普通型学生数据: {normal_output_path} ({len(normal_students)} 条记录, {normal_students['uid'].nunique()} 个学生)")
    print(f"均衡型学生数据: {balanced_output_path} ({len(balanced_students)} 条记录, {balanced_students['uid'].nunique()} 个学生)")

    # 6. A few illustrative figures
    print("\n示例分析:")
    # Student with the largest gap
    top_student = student_summary.nlargest(1, 'variance').iloc[0]
    print(f"差值最大的学生: UID={top_student['uid']}, 差值={top_student['variance']:.3f}, 类型={top_student['student_type']}")

    # Student with the smallest gap
    bottom_student = student_summary.nsmallest(1, 'variance').iloc[0]
    print(f"差值最小的学生: UID={bottom_student['uid']}, 差值={bottom_student['variance']:.3f}, 类型={bottom_student['student_type']}")

    # Distribution of the gap value across students
    print(f"\n差值分布统计:")
    print(f"最小值: {student_summary['variance'].min():.3f}")
    print(f"25%分位数: {student_summary['variance'].quantile(0.25):.3f}")
    print(f"中位数: {student_summary['variance'].median():.3f}")
    print(f"75%分位数: {student_summary['variance'].quantile(0.75):.3f}")
    print(f"最大值: {student_summary['variance'].max():.3f}")
    print(f"标准差: {student_summary['variance'].std():.3f}")

    return specialized_students, normal_students, balanced_students

# Usage example
if __name__ == "__main__":
    # Input CSV path and output directory
    input_file = "/root/autodl-tmp/pykt_self_version/data/assist2009/old_test_question_window_sequences.csv"
    output_directory = "/root/autodl-tmp/pykt_self_version/data/assist2009/"
    
    # Run the three-way classification and write the result files
    specialized_df, normal_df, balanced_df = main(input_file, output_directory)
    
    print("\n处理完成！")

正在读取CSV文件...
共读取到 39807 条记录
共有 776 个不同的学生

正在计算学生知识点掌握差异度...

分类结果统计:
总记录数: 39807
总学生数: 776

  偏科型:
    记录数: 14880 (37.4%)
    学生数: 24 (3.1%)
    差值范围: 0.902 - 1.000
    平均差值: 0.968

  普通型:
    记录数: 11674 (29.3%)
    学生数: 121 (15.6%)
    差值范围: 0.522 - 0.893
    平均差值: 0.696

  均衡型:
    记录数: 13253 (33.3%)
    学生数: 631 (81.3%)
    差值范围: 0.000 - 0.522
    平均差值: 0.108

数据已保存:
偏科型学生数据: /root/autodl-tmp/pykt_self_version/data/assist2009//pk.csv (14880 条记录, 24 个学生)
普通型学生数据: /root/autodl-tmp/pykt_self_version/data/assist2009//pt.csv (11674 条记录, 121 个学生)
均衡型学生数据: /root/autodl-tmp/pykt_self_version/data/assist2009//ph.csv (13253 条记录, 631 个学生)

示例分析:
差值最大的学生: UID=868, 差值=1.000, 类型=偏科型
差值最小的学生: UID=1912, 差值=0.000, 类型=均衡型

差值分布统计:
最小值: 0.000
25%分位数: 0.000
中位数: 0.000
75%分位数: 0.431
最大值: 1.000
标准差: 0.294

处理完成！
