In [7]:
# 单 cell：自检 True/False 转换 + 知识点准确率统计 + correct_count 交叉校验
import os, re
import numpy as np
import pandas as pd

# ===== Configuration =====
# Path of the input CSV that is read below.
CSV_PATH = "Qwen3-14B_trainset.csv"
# Name of the column holding the per-row correctness flag.
ISCORRECT_COL = "is_correct"
# Name of the column holding the delimited knowledge-point labels.
KPS_COL = "knowledge_points"
OUTPUT_PATH = None     # None -> derived from CSV_PATH as "<base>_kp_accuracy.csv"
MIN_VALID = 0          # when > 0, keep only knowledge points with valid_count >= MIN_VALID

# ===== 工具函数 =====
# Textual spellings recognised as True / False. Values are compared after
# stripping surrounding whitespace and lower-casing.
TRUE_SET  = {"true","t","yes","y","1","正确","是","对","√","✔","✓"}
FALSE_SET = {"false","f","no","n","0","错误","否","错","×","✗"}

def to_bool(val):
    """Convert one raw correctness cell to ``True``, ``False`` or ``np.nan``.

    Rules, applied in order:

    1. Native booleans (Python or NumPy) are returned as a Python ``bool``.
    2. Missing values (anything ``pd.isna`` reports) give ``np.nan``.
    3. Text that parses as a number is thresholded at 0.5
       (``"1"``, ``"1.0"`` -> True; ``"0"``, ``"0.0"`` -> False).
       Text that parses to a floating-point NaN (``"nan"``, ``"NaN"``) is
       treated as missing rather than as False.
    4. Otherwise the lower-cased text is looked up in ``TRUE_SET`` and
       ``FALSE_SET``.
    5. Anything unrecognised gives ``np.nan``.
    """
    if isinstance(val, (bool, np.bool_)):
        return bool(val)
    if pd.isna(val):
        return np.nan

    s = str(val).strip()

    # Numeric spellings take priority (handles 1/0/1.0/0.0).
    try:
        num = float(s)
    except ValueError:
        num = None
    if num is not None:
        if num != num:
            # float("nan") compares False against everything; reporting it as
            # False would silently count a missing value as a wrong answer.
            return np.nan
        return num >= 0.5

    low = s.lower()
    if low in TRUE_SET:
        return True
    if low in FALSE_SET:
        return False
    return np.nan

def split_kps(s: str):
    """Split a raw knowledge-point cell into a list of trimmed labels.

    Missing input yields an empty list. Splitting only happens outside of
    any ``()``, ``[]`` or ``{}`` nesting, on:

    * ``,`` ``，`` ``;`` ``；`` ``|`` ``、``
    * ``/`` when it has whitespace on both sides (so ``f/S`` stays intact)

    Whitespace directly after a delimiter is skipped, every label is
    stripped, and empty labels are dropped. A closing bracket with no
    matching opener is kept as ordinary text.
    """
    if pd.isna(s):
        return []

    text = str(s)
    length = len(text)

    openers = '([{'
    closers = ')]}'
    separators = ',，;；|、'
    depth = [0, 0, 0]            # nesting level for (), [], {} respectively

    labels = []
    current = []

    def emit():
        label = ''.join(current).strip()
        if label:
            labels.append(label)
        current.clear()

    pos = 0
    while pos < length:
        ch = text[pos]

        # Track bracket nesting; an unmatched closer never goes below zero.
        if ch in openers:
            depth[openers.index(ch)] += 1
        elif ch in closers:
            slot = closers.index(ch)
            if depth[slot] > 0:
                depth[slot] -= 1

        # Decide whether this character acts as a delimiter.
        is_delim = False
        if not any(depth):
            if ch in separators:
                is_delim = True
            elif ch == '/':
                # A slash only separates when surrounded by whitespace.
                space_before = pos > 0 and text[pos - 1].isspace()
                space_after = pos + 1 < length and text[pos + 1].isspace()
                is_delim = space_before and space_after

        if not is_delim:
            current.append(ch)
            pos += 1
            continue

        emit()
        pos += 1
        # Swallow whitespace that immediately follows the delimiter.
        while pos < length and text[pos].isspace():
            pos += 1

    emit()
    return labels

def read_csv_with_fallback(path: str) -> pd.DataFrame:
    """Read a CSV file, trying several text encodings in turn.

    Encodings are attempted in the order utf-8, utf-8-sig, gb18030, latin1
    and the first successful parse is returned.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame.

    Raises:
        RuntimeError: If the file cannot be opened, or if every encoding
            fails. The underlying exception is attached as ``__cause__``.
    """
    last_err = None
    # pandas' default encoding is already utf-8, so a separate "default"
    # attempt would only repeat the first try.
    for enc in ("utf-8", "utf-8-sig", "gb18030", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except OSError as e:
            # Missing/unreadable file: no other encoding can help, stop now.
            raise RuntimeError(f"读取 CSV 失败：{e}") from e
        except Exception as e:
            # Deliberately broad: any decode/parse failure moves on to the
            # next candidate encoding.
            last_err = e
    # NOTE: latin1 can decode any byte sequence, so reaching this point means
    # the failure is a parsing problem rather than an encoding one.
    raise RuntimeError(f"读取 CSV 失败：{last_err}") from last_err

# ===== Load and convert =====
# Explicit check instead of `assert`: asserts are removed under `python -O`,
# which would let a missing file slip through to a less clear error later.
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"未找到输入文件：{CSV_PATH}")
df = read_csv_with_fallback(CSV_PATH)

# Fail fast if either required column is absent, before printing diagnostics.
_missing_cols = [c for c in (ISCORRECT_COL, KPS_COL) if c not in df.columns]
if _missing_cols:
    raise KeyError(f"CSV is missing required column(s): {_missing_cols}")

# Inspect the raw values and dtype of the correctness column.
print("【原始 is_correct dtype】", df[ISCORRECT_COL].dtype)
print("【原始 is_correct 前 10 个】\n", df[ISCORRECT_COL].head(10))
print("【原始 is_correct 计数】\n", df[ISCORRECT_COL].astype(str).str.strip().value_counts())

# Normalise every cell to True / False / NaN via to_bool.
df["_iscorrect_bool"] = df[ISCORRECT_COL].apply(to_bool)
print("\n【转换后类型】", df["_iscorrect_bool"].dtype)
print("【转换后计数】\n", df["_iscorrect_bool"].value_counts(dropna=False))
# Row-level accuracy over the whole file, as a percentage.
print("【整体正确率（行级）】", round(df["_iscorrect_bool"].mean()*100, 2), "%")

# ===== Explode knowledge points and group =====
# One output row per (original row, knowledge point) pair.
df["__kps_list__"] = df[KPS_COL].apply(split_kps)
exploded = df.explode("__kps_list__", ignore_index=True)
_kp_nonblank = exploded["__kps_list__"].astype(str).str.strip() != ""
exploded = exploded[_kp_nonblank]

g = exploded.groupby("__kps_list__")["_iscorrect_bool"]
total_count = g.size()
valid_count = g.apply(lambda col: col.notna().sum())
# Method 1: sum the boolean flags per knowledge point.
correct_sum1 = g.sum(min_count=1).fillna(0).astype(float)

# Cross-check, method 2: count the rows whose flag is True directly.
_true_rows = exploded[exploded["_iscorrect_bool"] == True]
correct_sum2 = (
    _true_rows.groupby("__kps_list__")["_iscorrect_bool"]
    .count()
    .reindex(total_count.index, fill_value=0)
    .astype(float)
)

# Compare the two counts; any non-zero difference is reported.
chk = pd.DataFrame({
    "correct_sum1": correct_sum1,
    "correct_sum2": correct_sum2,
})
diff = chk["correct_sum1"].sub(chk["correct_sum2"]).abs()
bad = diff[diff != 0]
if bad.empty:
    print("\n✅ correct_count 两种算法完全一致。")
else:
    print("\n⚠️ 发现 correct_count 两种算法不一致的知识点（应为 0）：")
    print(bad.sort_values(ascending=False).head(20))

# Summary table: one row per knowledge point.
summary = pd.DataFrame({
    "knowledge_point": total_count.index,
    "total_count": total_count.values,
})
summary["valid_count"] = valid_count.reindex(total_count.index).values
summary["correct_count"] = correct_sum1.reindex(total_count.index).values

# Accuracy is only defined where at least one valid sample exists.
_has_valid = summary["valid_count"] > 0
_ratio = summary["correct_count"] / summary["valid_count"]
summary["accuracy"] = np.where(_has_valid, _ratio, np.nan)
summary["accuracy_pct"] = summary["accuracy"].mul(100).round(2)

# Filter, sort, then format.
if MIN_VALID > 0:
    summary = summary[summary["valid_count"] >= MIN_VALID]
summary = summary.sort_values(
    by=["valid_count", "accuracy"],
    ascending=[False, False],
)
summary = summary.reset_index(drop=True)
# Rounding happens after sorting so ordering uses the unrounded accuracy.
summary["accuracy"] = summary["accuracy"].round(4)

# Preview and export
# IPython is only needed for the rich preview. Fall back to print when it is
# not installed, so a plain-script run still reaches the CSV export below
# instead of stopping on ImportError.
try:
    from IPython.display import display
except ImportError:
    display = print
display(summary.head(30))

# Derive the output file name from the input path when none was configured.
if OUTPUT_PATH is None:
    base, ext = os.path.splitext(CSV_PATH)
    OUTPUT_PATH = base + "_kp_accuracy.csv"
# utf-8-sig writes a BOM at the start of the file.
summary.to_csv(OUTPUT_PATH, index=False, encoding="utf-8-sig")
print(f"\n已输出：{OUTPUT_PATH}（共 {len(summary)} 行）")


【原始 is_correct dtype】 bool
【原始 is_correct 前 10 个】
 0    False
1     True
2    False
3    False
4    False
5    False
6    False
7     True
8     True
9     True
Name: is_correct, dtype: bool
【原始 is_correct 计数】
 is_correct
True     662
False    154
Name: count, dtype: int64

【转换后类型】 bool
【转换后计数】
 _iscorrect_bool
True     662
False    154
Name: count, dtype: int64
【整体正确率（行级）】 81.13 %

✅ correct_count 两种算法完全一致。


Unnamed: 0,knowledge_point,total_count,valid_count,correct_count,accuracy,accuracy_pct
0,Conditional probability (multiplication rule),102,102,88.0,0.8627,86.27
1,Standardization to standard normal (Z-score),102,102,64.0,0.6275,62.75
2,Exponential distribution,100,100,89.0,0.89,89.0
3,Binomial distribution,98,98,80.0,0.8163,81.63
4,Normal distribution,98,98,65.0,0.6633,66.33
5,Poisson distribution,86,86,76.0,0.8837,88.37
6,Law of total probability,68,68,58.0,0.8529,85.29
7,Marginal density from joint (integration),60,60,46.0,0.7667,76.67
8,Variance of sum using covariance,56,56,35.0,0.625,62.5
9,Change of variables (univariate),50,50,44.0,0.88,88.0



已输出：Qwen3-14B_trainset_kp_accuracy.csv（共 69 行）
