In [4]:
import json, pathlib

# Load the nested log file (swap in your own file name). Decode explicitly as
# UTF-8 so the result does not depend on the platform's default encoding,
# matching how the other cells in this notebook open their JSON files.
data = json.loads(pathlib.Path('log_data.json').read_text(encoding='utf-8'))

# Collect every user_id (one per top-level record) and every exer_id
# (one per entry of each record's "logs" list).
user_ids = [item['user_id'] for item in data]
exer_ids = [log['exer_id'] for item in data for log in item['logs']]

print('学生索引  min =', min(user_ids), ' max =', max(user_ids))
print('题目索引  min =', min(exer_ids), ' max =', max(exer_ids))

学生索引  min = 1  max = 4163
题目索引  min = 1  max = 17746


In [3]:
import json, pathlib

# Load the flat training file (swap in your own file name). Decode explicitly
# as UTF-8 so the result does not depend on the platform's default encoding,
# matching how the other cells in this notebook open their JSON files.
data = json.loads(pathlib.Path('train_set.json').read_text(encoding='utf-8'))

# Collect the user_id and exer_id of every record (one record per answer log).
user_ids = [item['user_id'] for item in data]
exer_ids = [item['exer_id'] for item in data]

print('学生索引  min =', min(user_ids), ' max =', max(user_ids))
print('题目索引  min =', min(exer_ids), ' max =', max(exer_ids))

学生索引  min = 1  max = 4128
题目索引  min = 1  max = 17746


In [9]:
import json
from pathlib import Path

def count_unique_knowledge_codes(json_file: str | Path) -> int:
    """
    统计 JSON 文件中不重复的知识点（knowledge_code）数量。
    
    参数
    ----
    json_file : str | pathlib.Path
        待读取的 JSON 文件路径。
    
    返回
    ----
    int
        不重复的知识点数量。
    """
    # 读取 JSON
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    # 用 set 去重
    codes = set()
    for record in data:
        # 兼容 knowledge_code 可能是单 int 或 list[int] 的情况
        kc = record.get("knowledge_code", [])
        if isinstance(kc, int):
            codes.add(kc)
        else:
            codes.update(kc)

    return len(codes)


# Example usage
if __name__ == "__main__":
    json_path = "train_set.json"  # replace with the path to your own file
    print("知识点数量：", count_unique_knowledge_codes(json_file=json_path))

知识点数量： 122


In [12]:
import json
from pathlib import Path

def count_unique_kcs(json_file: str | Path) -> int:
    with open(json_file, encoding='utf-8') as f:
        data = json.load(f)

    codes = set()
    for rec in data:
        kc = rec.get("knowledge_code", [])
        if isinstance(kc, int):
            codes.add(kc)
        else:
            codes.update(kc)
    return len(codes)

# Report the number of distinct knowledge codes in the training split
if __name__ == "__main__":
    print("知识点数量：", count_unique_kcs(json_file="train_set.json"))

知识点数量： 122


In [11]:
import json
from pathlib import Path

def count_unique_knowledge_codes_nested(json_file: str | Path) -> int:
    """
    统计嵌套 JSON 中所有 logs 里不重复的知识点（knowledge_code）数量。
    顶层为列表，每项含 logs 列表。
    """
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    codes = set()
    for top_item in data:          # 遍历顶层列表
        for log in top_item.get("logs", []):
            kc = log.get("knowledge_code", [])
            if isinstance(kc, int):
                codes.add(kc)
            else:
                codes.update(kc)
    return len(codes)


# Example usage
if __name__ == "__main__":
    json_path = "test_set.json"  # replace with the path to your own file
    print("知识点数量：", count_unique_knowledge_codes_nested(json_file=json_path))

知识点数量： 123


In [7]:
from pathlib import Path
import numpy as np

def q_matrix_shape(txt_file: str | Path):
    """
    返回 Q 矩阵维度 (n_items, n_kcs)
    支持空格/制表符分隔或纯连续 0/1 两种格式
    """
    with open(txt_file, encoding='utf-8') as f:
        lines = [ln.rstrip('\n') for ln in f if ln.strip()]

    # 尝试空格/制表符分隔
    first_split = lines[0].split()
    if len(first_split) > 1:          # 空格分隔
        n_items = len(lines)
        n_kcs   = len(first_split)
    else:                             # 连续 0/1 字符串
        n_items = len(lines)
        n_kcs   = len(lines[0])
        # 简单校验：每行长度应相同
        if any(len(ln) != n_kcs for ln in lines):
            raise ValueError("各行长度不一致，请检查文件格式")
    return n_items, n_kcs


# Example
if __name__ == "__main__":
    txt_path = "q.txt"          # replace with your own file
    print("Q 矩阵维度：", q_matrix_shape(txt_file=txt_path))

Q 矩阵维度： (17746, 123)


In [14]:
import json
from pathlib import Path

def read_kc_set(file_name):
    """Return the set of every knowledge_code value that appears in the file.

    Two layouts are recognised. If the JSON document is a non-empty list
    whose first element contains a ``logs`` key, the codes are read from
    each element's ``logs`` entries; otherwise each top-level element is
    treated as a record carrying ``knowledge_code`` directly. A
    ``knowledge_code`` that is not a list is counted as a single code.
    """
    with open(file_name, encoding='utf-8') as fh:
        payload = json.load(fh)

    def _as_list(value):
        # Wrap a lone code so it can be merged the same way as a list of codes.
        return value if isinstance(value, list) else [value]

    # Layout detection looks only at the first element of a non-empty list.
    nested = isinstance(payload, list) and bool(payload) and 'logs' in payload[0]
    if nested:
        entries = (log for student in payload for log in student['logs'])
    else:
        entries = payload

    found = set()
    for entry in entries:
        found.update(_as_list(entry['knowledge_code']))
    return found
# Knowledge codes seen in the full log and in each of the three splits
original = read_kc_set('log_data.json')
train_set = read_kc_set('train_set.json')
val_set = read_kc_set('val_set.json')
test_set = read_kc_set('test_set.json')

# Size of each set, then the combined coverage of the splits and any code
# from the full log that none of the splits contains.
print('original :', len(original))
print('train_set:', len(train_set))
print('val_set  :', len(val_set))
print('test_set :', len(test_set))
print('union    :', len(train_set.union(val_set, test_set)))
print('missing  :', original.difference(train_set, val_set, test_set))

original : 123
train_set: 122
val_set  : 119
test_set : 123
union    : 123
missing  : set()


In [1]:
import json

# Load log_data.json; each top-level record carries a "logs" list
with open('log_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Total number of answer logs summed over all top-level records
total_exercises = sum(len(user["logs"]) for user in data)

print(f"总做题记录数: {total_exercises}")
print(f"用户数量: {len(data)}")

总做题记录数: 278868
用户数量: 4163
