In [None]:
# Download the Cognitive Diagnosis Benchmark Datasets (CDBD)
# Fetches the "assistment-2009-2010-skill" dataset through EduData, passing "../../data"
# as the second argument (presumably the target directory — confirm against EduData docs).
# NOTE(review): the preprocessing cell below reads
# ../../data/2009_skill_builder_data_corrected/skill_builder_data_corrected.csv, so this
# call is expected to leave the CSV at that location — verify after download.
from EduData import get_data

get_data("assistment-2009-2010-skill", "../../data")


In [1]:
# Preprocess the original data: keep each student's first-attempt response per problem and select the 20 most frequent knowledge concepts
import numpy as np
import pandas as pd
import random
import json
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Parse the raw CSV and keep, for every (student, problem) pair, only the attempt with
# the smallest order id (treated here as the first attempt).
#
# Produced by this step:
#   origin_data     - list of [order, stu_id, prob_id, correct, skill_id] records, one per
#                     (student, problem) pair, in order of first appearance in the file
#   non_repeat_data - stu_id -> prob_id -> the very same record object held in origin_data
#   skill_count     - skill_id -> number of kept records carrying that skill
origin_data = []
non_repeat_data = defaultdict(dict)
skill_count = defaultdict(int)
with open("../../data/2009_skill_builder_data_corrected/skill_builder_data_corrected.csv", encoding='utf-8',  errors='ignore') as file:
    next(file, None)  # skip the header line; tolerate an empty file
    for fileline in file:
        # Plain comma split: only columns 0..16 are read below.
        # NOTE(review): assumes none of those columns contains a quoted comma — confirm.
        row = fileline.split(',')
        order = int(row[0])
        stu_id = int(row[2])
        prob_id = int(row[4])
        correct = int(row[6])
        answer_type = row[10]
        try:
            skill_id = int(row[16])
        except (ValueError, IndexError):
            # Row has no single integer skill id (empty / malformed / missing column).
            continue
        if answer_type == 'open_response':
            continue

        his_att = non_repeat_data[stu_id].get(prob_id)
        if his_att is None:
            # First time this (student, problem) pair is seen. The SAME list object is
            # stored in both containers so a later in-place update is visible in both.
            record = [order, stu_id, prob_id, correct, skill_id]
            origin_data.append(record)
            non_repeat_data[stu_id][prob_id] = record
            skill_count[skill_id] += 1
        elif his_att[0] > order:
            # An earlier attempt than the stored one: overwrite the record in place.
            # This keeps origin_data and non_repeat_data in sync (no stale copy, no
            # linear origin_data.index() scan) so further repeats are compared against
            # the current best attempt instead of being silently discarded.
            if his_att[4] != skill_id:
                # Keep skill_count consistent with the records actually kept.
                skill_count[his_att[4]] -= 1
                skill_count[skill_id] += 1
            his_att[:] = [order, stu_id, prob_id, correct, skill_id]

# Restrict the log to the `know_num` most frequent skills, give students and problems
# dense 0-based ids in order of first appearance, and build a one-hot Q-matrix row for
# every problem (from the skill of the first kept record of that problem).
know_num = 20  # number of most frequent knowledge concepts to keep
skill_20 = sorted(skill_count, key=skill_count.get, reverse=True)[:know_num]
skill_col = {kept_skill: pos for pos, kept_skill in enumerate(skill_20)}  # skill id -> Q-matrix column
one_hot = np.eye(know_num)  # row i is the one-hot vector for column i

q_m = []  # Q-matrix rows, one per problem, in problem-id order
stu_dict, prob_dict = {}, {}
data = []
for order, stu, prob, answer, skill in origin_data:
    col = skill_col.get(skill)
    if col is None:
        # Skill is not among the most frequent ones: drop the interaction.
        continue
    stu_dict.setdefault(stu, len(stu_dict))
    if prob not in prob_dict:
        prob_dict[prob] = len(prob_dict)
        q_m.append(one_hot[col])
    data.append([stu_dict[stu], prob_dict[prob], answer, order])

stu_idx, prob_idx = len(stu_dict), len(prob_dict)
q_m = np.array(q_m)
data.sort(key=lambda rec: rec[3])  # chronological order by order id (stable sort)
print(q_m.shape, len(data))
print("number of students is %d, number of problems is %d" % (stu_idx, prob_idx))

(7577, 20) 167068
number of students is 3559, number of problems is 7577


In [2]:
# Cut every student's log (already sorted by order id) into `time_window_num`
# consecutive windows and split train/test data:
#   - windows 0 .. time_window_num-2 go entirely to training;
#   - the last window (which also absorbs the remainder of the integer division) is
#     split record by record, ~50/50, between training and test.
# train_logs is a list with one shuffled list of records per window; test_logs is flat.
time_window_num = 7

# Dedicated, seeded generator: the split written to disk is identical on every run and
# the global `random` / `np.random` state is left untouched.
split_seed = 0
split_rng = random.Random(split_seed)

stu_data = defaultdict(list)
# The 4th field (order id) is only needed for the sort done earlier; it is unpacked into
# a throwaway name so it cannot clobber an imported `time` module in the kernel.
for stu, prob, rating, _order in data:
    stu_data[int(stu)].append({'user_id': int(stu), 'item_id': int(prob), 'score': rating})

# split dataset
train_logs, test_logs = [], []
for t in range(time_window_num):
    t_train = []
    for stu_logs in stu_data.values():
        # Students with fewer than time_window_num records get split_len == 0, so all
        # of their records land in the last window.
        split_len = len(stu_logs) // time_window_num
        if t != time_window_num - 1:
            t_train += stu_logs[t * split_len:(t + 1) * split_len]
        else:
            for log in stu_logs[t * split_len:]:
                if split_rng.random() < 0.5:
                    t_train.append(log)
                else:
                    test_logs.append(log)
    split_rng.shuffle(t_train)
    train_logs.append(t_train)

with open("../../data/2009_skill_builder_data_corrected/train_data.json", 'w', encoding='utf8') as file:
    json.dump(train_logs, file, indent=4, ensure_ascii=False)
with open("../../data/2009_skill_builder_data_corrected/test_data.json", 'w', encoding='utf8') as file:
    json.dump(test_logs, file, indent=4, ensure_ascii=False)
np.savetxt("../../data/2009_skill_builder_data_corrected/q_m.csv", q_m, delimiter=',', fmt='%d')