# Knowledge Proficiency Tracing model (KPT) and Exercise-correlated KPT (EKPT)

This notebook shows how to train and use KPT and EKPT.
First, we show how to prepare the data (here we use the assistment-2009-2010-skill dataset).
Then we show how to train KPT and EKPT models and persist their parameters to files.
Finally, we show how to load the parameters from those files and evaluate the models on the test dataset.

The script version can be found in [KPT.py](KPT.py).

## Data Preparation

Before processing the data, we first need to acquire the dataset, as shown in [prepare_dataset.ipynb](prepare_dataset.ipynb).

In [1]:
# Preprocess the original data: take students' first-attempt responses and select the 20 most frequent knowledge concepts
import numpy as np
import pandas as pd
import random
import json
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Parse the raw log file and keep, for every (student, problem) pair, the single
# record with the smallest `order` value (i.e. the student's first attempt).
origin_data = []                     # kept records: [order, stu_id, prob_id, correct, skill_id]
non_repeat_data = defaultdict(dict)  # stu_id -> {prob_id: record currently kept in origin_data}
skill_count = defaultdict(int)       # skill_id -> number of kept records carrying that skill
record_pos = {}                      # (stu_id, prob_id) -> position of the kept record in origin_data
with open("../../data/2009_skill_builder_data_corrected/skill_builder_data_corrected.csv", encoding='utf-8',  errors='ignore') as file:
    next(file)  # skip the header line
    for fileline in file:
        row = fileline.split(',')
        order = int(row[0])
        stu_id = int(row[2])
        prob_id = int(row[4])
        correct = int(row[6])
        answer_type = row[10]
        try:
            skill_id = int(row[16])
        except (ValueError, IndexError):
            # Rows without a usable integer skill id are skipped. Only the
            # parsing errors are caught so that real bugs are not hidden.
            continue
        if answer_type == 'open_response':
            continue

        record = [order, stu_id, prob_id, correct, skill_id]
        kept = non_repeat_data[stu_id].get(prob_id)
        if kept is None:
            # First time this student is seen on this problem.
            record_pos[(stu_id, prob_id)] = len(origin_data)
            origin_data.append(record)
            skill_count[skill_id] += 1
            non_repeat_data[stu_id][prob_id] = record
        elif kept[0] > order:
            # An earlier attempt showed up later in the file: swap it in.
            # The position map makes this O(1), and updating non_repeat_data
            # keeps the bookkeeping in sync so further, even earlier attempts
            # are still compared against the record that is actually kept.
            origin_data[record_pos[(stu_id, prob_id)]] = record
            non_repeat_data[stu_id][prob_id] = record
            skill_count[kept[4]] -= 1
            skill_count[skill_id] += 1

know_num = 20  # 20 most frequent knowledge
# Skill ids ordered from most to least frequent, truncated to the top `know_num`.
skill_20 = sorted(skill_count, key=skill_count.get, reverse=True)[:know_num]

q_m = []        # Q-matrix rows, one per problem, in order of first appearance
stu_dict = {}   # original student id -> contiguous index
prob_dict = {}  # original problem id -> contiguous index
data = []       # [student index, problem index, answer, order]

for log_order, raw_stu, raw_prob, log_answer, raw_skill in origin_data:
    # Only logs whose skill is among the selected ones are kept.
    if raw_skill not in skill_20:
        continue
    skill_pos = skill_20.index(raw_skill)

    stu_dict.setdefault(raw_stu, len(stu_dict))

    if raw_prob not in prob_dict:
        prob_dict[raw_prob] = len(prob_dict)
        # One-hot row marking the skill of the log that introduced this problem.
        one_hot = np.zeros(shape=know_num)
        one_hot[skill_pos] = 1
        q_m.append(one_hot)

    data.append([stu_dict[raw_stu], prob_dict[raw_prob], log_answer, log_order])

stu_idx, prob_idx = len(stu_dict), len(prob_dict)
q_m = np.array(q_m)
# Order the logs by their `order` value (stable sort, ties keep insertion order).
data.sort(key=lambda log: log[3])
print(q_m.shape, len(data))
print("number of students is %d, number of problems is %d" % (stu_idx, prob_idx))

(7577, 20) 167068
number of students is 3559, number of problems is 7577


In [2]:
# Preprocess data into multiple time windows, split train/test data
time_window_num = 7
SPLIT_SEED = 0

# The split below draws from both NumPy's and the stdlib's global generators;
# seeding them makes the written train/test files reproducible across runs.
np.random.seed(SPLIT_SEED)
random.seed(SPLIT_SEED)

# Group the logs per student; `data` was sorted by `order` before this cell,
# so each student's list is in ascending `order`.
stu_data = defaultdict(list)
for stu, prob, rating, _order in data:
    stu_data[int(stu)].append({'user_id': int(stu), 'item_id': int(prob), 'score': rating})

# split dataset
train_logs, test_logs = [], []
for t in range(time_window_num):
    t_train = []
    for stu_logs in stu_data.values():
        # Every window but the last receives `split_len` consecutive logs of the student.
        split_len = len(stu_logs) // time_window_num
        if t != time_window_num - 1:
            t_train += stu_logs[t * split_len:(t + 1) * split_len]
        else:
            # The last window takes all remaining logs and splits them
            # at random (roughly half each) between training and testing.
            for log in stu_logs[t * split_len:]:
                if np.random.random() < 0.5:
                    t_train.append(log)
                else:
                    test_logs.append(log)
    random.shuffle(t_train)
    train_logs.append(t_train)

with open("../../data/2009_skill_builder_data_corrected/train_data.json", 'w', encoding='utf8') as file:
    json.dump(train_logs, file, indent=4, ensure_ascii=False)
with open("../../data/2009_skill_builder_data_corrected/test_data.json", 'w', encoding='utf8') as file:
    json.dump(test_logs, file, indent=4, ensure_ascii=False)
np.savetxt("../../data/2009_skill_builder_data_corrected/q_m.csv", q_m, delimiter=',', fmt='%d')

# train_logs[0][0] is the first train log in the first time window
print(train_logs[0][0], test_logs[0])

{'user_id': 393, 'item_id': 2857, 'score': 1} {'user_id': 21, 'item_id': 5152, 'score': 1}


In [3]:
# Load the data from files

# Q matrix
q_m = np.loadtxt("../../data/2009_skill_builder_data_corrected/q_m.csv", dtype=int, delimiter=",")
prob_num, know_num = q_m.shape[0], q_m.shape[1]

# training data (a list with one list of logs per time window)
with open("../../data/2009_skill_builder_data_corrected/train_data.json", encoding='utf-8') as file:
    train_set = json.load(file)
time_window_num = len(train_set)

# testing data
with open("../../data/2009_skill_builder_data_corrected/test_data.json", encoding='utf-8') as file:
    test_set = json.load(file)

# Number of students = largest user_id + 1. The maximum is taken over every
# training window AND the test set: students with fewer logs than
# `time_window_num` contribute nothing to the first window, so looking at
# train_set[0] alone can under-count and leave later ids out of range.
stu_num = max(
    (log['user_id'] for logs in train_set + [test_set] for log in logs),
    default=-1,
) + 1

## Training and Persistence

In [4]:
import logging

# Let INFO-level records through the root logger (the save/load messages
# shown in the outputs below are emitted at that level).
logging.getLogger().setLevel("INFO")

### KPT

In [5]:
from EduKTM import KPT

# Training settings for the plain KPT run, gathered in one place.
kpt_train_kwargs = dict(
    epoch=2,
    lr=0.001,
    lr_b=0.0001,
    epsilon=1e-3,
    init_method='mean',
)

# Build the model in 'KPT' mode, fit it on the windowed training logs and
# write the learned parameters to disk.
cdm = KPT('KPT', q_m, stu_num, prob_num, know_num, time_window_num=time_window_num)
cdm.train(train_set, **kpt_train_kwargs)
cdm.save("kpt.params")

INFO:root:save parameters to kpt.params


### EKPT

In [6]:
# Training settings for the EKPT run, gathered in one place.
ekpt_train_kwargs = dict(
    epoch=2,
    lr=0.001,
    lr_b=0.0001,
    epsilon=1e-3,
    init_method='mean',
)

# Same class as before, this time constructed in 'EKPT' mode; fit it on the
# windowed training logs and write the learned parameters to disk.
cdm2 = KPT('EKPT', q_m, stu_num, prob_num, know_num, time_window_num=time_window_num)
cdm2.train(train_set, **ekpt_train_kwargs)
cdm2.save("ekpt.params")

INFO:root:save parameters to ekpt.params


## Loading and Testing

### KPT

In [7]:
# Reload the parameters that the KPT training cell saved to "kpt.params",
# then evaluate on the test logs; the result of eval() is unpacked as
# (RMSE, MAE) and printed with six decimals.
cdm.load("kpt.params")
rmse, mae = cdm.eval(test_set)
print("For KPT, RMSE: %.6f, MAE: %.6f" % (rmse, mae))

INFO:root:load parameters from kpt.params
evaluating: 100%|██████████████████████████████████████████████████████████████| 16370/16370 [00:02<00:00, 7099.88it/s]


For KPT, RMSE: 0.445781, MAE: 0.379456


### EKPT

In [8]:
# Reload the parameters that the EKPT training cell saved to "ekpt.params",
# then evaluate on the same test logs; the result of eval() is unpacked as
# (RMSE, MAE) and printed with six decimals.
cdm2.load("ekpt.params")
rmse2, mae2 = cdm2.eval(test_set)
print("For EKPT, RMSE: %.6f, MAE: %.6f" % (rmse2, mae2))

INFO:root:load parameters from ekpt.params
evaluating: 100%|██████████████████████████████████████████████████████████████| 16370/16370 [00:02<00:00, 7576.03it/s]


For EKPT, RMSE: 0.446698, MAE: 0.379953
