This code is heavily based on Edu CDM:</br>
@misc{bigdata2021educdm,
  title={EduCDM},
  author={bigdata-ustc},
  publisher = {GitHub},
  journal = {GitHub repository},
  year = {2021},
  howpublished = {\url{https://github.com/bigdata-ustc/EduCDM}},
}<br></br>
Specifically the presentation of the NCDM model and example as originally proposed in: </br>
@article{wang2022neuralcd,
  title={NeuralCD: A General Framework for Cognitive Diagnosis},
  author={Wang, Fei and Liu, Qi and Chen, Enhong and Huang, Zhenya and Yin, Yu and Wang, Shijin and Su, Yu},
  journal={IEEE Transactions on Knowledge and Data Engineering},
  year={2022},
  publisher={IEEE}
}

# Package Links
EduData: https://pypi.org/project/EduData/
EduCDM: https://pypi.org/project/EduCDM/

In [None]:
!pip --quiet install EduData
!pip --quiet install EduCDM

# Data Wrangling

In [11]:
#use package for easy download of files
from EduData import get_data
get_data("cdbd-a0910", "../data")

downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/item.csv is saved as ..\data\a0910\item.csv


Downloading ..\data\a0910\item.csv 100.00%: 252KB | 252KB

downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/readme.txt is saved as ..\data\a0910\readme.txt



Downloading ..\data\a0910\readme.txt 100.00%: 86.0B | 86.0B

downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/test.csv is saved as ..\data\a0910\test.csv



Downloading ..\data\a0910\test.csv 100.00%: 792KB | 792KB

downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/train.csv is saved as ..\data\a0910\train.csv



Downloading ..\data\a0910\train.csv 100.00%: 2.22MB | 2.22MB

downloader, INFO http://base.ustc.edu.cn/data/cdbd/a0910/valid.csv is saved as ..\data\a0910\valid.csv



Downloading ..\data\a0910\valid.csv 100.00%: 363KB | 363KB


'..\\data\\a0910'

In [12]:
# Load the data from files
import pandas as pd

train_data = pd.read_csv("../data/a0910/train.csv")
valid_data = pd.read_csv("../data/a0910/valid.csv")
test_data = pd.read_csv("../data/a0910/test.csv")
df_item = pd.read_csv("../data/a0910/item.csv")
item2knowledge = {}
knowledge_set = set()
for i, s in df_item.iterrows():
    item_id, knowledge_codes = s['item_id'], list(set(eval(s['knowledge_code'])))
    item2knowledge[item_id] = knowledge_codes
    knowledge_set.update(knowledge_codes)

train_data.head(5)

Unnamed: 0,user_id,item_id,score
0,1615,12977,1
1,782,13124,0
2,1084,16475,0
3,593,8690,0
4,127,14225,1


In [13]:
# Get basic data info for model initialization
import numpy as np
user_n = np.max(train_data['user_id'])
item_n = np.max([np.max(train_data['item_id']), np.max(valid_data['item_id']), np.max(test_data['item_id'])])
knowledge_n = np.max(list(knowledge_set))

user_n, item_n, knowledge_n

(4128, 17746, 123)

In [14]:
# batch_size is set to 32

import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32
def transform(user, item, item2knowledge, score, batch_size):
    knowledge_emb = torch.zeros((len(item), knowledge_n))
    for idx in range(len(item)):
        knowledge_emb[idx][np.array(item2knowledge[item[idx]]) - 1] = 1.0

    data_set = TensorDataset(
        torch.tensor(user, dtype=torch.int64) - 1,  # (1, user_n) to (0, user_n-1)
        torch.tensor(item, dtype=torch.int64) - 1,  # (1, item_n) to (0, item_n-1)
        knowledge_emb,
        torch.tensor(score, dtype=torch.float32)
    )
    return DataLoader(data_set, batch_size=batch_size, shuffle=True)


train_set, valid_set, test_set = [
    transform(data["user_id"], data["item_id"], item2knowledge, data["score"], batch_size)
    for data in [train_data, valid_data, test_data]
]

train_set, valid_set, test_set

(<torch.utils.data.dataloader.DataLoader at 0x16940f6e310>,
 <torch.utils.data.dataloader.DataLoader at 0x1694107c850>,
 <torch.utils.data.dataloader.DataLoader at 0x16940ea8d30>)

# Building the NCDM Model

In [15]:
import logging
logging.getLogger().setLevel(logging.INFO)

In [16]:
#NCDM required imports
import logging
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

In [17]:
# showing the NCDM class for understanding

#from EduCDM import CDM
class CDM(object):
    def __init__(self, *args, **kwargs) -> ...:
        pass

    def train(self, *args, **kwargs) -> ...:
        raise NotImplementedError

    def eval(self, *args, **kwargs) -> ...:
        raise NotImplementedError

    def save(self, *args, **kwargs) -> ...:
        raise NotImplementedError

    def load(self, *args, **kwargs) -> ...:
        raise NotImplementedError

# from EduCDM import NCDM
class PosLinear(nn.Linear):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        weight = 2 * F.relu(1 * torch.neg(self.weight)) + self.weight
        return F.linear(input, weight, self.bias)


class Net(nn.Module):

    def __init__(self, knowledge_n, exer_n, student_n):
        self.knowledge_dim = knowledge_n
        self.exer_n = exer_n
        self.emb_num = student_n
        self.stu_dim = self.knowledge_dim
        self.prednet_input_len = self.knowledge_dim
        self.prednet_len1, self.prednet_len2 = 512, 256  # changeable

        super(Net, self).__init__()

        # prediction sub-net
        self.student_emb = nn.Embedding(self.emb_num, self.stu_dim)
        self.k_difficulty = nn.Embedding(self.exer_n, self.knowledge_dim)
        self.e_difficulty = nn.Embedding(self.exer_n, 1)
        self.prednet_full1 = PosLinear(self.prednet_input_len, self.prednet_len1)
        self.drop_1 = nn.Dropout(p=0.5)
        self.prednet_full2 = PosLinear(self.prednet_len1, self.prednet_len2)
        self.drop_2 = nn.Dropout(p=0.5)
        self.prednet_full3 = PosLinear(self.prednet_len2, 1)

        # initialize
        for name, param in self.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param)

    def forward(self, stu_id, input_exercise, input_knowledge_point):
        # before prednet
        stu_emb = self.student_emb(stu_id)
        stat_emb = torch.sigmoid(stu_emb)
        k_difficulty = torch.sigmoid(self.k_difficulty(input_exercise))
        e_difficulty = torch.sigmoid(self.e_difficulty(input_exercise))  # * 10
        # prednet
        input_x = e_difficulty * (stat_emb - k_difficulty) * input_knowledge_point
        input_x = self.drop_1(torch.sigmoid(self.prednet_full1(input_x)))
        input_x = self.drop_2(torch.sigmoid(self.prednet_full2(input_x)))
        output_1 = torch.sigmoid(self.prednet_full3(input_x))

        return output_1.view(-1)

class NCDM(CDM):
    '''Neural Cognitive Diagnosis Model'''

    def __init__(self, knowledge_n, exer_n, student_n):
        super(NCDM, self).__init__()
        self.ncdm_net = Net(knowledge_n, exer_n, student_n)

    def train(self, train_data, test_data=None, epoch=10, device="cpu", lr=0.002, silence=False):
        self.ncdm_net = self.ncdm_net.to(device)
        self.ncdm_net.train()
        loss_function = nn.BCELoss()
        optimizer = optim.Adam(self.ncdm_net.parameters(), lr=lr)
        for epoch_i in range(epoch):
            epoch_losses = []
            batch_count = 0
            for batch_data in tqdm(train_data, "Epoch %s" % epoch_i):
                batch_count += 1
                user_id, item_id, knowledge_emb, y = batch_data
                user_id: torch.Tensor = user_id.to(device)
                item_id: torch.Tensor = item_id.to(device)
                knowledge_emb: torch.Tensor = knowledge_emb.to(device)
                y: torch.Tensor = y.to(device)
                pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb)
                loss = loss_function(pred, y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_losses.append(loss.mean().item())

            print("[Epoch %d] average loss: %.6f" % (epoch_i, float(np.mean(epoch_losses))))

            if test_data is not None:
                auc, accuracy = self.eval(test_data, device=device)
                print("[Epoch %d] auc: %.6f, accuracy: %.6f" % (epoch_i, auc, accuracy))

    def eval(self, test_data, device="cpu"):
        self.ncdm_net = self.ncdm_net.to(device)
        self.ncdm_net.eval()
        y_true, y_pred = [], []
        for batch_data in tqdm(test_data, "Evaluating"):
            user_id, item_id, knowledge_emb, y = batch_data
            user_id: torch.Tensor = user_id.to(device)
            item_id: torch.Tensor = item_id.to(device)
            knowledge_emb: torch.Tensor = knowledge_emb.to(device)
            pred: torch.Tensor = self.ncdm_net(user_id, item_id, knowledge_emb)
            y_pred.extend(pred.detach().cpu().tolist())
            y_true.extend(y.tolist())

        return roc_auc_score(y_true, y_pred), accuracy_score(y_true, np.array(y_pred) >= 0.5)

    def save(self, filepath):
        torch.save(self.ncdm_net.state_dict(), filepath)
        logging.info("save parameters to %s" % filepath)

    def load(self, filepath):
        self.ncdm_net.load_state_dict(torch.load(filepath))  # , map_location=lambda s, loc: s
        logging.info("load parameters from %s" % filepath)

# Model Training

In [18]:
torch.cuda.is_available()

False

In [19]:
device_str = "cpu"
if torch.cuda.is_available():
    device_str = "cuda"

cdm = NCDM(knowledge_n, item_n, user_n)
cdm.train(train_set, valid_set, epoch=3, device=device_str)
cdm.save("ncdm.snapshot")

Epoch 0: 100%|█████████████████████████████████████████████████████████████████████| 5815/5815 [01:58<00:00, 49.06it/s]


[Epoch 0] average loss: 0.702545


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:01<00:00, 468.01it/s]


[Epoch 0] auc: 0.716589, accuracy: 0.681208


Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 5815/5815 [01:57<00:00, 49.58it/s]


[Epoch 1] average loss: 0.522655


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:01<00:00, 437.75it/s]


[Epoch 1] auc: 0.743691, accuracy: 0.726236


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 5815/5815 [02:05<00:00, 46.45it/s]


[Epoch 2] average loss: 0.463883


Evaluating: 100%|███████████████████████████████████████████████████████████████████| 801/801 [00:01<00:00, 444.53it/s]
INFO:root:save parameters to ncdm.snapshot


[Epoch 2] auc: 0.750365, accuracy: 0.724713


# Model Testing

In [20]:
cdm.load("ncdm.snapshot")
auc, accuracy = cdm.eval(test_set)
print("auc: %.6f, accuracy: %.6f" % (auc, accuracy))

INFO:root:load parameters from ncdm.snapshot
Evaluating: 100%|█████████████████████████████████████████████████████████████████| 1743/1743 [00:03<00:00, 460.72it/s]

auc: 0.752972, accuracy: 0.724444



