Commit

Merge branch 'main' into main

weizhehuang0827 committed Dec 4, 2023
2 parents 21b4b24 + ff426dd commit 0802d5a

Showing 18 changed files with 1,072 additions and 1 deletion.
2 changes: 2 additions & 0 deletions AUTHORS.md
@@ -12,4 +12,6 @@

[Weizhe Huang](https://github.com/weizhehuang0827)

[Bihan Xu](https://github.com/xbh0720)

The starred author is the corresponding author.
189 changes: 189 additions & 0 deletions EduKTM/LBKT/LBKT.py
@@ -0,0 +1,189 @@
# coding: utf-8
# 2023/11/21 @ xubihan

from sklearn import metrics
from sklearn.metrics import mean_squared_error
import logging
import torch
import torch.nn as nn
import numpy as np
from .model import Recurrent
from EduKTM import KTM
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def compute_auc(all_target, all_pred):
return metrics.roc_auc_score(all_target, all_pred)


def compute_accuracy(all_target, all_pred):
all_pred[all_pred > 0.5] = 1.0
all_pred[all_pred <= 0.5] = 0.0
return metrics.accuracy_score(all_target, all_pred)


def binary_entropy(target, pred):
loss = target * np.log(np.maximum(1e-10, pred)) \
+ (1.0 - target) * np.log(np.maximum(1e-10, 1.0 - pred))
return np.average(loss) * -1.0


def train_one_epoch(recurrent, optimizer, criterion,
batch_size, Topics_all, Resps_all,
time_factor_all, attempts_factor_all, hints_factor_all):
recurrent.train()
all_pred = []
all_target = []
    # shuffle students and iterate over full batches (a trailing partial batch is dropped)
    n = len(Topics_all) // batch_size
    shuffled_ind = np.arange(len(Topics_all))
    np.random.shuffle(shuffled_ind)
Topics_all = Topics_all[shuffled_ind]
Resps_all = Resps_all[shuffled_ind]
time_factor_all = time_factor_all[shuffled_ind]
attempts_factor_all = attempts_factor_all[shuffled_ind]
hints_factor_all = hints_factor_all[shuffled_ind]

for idx in tqdm(range(n)):
optimizer.zero_grad()

Topics = Topics_all[idx * batch_size: (idx + 1) * batch_size, :]
Resps = Resps_all[idx * batch_size: (idx + 1) * batch_size, :]
time_factor = time_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]
attempts_factor = attempts_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]
hints_factor = hints_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]

input_topics = torch.from_numpy(Topics).long().to(device)
input_resps = torch.from_numpy(Resps).long().to(device)
input_time_factor = torch.from_numpy(time_factor).float().to(device)
input_attempts_factor = torch.from_numpy(
attempts_factor).float().to(device)
input_hints_factor = torch.from_numpy(hints_factor).float().to(device)

y_pred = recurrent(input_topics, input_resps, input_time_factor,
input_attempts_factor, input_hints_factor)

        # evaluate only non-padded positions (topic id 0 marks padding)
        mask = input_topics[:, 1:] > 0
        masked_pred = y_pred[:, 1:][mask]
        masked_truth = input_resps[:, 1:][mask]
        loss = criterion(masked_pred, masked_truth.float()).sum()
loss.backward()
optimizer.step()

masked_pred = masked_pred.detach().cpu().numpy()
masked_truth = masked_truth.detach().cpu().numpy()

all_pred.append(masked_pred)
all_target.append(masked_truth)

all_pred = np.concatenate(all_pred, axis=0)
all_target = np.concatenate(all_target, axis=0)

loss = binary_entropy(all_target, all_pred)
auc = compute_auc(all_target, all_pred)
acc = compute_accuracy(all_target, all_pred)

return loss, auc, acc


def test_one_epoch(recurrent, batch_size, Topics_all, Resps_all,
time_factor_all, attempts_factor_all, hints_factor_all):
recurrent.eval()
all_pred, all_target = [], []
n = len(Topics_all) // batch_size
for idx in range(n):
Topics = Topics_all[idx * batch_size:
(idx + 1) * batch_size, :]
Resps = Resps_all[idx * batch_size:
(idx + 1) * batch_size, :]
time_factor = time_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]
attempts_factor = attempts_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]
hints_factor = hints_factor_all[idx * batch_size:
(idx + 1) * batch_size, :]

input_topics = torch.from_numpy(Topics).long().to(device)
input_resps = torch.from_numpy(Resps).long().to(device)
input_time_factor = torch.from_numpy(time_factor).float().to(device)
input_attempts_factor = torch.from_numpy(attempts_factor)\
.float().to(device)
input_hints_factor = torch.from_numpy(hints_factor)\
.float().to(device)

with torch.no_grad():
y_pred = recurrent(input_topics, input_resps, input_time_factor,
input_attempts_factor, input_hints_factor)

mask = input_topics[:, 1:] > 0
masked_pred = y_pred[:, 1:][mask]
masked_truth = input_resps[:, 1:][mask]

masked_pred = masked_pred.detach().cpu().numpy()
masked_truth = masked_truth.detach().cpu().numpy()

all_pred.append(masked_pred)
all_target.append(masked_truth)

all_pred = np.concatenate(all_pred, axis=0)
all_target = np.concatenate(all_target, axis=0)

loss = binary_entropy(all_target, all_pred)
auc = compute_auc(all_target, all_pred)
rmse = mean_squared_error(all_target, all_pred, squared=False)
acc = compute_accuracy(all_target, all_pred)

return loss, auc, acc, rmse


class LBKT(KTM):
def __init__(self, num_topics, dim_tp, num_resps, num_units,
dropout, dim_hidden, memory_size, BATCH_SIZE, q_matrix):
super(LBKT, self).__init__()
q_matrix = torch.from_numpy(q_matrix).float().to(device)
self.recurrent = Recurrent(num_topics, dim_tp, num_resps, num_units,
dropout, dim_hidden, memory_size,
BATCH_SIZE, q_matrix).to(device)
self.batch_size = BATCH_SIZE

def train(self, train_data, test_data, epoch: int,
lr, lr_decay_step=1, lr_decay_rate=0.5) -> ...:
optimizer = torch.optim.Adam(self.recurrent.parameters(), lr=lr,
eps=1e-8, betas=(0.1, 0.999),
weight_decay=1e-6)
scheduler = torch.optim.lr_scheduler.StepLR(
optimizer, lr_decay_step, gamma=lr_decay_rate)
criterion = nn.BCELoss(reduction='none')

best_test_auc = 0
for idx in range(epoch):
train_loss, _, _ = train_one_epoch(self.recurrent,
optimizer, criterion,
self.batch_size, *train_data)
print("[Epoch %d] LogisticLoss: %.6f" % (idx, train_loss))
scheduler.step()
if test_data is not None:
_, valid_auc, valid_acc, valid_rmse = self.eval(test_data)
print("[Epoch %d] auc: %.6f, accuracy: %.6f, rmse: %.6f" % (
idx, valid_auc, valid_acc, valid_rmse))
if valid_auc > best_test_auc:
best_test_auc = valid_auc
return best_test_auc

def eval(self, test_data) -> ...:
self.recurrent.eval()
return test_one_epoch(self.recurrent, self.batch_size, *test_data)

    def save(self, filepath) -> ...:
        torch.save(self.recurrent.state_dict(), filepath)
logging.info("save parameters to %s" % filepath)

def load(self, filepath) -> ...:
self.recurrent.load_state_dict(torch.load(filepath))
logging.info("load parameters from %s" % filepath)
4 changes: 4 additions & 0 deletions EduKTM/LBKT/__init__.py
@@ -0,0 +1,4 @@
# coding: utf-8
# 2023/11/21 @ xubihan

from .LBKT import LBKT
161 changes: 161 additions & 0 deletions EduKTM/LBKT/model.py
@@ -0,0 +1,161 @@
# coding: utf-8
# 2023/11/21 @ xubihan

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Layer1(nn.Module):
def __init__(self, num_units, d=10, k=0.3, b=0.3, name='lb'):
super(Layer1, self).__init__()
self.weight = nn.Parameter(torch.Tensor(2 * num_units, num_units))
self.bias = nn.Parameter(torch.zeros(1, num_units))

nn.init.xavier_normal_(self.weight)
nn.init.xavier_normal_(self.bias)

self.d = d
self.k = k
self.b = b

def forward(self, factor, interact_emb, h):
k = self.k
d = self.d
b = self.b

gate = k + (1 - k) / (1 + torch.exp(-d * (factor - b)))

w = torch.cat([h, interact_emb], -1).matmul(self.weight) + self.bias

w = nn.Sigmoid()(w * gate)
return w


class LBKTcell(nn.Module):
def __init__(self, num_units, memory_size, dim_tp,
dropout=0.2, name='lbktcell'):
super(LBKTcell, self).__init__()
self.num_units = num_units
self.memory_size = memory_size
self.dim_tp = dim_tp
self.r = 4
self.factor_dim = 50

self.time_gain = Layer1(self.num_units, name='time_gain')
self.attempt_gain = Layer1(self.num_units, name='attempt_gain')
self.hint_gain = Layer1(self.num_units, name='hint_gain')

self.time_weight = nn.Parameter(torch.Tensor(self.r, num_units + 1, num_units))
nn.init.xavier_normal_(self.time_weight)

self.attempt_weight = nn.Parameter(torch.Tensor(self.r, num_units + 1, num_units))
nn.init.xavier_normal_(self.attempt_weight)

self.hint_weight = nn.Parameter(torch.Tensor(self.r, num_units + 1, num_units))
nn.init.xavier_normal_(self.hint_weight)

self.Wf = nn.Parameter(torch.Tensor(1, self.r))
nn.init.xavier_normal_(self.Wf)

self.bias = nn.Parameter(torch.Tensor(1, num_units))
nn.init.xavier_normal_(self.bias)

self.gate3 = nn.Linear(2 * num_units + 3 * self.factor_dim, num_units)
torch.nn.init.xavier_normal_(self.gate3.weight)

self.dropout = nn.Dropout(dropout)
self.output_layer = nn.Linear(dim_tp + num_units, num_units)
torch.nn.init.xavier_normal_(self.output_layer.weight)
self.sig = nn.Sigmoid()

def forward(self, interact_emb, correlation_weight, topic_emb,
time_factor, attempt_factor, hint_factor, h_pre):
        # (bs, 1, memory_size) bmm (bs, memory_size, d_k) -> (bs, d_k)
        h_pre_tilde = torch.squeeze(
            torch.bmm(correlation_weight.unsqueeze(1), h_pre), 1)
        # predict performance
        preds = torch.sum(self.sig(self.output_layer(
            torch.cat([h_pre_tilde, topic_emb], -1))), -1) / self.num_units  # (bs,)

# characterize each behavior's effect
time_gain = self.time_gain(time_factor, interact_emb, h_pre_tilde)
attempt_gain = self.attempt_gain(attempt_factor, interact_emb, h_pre_tilde)
hint_gain = self.hint_gain(hint_factor, interact_emb, h_pre_tilde)

# capture the dependency among different behaviors
        pad = torch.ones_like(time_factor)  # (bs, 1)
        time_gain1 = torch.cat([time_gain, pad], -1)  # (bs, num_units + 1)
        attempt_gain1 = torch.cat([attempt_gain, pad], -1)
        hint_gain1 = torch.cat([hint_gain, pad], -1)
        # (bs, num_units + 1) x (r, num_units + 1, num_units) -> (r, bs, num_units)
fusion_time = torch.matmul(time_gain1, self.time_weight)
fusion_attempt = torch.matmul(attempt_gain1, self.attempt_weight)
fusion_hint = torch.matmul(hint_gain1, self.hint_weight)
fusion_all = fusion_time * fusion_attempt * fusion_hint
        # (1, r) x (bs, r, num_units) -> (bs, 1, num_units) -> (bs, num_units)
        fusion_all = torch.matmul(
            self.Wf, fusion_all.permute(1, 0, 2)).squeeze(1) + self.bias
learning_gain = torch.relu(fusion_all)

        LG = torch.matmul(correlation_weight.unsqueeze(-1),
                          learning_gain.unsqueeze(1))

# forget effect
        forget_gate = self.gate3(torch.cat([
            h_pre,
            interact_emb.unsqueeze(1).repeat(1, self.memory_size, 1),
            time_factor.unsqueeze(1).repeat(1, self.memory_size, self.factor_dim),
            attempt_factor.unsqueeze(1).repeat(1, self.memory_size, self.factor_dim),
            hint_factor.unsqueeze(1).repeat(1, self.memory_size, self.factor_dim),
        ], -1))
LG = self.dropout(LG)
h = h_pre * self.sig(forget_gate) + LG

return preds, h


class Recurrent(nn.Module):
def __init__(self, num_topics, dim_tp, num_resps, num_units, dropout,
dim_hidden, memory_size, batch_size, q_matrix):
super(Recurrent, self).__init__()

self.embedding_topic = nn.Embedding(num_topics + 10, dim_tp)
torch.nn.init.xavier_normal_(self.embedding_topic.weight)

self.embedding_resps = nn.Embedding(num_resps, dim_hidden)
torch.nn.init.xavier_normal_(self.embedding_resps.weight)

self.memory_size = memory_size
self.num_units = num_units
self.dim_tp = dim_tp
self.q_matrix = q_matrix

self.input_layer = nn.Linear(dim_tp + dim_hidden, num_units)
torch.nn.init.xavier_normal_(self.input_layer.weight)

self.lbkt_cell = LBKTcell(num_units, memory_size,
dim_tp, dropout=dropout, name='lbkt')

self.init_h = nn.Parameter(torch.Tensor(memory_size, num_units))
nn.init.xavier_normal_(self.init_h)

def forward(self, topics, resps, time_factor, attempt_factor, hint_factor):
batch_size, seq_len = topics.size(0), topics.size(1)
topic_emb = self.embedding_topic(topics)
resps_emb = self.embedding_resps(resps)

correlation_weight = self.q_matrix[topics]
acts_emb = torch.relu(self.input_layer(torch.cat([topic_emb, resps_emb], -1)))

time_factor = time_factor.unsqueeze(-1)
attempt_factor = attempt_factor.unsqueeze(-1)
hint_factor = hint_factor.unsqueeze(-1)

h_init = self.init_h.unsqueeze(0).repeat(batch_size, 1, 1)
h_pre = h_init
preds = torch.zeros(batch_size, seq_len).to(device)
for t in range(0, seq_len):
pred, h = self.lbkt_cell(acts_emb[:, t], correlation_weight[:, t],
topic_emb[:, t], time_factor[:, t],
attempt_factor[:, t], hint_factor[:, t], h_pre)
h_pre = h

preds[:, t] = pred

return preds
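
As a quick sanity check of the tensor shapes flowing through `Recurrent` and `LBKTcell`, a toy forward pass such as the following could be used (arbitrary sizes and random inputs; the import path simply mirrors the file layout added in this commit):

```python
# Hypothetical shape check for the Recurrent module (toy sizes, random inputs).
import torch
from EduKTM.LBKT.model import Recurrent

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bs, seq_len, num_topics, memory_size, num_units = 4, 12, 30, 8, 32
q_matrix = torch.rand(num_topics + 10, memory_size, device=device)

net = Recurrent(num_topics=num_topics, dim_tp=16, num_resps=2,
                num_units=num_units, dropout=0.2, dim_hidden=16,
                memory_size=memory_size, batch_size=bs,
                q_matrix=q_matrix).to(device)

topics = torch.randint(1, num_topics, (bs, seq_len), device=device)
resps = torch.randint(0, 2, (bs, seq_len), device=device)
time_f = torch.rand(bs, seq_len, device=device)
attempt_f = torch.rand(bs, seq_len, device=device)
hint_f = torch.rand(bs, seq_len, device=device)

preds = net(topics, resps, time_f, attempt_f, hint_f)
assert preds.shape == (bs, seq_len)  # one P(correct) per interaction, values in (0, 1)
```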
1 change: 1 addition & 0 deletions EduKTM/__init__.py
@@ -11,3 +11,4 @@
from .GKT import GKT
from .DKVMN import DKVMN
from .SKT import SKT
from .LBKT import LBKT
10 changes: 9 additions & 1 deletion docs/DKT.md
@@ -1,6 +1,14 @@
# Deep Knowledge Tracing (DKT)

If the reader wants to know the details of DKT, please refer to the Appendix of the paper: *[Deep Knowledge Tracing](http://stanford.edu/~cpiech/bio/papers/deepKnowledgeTracing.pdf)*.
Deep knowledge tracing (DKT) was the first approach to introduce deep learning into knowledge tracing (KT). It uses recurrent neural networks (RNNs) to model the student learning process: the RNN processes the sequence of learning interactions over time while maintaining a hidden state that implicitly represents the student's knowledge state, which evolves based on both the previous knowledge state and the current learning interaction.

![DKT model](_static/DKT.png)

The above figure shows the data flow of the DKT model: $x_i$ are the input embeddings of the students' learning interactions, $h_i$ are the hidden states representing the students' knowledge states, and $y_i$ are the predicted responses. The high-dimensional, continuous representation of the knowledge state makes DKT better able to model the complex learning process. In practice, the RNN variant long short-term memory (LSTM) is more commonly used to implement DKT, since its gating mechanism also accounts for forgetting; we likewise use an LSTM to implement the DKT model in our library.
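
A minimal sketch of this data flow, assuming one-hot encoded (skill, correctness) inputs and illustrative layer sizes (the actual implementation in this library may differ):

```python
import torch
import torch.nn as nn


class DKTSketch(nn.Module):
    """Toy DKT: one-hot (skill, correctness) interactions -> LSTM -> per-skill predictions."""

    def __init__(self, num_skills, hidden_size=128):
        super(DKTSketch, self).__init__()
        # each interaction x_i is a 2 * num_skills one-hot vector (skill id x correctness)
        self.lstm = nn.LSTM(2 * num_skills, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, num_skills)

    def forward(self, x):                    # x: (batch, seq_len, 2 * num_skills)
        h, _ = self.lstm(x)                  # h_i: hidden knowledge states
        return torch.sigmoid(self.out(h))    # y_i: P(correct) for every skill at each step
```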



If the reader wants to know the details of DKT, please refer to the paper: *[Deep Knowledge Tracing](http://stanford.edu/~cpiech/bio/papers/deepKnowledgeTracing.pdf)*.
```bibtex
@article{piech2015dkt,
title={Deep Knowledge Tracing},