# CS490 Deep Knowledge Tracing using Transformers

---

Implementation in TensorFlow, trained on the ASSISTments 2017 dataset.

# Importing Required Modules
---

In [None]:
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, MultiHeadAttention, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC
from tensorflow.keras.regularizers import L2

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

Next, mount Google Drive and load the ASSISTments 2017 dataset.

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the entries in MyDrive whose names start with "ass" to locate the dataset CSV.
!ls /content/drive/MyDrive/ass*

/content/drive/MyDrive/assistments_2017.csv


In [None]:
# Load the ASSISTments 2017 interaction log from the mounted Drive.
# NOTE(review): the directory listing earlier in this notebook shows the CSV directly
# under MyDrive, whereas this path goes through MyDrive/DeepKT — confirm which is current.
data = pd.read_csv('/content/drive/MyDrive/DeepKT/assistments_2017.csv')

# Preview the first 20 rows.
data.head(20)

  data = pd.read_csv('/content/drive/MyDrive/assistments_2017.csv')


Unnamed: 0,studentId,MiddleSchoolId,InferredGender,SY ASSISTments Usage,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,...,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING,Ln-1,Ln,MCAS,Enrolled,Selective,isSTEM
0,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.785585,0.000264,0.13,0.06119,45,0,0,
1,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.887452,0.0,0.468252,0.001483,0.06119,0.21351,45,0,0,
2,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.887452,0.0,0.468252,0.001483,0.116,0.033306,45,0,0,
3,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.108417,0.010665,0.116,0.033306,45,0,0,
4,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.108417,0.010665,0.033306,0.118386,45,0,0,
5,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.785585,0.002026,0.033306,0.118386,45,0,0,
6,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,1.0,0.108417,0.005952,0.033306,0.118386,45,0,0,
7,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.785585,0.010665,0.348,0.138588,45,0,0,
8,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.916914,0.012562,0.168,0.097911,45,0,0,
9,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.916914,0.012562,0.168,0.097911,45,0,0,


In [None]:
# Summarise the columns: names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942816 entries, 0 to 942815
Data columns (total 82 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   studentId                               942816 non-null  int64  
 1   MiddleSchoolId                          942816 non-null  int64  
 2   InferredGender                          769160 non-null  object 
 3   SY ASSISTments Usage                    942816 non-null  object 
 4   AveKnow                                 942816 non-null  float64
 5   AveCarelessness                         942816 non-null  float64
 6   AveCorrect                              942816 non-null  float64
 7   NumActions                              942816 non-null  int64  
 8   AveResBored                             942816 non-null  float64
 9   AveResEngcon                            942816 non-null  float64
 10  AveResConf                              9428

# Defining Model
---

In [None]:
class TransformerBlock(tf.keras.layers.Layer):
    """Post-norm Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Size of the last axis of the block's input and output. It is
            also passed to the attention layer as the per-head ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden ``Dense`` layer of the feed-forward network.
        dropout_rate: Rate shared by both dropout layers.
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        # Two-layer position-wise network that projects back to embed_dim so the
        # residual addition in call() is shape-compatible.
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs, training=None, mask=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide from the calling context, so the layer can be
                invoked without an explicit flag.
            mask: Optional tensor forwarded as ``attention_mask`` to the
                attention layer.

        Returns:
            The layer-normalised output of the feed-forward sub-layer.
        """
        attn_output = self.attention(inputs, inputs, attention_mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class Transformer_DKTModel(Model):
    """Causal Transformer over a sequence of integer-encoded interactions.

    Inputs are integer ids in ``[0, 2 * num_skills)``; they are embedded,
    combined with a fixed sinusoidal positional encoding, passed through a
    stack of ``TransformerBlock`` layers under a causal attention mask, and
    projected with a sigmoid ``Dense`` layer to ``num_skills`` outputs per step.

    Args:
        num_skills: Number of distinct skills; sets the embedding vocabulary
            (``2 * num_skills``) and the output width.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per transformer block.
        ff_dim: Hidden width of each block's feed-forward network.
        num_transformer_blocks: Number of stacked transformer blocks.
        dropout_rate: Dropout rate used inside the blocks and before the output layer.
    """

    def __init__(self, num_skills, embed_dim, num_heads, ff_dim, num_transformer_blocks, dropout_rate=0.1):
        super(Transformer_DKTModel, self).__init__()

        self.input_dim = 2 * num_skills
        self.num_skills = num_skills
        self.embed_dim = embed_dim

        self.embedding = Embedding(self.input_dim, embed_dim)
        # Precomputed once; sliced to the actual sequence length in call().
        self.pos_encoding = self._positional_encoding(embed_dim)

        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)
            for _ in range(num_transformer_blocks)
        ]

        self.dropout = Dropout(dropout_rate)
        self.fc = Dense(num_skills, activation="sigmoid")

    def _positional_encoding(self, d_model, max_length=5000):
        """Build a fixed sinusoidal encoding of shape ``(1, max_length, d_model)``.

        Sines of the even-indexed angle columns are concatenated with cosines of
        the odd-indexed columns (the two halves are not interleaved).
        """
        pos = tf.range(max_length, dtype=tf.float32)[:, tf.newaxis]
        i = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
        angle = pos / tf.pow(10000.0, (2 * (i // 2)) / tf.cast(d_model, tf.float32))

        sines = tf.sin(angle[:, 0::2])
        cosines = tf.cos(angle[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]

        return tf.cast(pos_encoding, tf.float32)

    def _create_causal_mask(self, size):
        """Return a ``(1, 1, size, size)`` causal attention mask.

        Entry ``[..., i, j]`` is 1 when query position ``i`` may attend to key
        position ``j`` (``j <= i``) and 0 otherwise. Keras attention masks use
        1 for "attend" and 0 for "blocked", so the lower triangle (including the
        diagonal) is kept as-is; inverting it would expose only future steps.
        """
        mask = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask[tf.newaxis, tf.newaxis, :, :]

    def call(self, inputs, training=False):
        """Run the model on a batch of interaction-id sequences.

        Args:
            inputs: Integer tensor of shape ``(batch_size, seq_len)``.
            training: Whether dropout is active.

        Returns:
            Tensor of shape ``(batch_size, seq_len, num_skills)`` with sigmoid outputs.
        """
        # Use the dynamic shape so variable-length batches work.
        seq_len = tf.shape(inputs)[1]

        # Each step may look at itself and earlier steps only.
        mask = self._create_causal_mask(seq_len)

        x = self.embedding(inputs)  # (batch_size, seq_len, embed_dim)

        # Add the positional encoding for the first seq_len positions.
        x = x + self.pos_encoding[:, :seq_len, :]

        for transformer in self.transformer_blocks:
            x = transformer(x, training=training, mask=mask)

        x = self.dropout(x, training=training)
        return self.fc(x)  # (batch_size, seq_len, num_skills)

class DKTDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves (inputs, targets) batches and reshuffles each epoch.

    Args:
        interaction_indices: Indexable collection of input sequences.
        next_skill_masks: Indexable collection of targets, aligned with
            ``interaction_indices`` along the first axis.
        batch_size: Maximum number of samples per batch; the final batch may be smaller.
        **kwargs: Forwarded to the Keras base class constructor.
    """

    def __init__(self, interaction_indices, next_skill_masks, batch_size=32, **kwargs):
        # The Keras base class must be initialised; skipping this triggers the
        # "super not called" warning when the generator is passed to fit().
        super().__init__(**kwargs)
        self.interaction_indices = interaction_indices
        self.next_skill_masks = next_skill_masks
        self.batch_size = batch_size
        # Sample order; permuted in place by on_epoch_end().
        self.indices = np.arange(len(self.interaction_indices))

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as a ``(batch_x, batch_y)`` pair of tensors."""
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.indices))
        batch_indices = self.indices[start_idx:end_idx]

        # Stack the selected samples so the batch axis comes first.
        batch_x = np.array([self.interaction_indices[i] for i in batch_indices])
        batch_y = np.array([self.next_skill_masks[i] for i in batch_indices])

        return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)

    def on_epoch_end(self):
        """Shuffle the sample order for the next epoch."""
        np.random.shuffle(self.indices)

# Preprocess
---

In [None]:
def create_skill_mappings(df):
    """Number the distinct values of ``df['skill']`` in order of first appearance.

    Returns:
        A pair ``(skill_to_id, id_to_skill)`` of dicts that are inverses of each other.
    """
    skill_to_id = {}
    id_to_skill = {}
    for position, name in enumerate(df['skill'].unique()):
        skill_to_id[name] = position
        id_to_skill[position] = name
    return skill_to_id, id_to_skill

def create_interaction_index(row, skill_to_id):
    """Encode one interaction as ``2 * skill_id + correct``.

    ``row`` must provide ``'skill'`` (a key of ``skill_to_id``) and ``'correct'``.
    """
    # Each skill owns two consecutive slots; 'correct' selects within the pair.
    slot_base = skill_to_id[row['skill']] * 2
    return slot_base + row['correct']

def prepare_sequences(df, skill_to_id, sequence_length=50):
    """Slice a student's interactions into overlapping fixed-length windows.

    Rows are ordered by ``'startTime'``. Window ``w`` covers rows
    ``w .. w + sequence_length - 1`` as inputs, and each input step is paired
    with a one-hot vector marking the skill of the row that follows it.

    Args:
        df: DataFrame with ``'startTime'``, ``'skill'`` and ``'correct'`` columns.
        skill_to_id: Mapping from every skill value in ``df`` to an integer id.
        sequence_length: Number of input steps per window.

    Returns:
        ``(sequences, next_skill_masks)`` where ``sequences`` has shape
        ``(num_windows, sequence_length)`` and holds ``2 * skill_id + correct``,
        and ``next_skill_masks`` has shape
        ``(num_windows, sequence_length, num_skills)``. With
        ``num_windows = len(df) - sequence_length``; when that is not positive,
        empty arrays with the same trailing dimensions are returned.

    Raises:
        KeyError: If ``df['skill']`` contains a value missing from ``skill_to_id``.
    """
    num_skills = len(skill_to_id)
    df = df.sort_values('startTime')

    # Encode every row once instead of re-reading rows for each window.
    skill_ids = np.array([skill_to_id[skill] for skill in df['skill']], dtype=np.int64)
    interactions = 2 * skill_ids + df['correct'].to_numpy()

    num_windows = max(len(df) - sequence_length, 0)
    if num_windows == 0:
        # Too few rows for even one window: keep the trailing dimensions so
        # downstream shape handling stays consistent.
        return (
            np.empty((0, sequence_length), dtype=interactions.dtype),
            np.empty((0, sequence_length, num_skills)),
        )

    # positions[w, t] is the row index of step t in window w.
    positions = np.arange(num_windows)[:, np.newaxis] + np.arange(sequence_length)
    sequences = interactions[positions]

    # Row t + 1 supplies the target skill for step t; indexing the identity
    # matrix turns each skill id into its one-hot row.
    next_skill_masks = np.eye(num_skills)[skill_ids[positions + 1]]

    return sequences, next_skill_masks

def create_data_splits(df, skill_to_id, sequence_length=50, test_size=0.2, val_size=0.1):
    """Window the data and split it, in order, into train / validation / test tensors.

    ``test_size`` and ``val_size`` are fractions of the full set of windows.
    Splitting is done without shuffling, so the test set is the trailing part
    and the validation set is the part just before it.

    NOTE(review): if consecutive windows overlap, windows on either side of a
    split boundary share interactions — confirm this is acceptable.

    Returns:
        Dict with int32 tensors under the keys ``X_train``, ``Y_train``,
        ``X_val``, ``Y_val``, ``X_test`` and ``Y_test``.
    """
    features, targets = prepare_sequences(df, skill_to_id, sequence_length)

    # Carve off the test portion first.
    features_rest, features_test, targets_rest, targets_test = train_test_split(
        features, targets, test_size=test_size, shuffle=False
    )

    # val_size refers to the full data, so rescale it to the remaining portion.
    val_fraction = val_size / (1 - test_size)
    features_train, features_val, targets_train, targets_val = train_test_split(
        features_rest, targets_rest, test_size=val_fraction, shuffle=False
    )

    named_arrays = (
        ('X_train', features_train),
        ('Y_train', targets_train),
        ('X_val', features_val),
        ('Y_val', targets_val),
        ('X_test', features_test),
        ('Y_test', targets_test),
    )

    data_splits = {}
    for key, array in named_arrays:
        data_splits[key] = tf.convert_to_tensor(array, dtype=tf.int32)
    return data_splits


# NOTE(review): DKTDataGenerator is also defined earlier in this notebook; this later
# definition is the one in effect after a top-to-bottom run. Consider keeping only one.
class DKTDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves (inputs, targets) batches and reshuffles each epoch.

    Args:
        interaction_indices: Indexable collection of input sequences.
        next_skill_masks: Indexable collection of targets, aligned with
            ``interaction_indices`` along the first axis.
        batch_size: Maximum number of samples per batch; the final batch may be smaller.
        **kwargs: Forwarded to the Keras base class constructor.
    """

    def __init__(self, interaction_indices, next_skill_masks, batch_size=32, **kwargs):
        # The Keras base class must be initialised; skipping this triggers the
        # "super not called" warning when the generator is passed to fit().
        super().__init__(**kwargs)
        self.interaction_indices = interaction_indices
        self.next_skill_masks = next_skill_masks
        self.batch_size = batch_size
        # Sample order; permuted in place by on_epoch_end().
        self.indices = np.arange(len(self.interaction_indices))

    def __len__(self):
        """Number of batches per epoch (the last one may be partial)."""
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as a ``(batch_x, batch_y)`` pair of tensors."""
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.indices))
        batch_indices = self.indices[start_idx:end_idx]

        # Stack the selected samples so the batch axis comes first.
        batch_x = np.array([self.interaction_indices[i] for i in batch_indices])
        batch_y = np.array([self.next_skill_masks[i] for i in batch_indices])

        return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)

    def on_epoch_end(self):
        """Shuffle the sample order for the next epoch."""
        np.random.shuffle(self.indices)

# Work with the interaction log of a single student (studentId 8).
df = data.loc[data['studentId'] == 8]

# Give every skill that appears in this student's log an integer id.
skill_to_id, id_to_skill = create_skill_mappings(df)

# Window the log and split it into train / validation / test tensors.
data_splits = create_data_splits(df, skill_to_id, sequence_length=50, test_size=0.2, val_size=0.1)

In [None]:
# Display the split tensors to check their shapes and dtypes.
data_splits

{'X_train': <tf.Tensor: shape=(703, 50), dtype=int32, numpy=
 array([[  0,   1,   2, ...,  10,  23,  13],
        [  1,   2,   2, ...,  23,  13,  11],
        [  2,   2,   3, ...,  13,  11,  10],
        ...,
        [ 95, 121,   3, ...,  11,  10,  10],
        [121,   3,  94, ...,  10,  10,  10],
        [  3,  94,  95, ...,  10,  10,  11]], dtype=int32)>,
 'Y_train': <tf.Tensor: shape=(703, 50, 76), dtype=int32, numpy=
 array([[[1, 0, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0

# Training Code
---

In [None]:
def train_model(train_generator, val_generator, num_skills, epochs=50):
    """Build, compile and fit a ``Transformer_DKTModel``.

    Args:
        train_generator: Batches used for fitting.
        val_generator: Batches used for validation; drives the callbacks.
        num_skills: Number of skills; sizes the model's vocabulary and output.
        epochs: Maximum number of epochs (early stopping may end sooner).

    Returns:
        ``(model, history)``: the fitted model and the object returned by ``fit``.
    """
    model = Transformer_DKTModel(
        # Use the argument rather than a notebook global so the function works
        # for whatever data the caller passes in.
        num_skills=num_skills,
        embed_dim=128,        # Increased from 64
        num_heads=4,          # Reduced from 8
        ff_dim=256,          # Increased from 128
        num_transformer_blocks=3,  # Reduced from 6
        dropout_rate=0.2     # Increased from 0.1
    )

    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=0.001,
        weight_decay=0.01,
        clipnorm=1.0
    )

    model.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    callbacks = [
        # Stop when validation loss stalls and roll back to the best weights.
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            min_delta=0.001
        ),
        # Halve the learning rate after 5 stalled epochs, down to min_lr.
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=0.0001,
            min_delta=0.001
        ),
        # Keep only the weights with the highest validation AUC; without
        # save_best_only the file would be overwritten after every epoch.
        tf.keras.callbacks.ModelCheckpoint(
            'transformer_dkt_best.weights.h5',
            monitor='val_auc',
            save_weights_only=True,
            save_best_only=True,
            mode='max'
        )
    ]

    history = model.fit(
        train_generator,
        validation_data=val_generator,
        epochs=epochs,
        callbacks=callbacks
    )

    return model, history

def execute_training(data_splits, skill_to_id, batch_size=32, epochs=50):
    """Wrap the train and validation splits in generators and train the model.

    Args:
        data_splits: Dict providing ``X_train``, ``Y_train``, ``X_val`` and ``Y_val``.
        skill_to_id: Skill-to-id mapping; only its size is used.
        batch_size: Batch size for both generators.
        epochs: Maximum number of training epochs.

    Returns:
        The ``(model, history)`` pair produced by ``train_model``.
    """
    split_keys = (('X_train', 'Y_train'), ('X_val', 'Y_val'))
    train_generator, val_generator = [
        DKTDataGenerator(data_splits[x_key], data_splits[y_key], batch_size=batch_size)
        for x_key, y_key in split_keys
    ]

    return train_model(
        train_generator,
        val_generator,
        num_skills=len(skill_to_id),
        epochs=epochs,
    )

# Testing Code
---

In [None]:
def evaluate_model(model, data_splits, id_to_skill):
    """Predict on the test split, print loss / accuracy / AUC, and return the predictions.

    ``id_to_skill`` is accepted but not used here.
    """
    x_test = data_splits['X_test']
    test_predictions = model.predict(x_test)

    # evaluate() yields the loss first, then the compiled metrics in order.
    test_results = model.evaluate(x_test, data_splits['Y_test'], verbose=0)

    print("\nOverall Test Set Performance:")
    print("Loss: {:.4f}".format(test_results[0]))
    print("Accuracy: {:.4f}".format(test_results[1]))
    print("AUC: {:.4f}".format(test_results[2]))

    return test_predictions

def analyze_skill_performance(predictions, true_values, id_to_skill):
    """Compute binary accuracy and AUC separately for each skill's output channel.

    Args:
        predictions: 3-D array/tensor indexed as ``[:, :, skill_id]``.
        true_values: Targets with the same layout as ``predictions``.
        id_to_skill: Mapping from ids ``0 .. len - 1`` to skill names.

    Returns:
        Dict mapping each skill name to ``{'accuracy': float, 'auc': float}``.
    """
    skill_metrics = {}

    for skill_id in range(len(id_to_skill)):
        skill_name = id_to_skill[skill_id]

        # Flatten this skill's channel across all sequences and steps.
        flat_pred = tf.reshape(predictions[:, :, skill_id], [-1])
        flat_true = tf.reshape(true_values[:, :, skill_id], [-1])

        mean_accuracy = tf.reduce_mean(
            tf.keras.metrics.binary_accuracy(flat_true, flat_pred)
        )

        # A fresh metric object per skill so state never carries over.
        auc_metric = tf.keras.metrics.AUC()
        auc_metric.update_state(flat_true, flat_pred)

        skill_metrics[skill_name] = dict(
            accuracy=float(mean_accuracy),
            auc=float(auc_metric.result()),
        )

    return skill_metrics

def analyze_sequential_predictions(predictions, true_values, id_to_skill, num_sequences=5):
    """Print, step by step, the top-scoring skill versus the target skill.

    Only the first ``num_sequences`` sequences (or fewer, if fewer exist) are shown.
    """
    shown = min(num_sequences, predictions.shape[0])

    for seq_idx in range(shown):
        print(f"\nSequence {seq_idx + 1} Analysis:")
        step_scores = predictions[seq_idx]
        step_targets = true_values[seq_idx]

        for step in range(step_scores.shape[0]):
            scores_now = step_scores[step]

            # Highest-scoring skill versus the index of the largest target entry.
            pred_skill_id = int(tf.argmax(scores_now))
            true_skill_id = int(tf.argmax(step_targets[step]))
            pred_prob = float(scores_now[pred_skill_id])

            print(f"\nStep {step + 1}:")
            print(f"Predicted Skill: {id_to_skill[pred_skill_id]}")
            print(f"Actual Skill: {id_to_skill[true_skill_id]}")
            print(f"Prediction Confidence: {pred_prob:.3f}")

def run_model_evaluation(model, data_splits, id_to_skill):
    """Run the full test-set report: overall metrics, per-skill metrics, sample sequences.

    Returns:
        ``(test_predictions, skill_metrics)``.
    """
    print("Starting model evaluation...")

    y_test = data_splits['Y_test']
    test_predictions = evaluate_model(model, data_splits, id_to_skill)
    skill_metrics = analyze_skill_performance(test_predictions, y_test, id_to_skill)

    print("\nPer-Skill Performance:")
    for skill in skill_metrics:
        scores = skill_metrics[skill]
        print(f"\nSkill: {skill}")
        print(f"Accuracy: {scores['accuracy']:.4f}")
        print(f"AUC: {scores['auc']:.4f}")

    print("\nDetailed Sequence Analysis:")
    analyze_sequential_predictions(test_predictions, y_test, id_to_skill)

    return test_predictions, skill_metrics

def execute_testing(model, data_splits, skill_to_id):
    """Invert ``skill_to_id`` and run the full evaluation report.

    Returns:
        The ``(predictions, metrics)`` pair from ``run_model_evaluation``.
    """
    # Swap keys and values to get the id -> skill lookup the report needs.
    id_to_skill = dict(zip(skill_to_id.values(), skill_to_id.keys()))
    return run_model_evaluation(model, data_splits, id_to_skill)

# Run
---

In [None]:
# Display the split tensors once more before training to confirm their shapes.
data_splits

{'X_train': <tf.Tensor: shape=(703, 49), dtype=int32, numpy=
 array([[  0,   1,   2, ...,  21,  10,  23],
        [  1,   2,   2, ...,  10,  23,  13],
        [  2,   2,   3, ...,  23,  13,  11],
        ...,
        [ 95, 121,   3, ...,  10,  11,  10],
        [121,   3,  94, ...,  11,  10,  10],
        [  3,  94,  95, ...,  10,  10,  10]], dtype=int32)>,
 'Y_train': <tf.Tensor: shape=(703, 49, 76), dtype=int32, numpy=
 array([[[1, 0, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]],
 
        [[0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         [0, 1, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0

In [None]:
# Fit the transformer model on the prepared splits.
model, history = execute_training(data_splits, skill_to_id, batch_size=32, epochs=25)

# Score the held-out test windows with the trained model.
model.evaluate(data_splits['X_test'], data_splits['Y_test'])

# Persist the trained model in the native Keras format.
model.save('transformer_dkt_model.keras')

Epoch 1/25


  self._warn_if_super_not_called()


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 748ms/step - accuracy: 0.0150 - auc: 0.5320 - loss: 0.3976 - val_accuracy: 0.0212 - val_auc: 0.6488 - val_loss: 0.0925 - learning_rate: 0.0010
Epoch 2/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 645ms/step - accuracy: 0.0240 - auc: 0.5928 - loss: 0.0963 - val_accuracy: 0.0279 - val_auc: 0.5639 - val_loss: 0.0703 - learning_rate: 0.0010
Epoch 3/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 620ms/step - accuracy: 0.0422 - auc: 0.6469 - loss: 0.0719 - val_accuracy: 0.0279 - val_auc: 0.5407 - val_loss: 0.0706 - learning_rate: 0.0010
Epoch 4/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 643ms/step - accuracy: 0.0466 - auc: 0.6850 - loss: 0.0684 - val_accuracy: 0.0000e+00 - val_auc: 0.6049 - val_loss: 0.0708 - learning_rate: 0.0010
Epoch 5/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 592ms/step - accuracy: 0.0534 - auc: 0.7002 - loss: 0.0