# CS490 Deep Knowledge Tracing using LSTM

---

A TensorFlow/Keras implementation of Deep Knowledge Tracing (DKT) built on stacked LSTM layers and trained on the ASSISTments 2017 dataset.

# Importing Required Modules
---

In [None]:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, LayerNormalization
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import L2

Next, mount Google Drive and load the ASSISTments 2017 dataset.

In [None]:
# Colab-only helper module; this cell will not run outside Google Colab.
from google.colab import drive

# Mount the user's Google Drive at /content/drive so the dataset CSV is readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the ASSISTments 2017 CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/DeepKT/assistments_2017.csv')

# Bare expression: in a notebook this displays the first 20 rows for a sanity check.
data.head(20)

  data = pd.read_csv('/content/drive/MyDrive/Colab/assistments_2017.csv')


Unnamed: 0,studentId,MiddleSchoolId,InferredGender,SY ASSISTments Usage,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,...,RES_CONFUSED,RES_FRUSTRATED,RES_OFFTASK,RES_GAMING,Ln-1,Ln,MCAS,Enrolled,Selective,isSTEM
0,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.785585,0.000264,0.13,0.06119,45,0,0,
1,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.887452,0.0,0.468252,0.001483,0.06119,0.21351,45,0,0,
2,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.887452,0.0,0.468252,0.001483,0.116,0.033306,45,0,0,
3,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.108417,0.010665,0.116,0.033306,45,0,0,
4,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.108417,0.010665,0.033306,0.118386,45,0,0,
5,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,0.0,0.785585,0.002026,0.033306,0.118386,45,0,0,
6,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.0,1.0,0.108417,0.005952,0.033306,0.118386,45,0,0,
7,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.785585,0.010665,0.348,0.138588,45,0,0,
8,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.916914,0.012562,0.168,0.097911,45,0,0,
9,8,2,Male,2004-2005,0.352416,0.183276,0.483902,1056,0.208389,0.679126,...,0.060808,0.0,0.916914,0.012562,0.168,0.097911,45,0,0,


# Defining Model
---

In [None]:
class LSTM_DKTModel(Model):
    """Deep Knowledge Tracing network: embedding -> stacked LSTMs -> per-skill sigmoid.

    Each input token is an integer "interaction index" in ``[0, 2 * num_skills)``
    that encodes a (skill, correct/incorrect) pair. For every time step the
    model emits one sigmoid activation per skill.

    Args:
        num_skills: Number of distinct skills; also the width of the output.
        embed_dim: Size of the interaction embedding vectors.
        lstm_units: Hidden size of every LSTM layer.
        num_lstm_layers: How many LSTM layers are stacked.
        dropout_rate: Dropout applied to LSTM inputs and to the normalized
            LSTM output; half of this value is used as recurrent dropout.
    """

    def __init__(self, num_skills, embed_dim=64, lstm_units=128, num_lstm_layers=2, dropout_rate=0.1):
        super().__init__()

        # Vocabulary is 2 * num_skills: one index per (skill, correctness) pair.
        self.input_dim = 2 * num_skills
        self.num_skills = num_skills
        self.lstm_units = lstm_units

        # Dense vector lookup for skill-correctness pairs.
        self.embedding = Embedding(self.input_dim, embed_dim)

        # Every layer returns the full sequence so the time axis is preserved
        # all the way to the output (one prediction per step).
        # NOTE: per the Keras LSTM docs, a non-zero recurrent_dropout disables
        # the fused cuDNN kernel, so training on GPU is slower than it could be.
        self.lstm_layers = [
            LSTM(
                units=lstm_units,
                return_sequences=True,
                dropout=dropout_rate,
                recurrent_dropout=dropout_rate / 2,  # Lower dropout for recurrent connections
                name=f'lstm_layer_{i}',
            )
            for i in range(num_lstm_layers)
        ]

        # Layer normalization after the LSTM stack for stable training.
        self.layer_norm = LayerNormalization(epsilon=1e-6)

        # Dropout for regularization ahead of the output projection.
        self.dropout = Dropout(dropout_rate)

        # Output layer: one independent sigmoid per skill.
        self.fc = Dense(num_skills, activation="sigmoid")

    def call(self, inputs, training=False):
        """Run the forward pass.

        Args:
            inputs: Integer interaction indices, shape (batch_size, seq_len).
            training: Whether dropout should be active.

        Returns:
            Tensor of shape (batch_size, seq_len, num_skills) with values in (0, 1).
        """
        # The original computed batch_size/seq_len via tf.shape here but never
        # used them; they were dropped, which also removes this method's
        # dependency on a module-level `tf` name.
        x = self.embedding(inputs)  # Shape: (batch_size, seq_len, embed_dim)

        for lstm_layer in self.lstm_layers:
            x = lstm_layer(x, training=training)

        x = self.layer_norm(x)
        x = self.dropout(x, training=training)

        return self.fc(x)  # Shape: (batch_size, seq_len, num_skills)



# Preprocess
---

In [None]:
def create_skill_mappings(df):
    """Assign a dense integer id to every distinct value in ``df['skill']``.

    Ids follow the order of first appearance reported by ``Series.unique()``.

    Returns:
        A ``(skill_to_id, id_to_skill)`` pair of dictionaries that are
        inverses of each other.
    """
    skill_to_id = {}
    id_to_skill = {}
    for position, name in enumerate(df['skill'].unique()):
        skill_to_id[name] = position
        id_to_skill[position] = name
    return skill_to_id, id_to_skill

def create_interaction_index(row, skill_to_id):
    """Encode one interaction as a single integer token.

    The token is ``2 * skill_id + correct``, so each skill owns two adjacent
    indices: the even one when ``row['correct']`` is 0 and the odd one when
    it is 1.
    """
    even_slot = 2 * skill_to_id[row['skill']]
    return even_slot + row['correct']

def prepare_sequences(df, skill_to_id, sequence_length=50):
    """Slice a student's interaction log into overlapping training windows.

    Rows are ordered by ``startTime`` and a window of ``sequence_length`` rows
    is slid over them one row at a time. For each window the first
    ``sequence_length - 1`` rows become input tokens
    (``2 * skill_id + correct``) and the skill of the row that follows each
    token becomes a one-hot target.

    Compared with the previous row-by-row implementation this version:
      * includes the final valid window (the old loop bound stopped one
        window early, so the last row of ``df`` was never used);
      * builds the tokens once and gathers windows with NumPy indexing
        instead of calling ``iloc`` for every (window, step) pair;
      * returns correctly shaped empty arrays when ``df`` is too short.

    Args:
        df: DataFrame with ``startTime``, ``skill`` and ``correct`` columns.
        skill_to_id: Mapping from skill value to a dense id in
            ``[0, len(skill_to_id))``.
        sequence_length: Number of rows per window (must be >= 2).

    Returns:
        ``(sequences, next_skill_masks)`` where ``sequences`` has shape
        ``(num_windows, sequence_length - 1)`` and ``next_skill_masks`` has
        shape ``(num_windows, sequence_length - 1, num_skills)`` (float64).

    Raises:
        ValueError: If ``sequence_length`` is smaller than 2.
        KeyError: If a skill in ``df`` is missing from ``skill_to_id``.
    """
    if sequence_length < 2:
        raise ValueError("sequence_length must be at least 2")

    num_skills = len(skill_to_id)
    steps = sequence_length - 1  # each window yields this many (input, target) pairs

    df = df.sort_values('startTime')
    num_rows = len(df)

    # Windows start at rows 0 .. num_rows - sequence_length inclusive.
    num_windows = num_rows - sequence_length + 1
    if num_windows <= 0:
        return (
            np.empty((0, steps), dtype=np.int64),
            np.zeros((0, steps, num_skills)),
        )

    # Encode every row exactly once; a generator keeps the KeyError behaviour
    # for unknown skills instead of silently producing NaN.
    skill_ids = np.fromiter(
        (skill_to_id[skill] for skill in df['skill']),
        dtype=np.int64,
        count=num_rows,
    )
    interactions = 2 * skill_ids + df['correct'].to_numpy()

    # positions[w, s] is the row index of step s inside window w.
    positions = np.arange(num_windows)[:, None] + np.arange(steps)[None, :]

    sequences = interactions[positions]
    # Row k of the identity matrix is the one-hot vector for skill id k.
    next_skill_masks = np.eye(num_skills)[skill_ids[positions + 1]]

    return sequences, next_skill_masks

def create_data_splits(df, skill_to_id, sequence_length=50, test_size=0.2, val_size=0.1):
    """Build train/validation/test tensors from a student's interaction log.

    Windows are produced by ``prepare_sequences`` and split without shuffling,
    so the order is train, then validation, then test. ``test_size`` and
    ``val_size`` are both fractions of the full set of windows.

    Returns:
        Dict with keys ``X_train``, ``Y_train``, ``X_val``, ``Y_val``,
        ``X_test`` and ``Y_test``; every value is a ``tf.int32`` tensor.
    """
    inputs, targets = prepare_sequences(df, skill_to_id, sequence_length)

    # Carve the test portion off the end of the sequence of windows.
    x_rest, x_test, y_rest, y_test = train_test_split(
        inputs, targets, test_size=test_size, shuffle=False
    )

    # val_size is relative to the whole data set, so rescale it to the
    # fraction of what is left after removing the test portion.
    relative_val = val_size / (1 - test_size)
    x_train, x_val, y_train, y_val = train_test_split(
        x_rest, y_rest, test_size=relative_val, shuffle=False
    )

    arrays = {
        'X_train': x_train,
        'Y_train': y_train,
        'X_val': x_val,
        'Y_val': y_val,
        'X_test': x_test,
        'Y_test': y_test,
    }
    return {
        key: tf.convert_to_tensor(values, dtype=tf.int32)
        for key, values in arrays.items()
    }


class DKTDataGenerator(tf.keras.utils.Sequence):
    """Mini-batch provider over pre-built DKT windows.

    Args:
        interaction_indices: Indexable collection of input sequences.
        next_skill_masks: Indexable collection of targets, aligned with
            ``interaction_indices``.
        batch_size: Maximum number of windows per batch; the last batch may
            be smaller.
        **kwargs: Forwarded to the Keras base class (for example ``workers``
            or ``use_multiprocessing`` on Keras 3).
    """

    def __init__(self, interaction_indices, next_skill_masks, batch_size=32, **kwargs):
        # The base initializer must run: Keras 3 otherwise emits the
        # `_warn_if_super_not_called` warning and skips its own setup.
        super().__init__(**kwargs)
        self.interaction_indices = interaction_indices
        self.next_skill_masks = next_skill_masks
        self.batch_size = batch_size
        # Permutation of sample positions; reshuffled after every epoch.
        self.indices = np.arange(len(self.interaction_indices))

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as an ``(inputs, targets)`` tensor pair."""
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.indices))
        batch_indices = self.indices[start_idx:end_idx]

        # Stack the selected samples so the leading axis is the batch axis.
        batch_x = np.array([self.interaction_indices[i] for i in batch_indices])
        batch_y = np.array([self.next_skill_masks[i] for i in batch_indices])

        return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)

    def on_epoch_end(self):
        """Shuffle the sample order in place for the next epoch."""
        np.random.shuffle(self.indices)

# Restrict the experiment to the interaction log of a single student (id 8).
df = data.loc[data['studentId'] == 8]

# Map every skill this student encountered to a dense integer id (and back).
skill_to_id, id_to_skill = create_skill_mappings(df)

# Build the train / validation / test tensors: 50-row windows, with 20% of the
# windows held out for testing and 10% for validation.
data_splits = create_data_splits(df, skill_to_id, sequence_length=50, test_size=0.2, val_size=0.1)

# Training Code
---

In [None]:
def train_model(train_generator, val_generator, num_skills, epochs=50):
    """Build, compile and fit an ``LSTM_DKTModel``.

    Args:
        train_generator: Batches used for fitting.
        val_generator: Batches used for validation after every epoch.
        num_skills: Number of distinct skills (output width of the model).
        epochs: Upper bound on the number of epochs; early stopping may end
            training sooner.

    Returns:
        ``(model, history)`` - the fitted model and the object returned by
        ``model.fit``.
    """
    model = LSTM_DKTModel(
        num_skills=num_skills,
        embed_dim=64,
        lstm_units=128,
        num_lstm_layers=2,
        dropout_rate=0.2,
    )

    model.compile(
        optimizer=tf.keras.optimizers.AdamW(
            learning_rate=0.001,
            weight_decay=0.01,
            clipnorm=1.0,
        ),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )

    # Stop once validation loss has not improved by 0.001 for 7 epochs and
    # roll back to the best weights seen.
    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=7,
        restore_best_weights=True,
        min_delta=0.001,
    )
    # Halve the learning rate after 5 stagnant epochs, never below 1e-4.
    halve_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=0.0001,
        min_delta=0.001,
    )
    # Write weights to disk whenever validation AUC reaches a new maximum.
    keep_best_weights = tf.keras.callbacks.ModelCheckpoint(
        'dkt_model_best.weights.h5',
        monitor='val_auc',
        save_best_only=True,
        save_weights_only=True,
        mode='max',
    )

    history = model.fit(
        train_generator,
        validation_data=val_generator,
        epochs=epochs,
        callbacks=[stop_when_stalled, halve_lr_on_plateau, keep_best_weights],
    )

    return model, history

# Training entry point
def execute_training(data_splits, skill_to_id, batch_size=32, epochs=50):
    """Wrap the train/validation splits in generators and fit the model.

    Args:
        data_splits: Dict providing ``X_train``, ``Y_train``, ``X_val`` and
            ``Y_val``.
        skill_to_id: Skill-to-id mapping; only its size is used here.
        batch_size: Batch size for both generators.
        epochs: Maximum number of training epochs.

    Returns:
        The ``(model, history)`` pair produced by ``train_model``.
    """
    train_generator = DKTDataGenerator(
        data_splits['X_train'], data_splits['Y_train'], batch_size=batch_size
    )
    val_generator = DKTDataGenerator(
        data_splits['X_val'], data_splits['Y_val'], batch_size=batch_size
    )

    return train_model(
        train_generator,
        val_generator,
        num_skills=len(skill_to_id),
        epochs=epochs,
    )

# Testing Code
---

In [None]:
def evaluate_model(model, data_splits, id_to_skill):
    """Predict on the test split and print loss, accuracy and AUC.

    Args:
        model: Object exposing ``predict`` and ``evaluate``.
        data_splits: Dict providing ``X_test`` and ``Y_test``.
        id_to_skill: Accepted for a uniform signature; not used here.

    Returns:
        The predictions returned by ``model.predict`` for ``X_test``.
    """
    test_inputs = data_splits['X_test']
    test_predictions = model.predict(test_inputs)

    # evaluate() yields [loss, accuracy, auc], in the order the metrics were compiled.
    test_results = model.evaluate(test_inputs, data_splits['Y_test'], verbose=0)

    print("\nOverall Test Set Performance:")
    print(f"Loss: {test_results[0]:.4f}")
    print(f"Accuracy: {test_results[1]:.4f}")
    print(f"AUC: {test_results[2]:.4f}")

    return test_predictions

def analyze_skill_performance(predictions, true_values, id_to_skill):
    """Compute accuracy and AUC separately for every skill column.

    Args:
        predictions: Model outputs indexed as ``[sample, step, skill]``.
        true_values: Targets with the same layout as ``predictions``.
        id_to_skill: Mapping from skill id (``0 .. n-1``) to skill name.

    Returns:
        Dict mapping each skill name to ``{'accuracy': float, 'auc': float}``.
    """
    def _flatten_skill(values, column):
        # Collapse the (sample, step) axes for one skill into a 1-D tensor.
        return tf.reshape(values[:, :, column], [-1])

    skill_metrics = {}
    for skill_id in range(len(id_to_skill)):
        skill_name = id_to_skill[skill_id]

        flat_pred = _flatten_skill(predictions, skill_id)
        flat_true = _flatten_skill(true_values, skill_id)

        mean_accuracy = tf.reduce_mean(
            tf.keras.metrics.binary_accuracy(flat_true, flat_pred)
        )
        # A fresh metric object per skill keeps AUC state from leaking across skills.
        auc_metric = tf.keras.metrics.AUC()
        auc_metric.update_state(flat_true, flat_pred)

        skill_metrics[skill_name] = {
            'accuracy': float(mean_accuracy),
            'auc': float(auc_metric.result()),
        }

    return skill_metrics

def analyze_sequential_predictions(predictions, true_values, id_to_skill, num_sequences=5):
    """Print a step-by-step comparison of predicted and actual skills.

    For up to ``num_sequences`` sequences, each step reports the skill with
    the highest predicted score, the skill with the highest target value,
    and the predicted score of the former.
    """
    shown = min(num_sequences, predictions.shape[0])
    for seq_idx in range(shown):
        print(f"\nSequence {seq_idx + 1} Analysis:")
        sequence_pred, sequence_true = predictions[seq_idx], true_values[seq_idx]

        for step in range(sequence_pred.shape[0]):
            step_scores = sequence_pred[step]
            pred_skill_id = int(tf.argmax(step_scores))
            true_skill_id = int(tf.argmax(sequence_true[step]))

            # Score the model assigned to its own top-ranked skill.
            pred_prob = float(step_scores[pred_skill_id])

            print(f"\nStep {step + 1}:")
            print(f"Predicted Skill: {id_to_skill[pred_skill_id]}")
            print(f"Actual Skill: {id_to_skill[true_skill_id]}")
            print(f"Prediction Confidence: {pred_prob:.3f}")

def run_model_evaluation(model, data_splits, id_to_skill):
    """Run overall, per-skill and per-sequence evaluation on the test split.

    Returns:
        ``(test_predictions, skill_metrics)`` - the raw test predictions and
        the per-skill metric dictionary.
    """
    print("Starting model evaluation...")

    test_predictions = evaluate_model(model, data_splits, id_to_skill)

    test_targets = data_splits['Y_test']
    skill_metrics = analyze_skill_performance(test_predictions, test_targets, id_to_skill)

    print("\nPer-Skill Performance:")
    for skill, metrics in skill_metrics.items():
        print(f"\nSkill: {skill}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"AUC: {metrics['auc']:.4f}")

    print("\nDetailed Sequence Analysis:")
    analyze_sequential_predictions(test_predictions, test_targets, id_to_skill)

    return test_predictions, skill_metrics

def execute_testing(model, data_splits, skill_to_id):
    """Run the complete testing pipeline.

    Inverts ``skill_to_id`` into an id-to-skill lookup and delegates to
    ``run_model_evaluation``.

    Returns:
        ``(predictions, metrics)`` as produced by ``run_model_evaluation``.
    """
    id_to_skill = dict(zip(skill_to_id.values(), skill_to_id.keys()))
    predictions, metrics = run_model_evaluation(model, data_splits, id_to_skill)
    return predictions, metrics

# Run
---

In [None]:
# Fit the DKT model: mini-batches of 32 windows, at most 25 epochs.
model, history = execute_training(data_splits, skill_to_id, batch_size=32, epochs=25)

# Report test-set metrics plus per-skill and per-sequence diagnostics.
execute_testing(model, data_splits, skill_to_id)

# Persist the trained model to disk.
model.save('lstm_dkt_model.keras')

Epoch 1/25


  self._warn_if_super_not_called()


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 234ms/step - accuracy: 0.0137 - auc: 0.4978 - loss: 0.4716 - val_accuracy: 0.0000e+00 - val_auc: 0.6390 - val_loss: 0.0969 - learning_rate: 0.0010
Epoch 2/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 255ms/step - accuracy: 0.0294 - auc: 0.5913 - loss: 0.0943 - val_accuracy: 0.0978 - val_auc: 0.7012 - val_loss: 0.0685 - learning_rate: 0.0010
Epoch 3/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 292ms/step - accuracy: 0.0692 - auc: 0.6760 - loss: 0.0698 - val_accuracy: 0.0707 - val_auc: 0.6241 - val_loss: 0.0684 - learning_rate: 0.0010
Epoch 4/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 181ms/step - accuracy: 0.1107 - auc: 0.7236 - loss: 0.0654 - val_accuracy: 0.1158 - val_auc: 0.5943 - val_loss: 0.0686 - learning_rate: 0.0010
Epoch 5/25
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 237ms/step - accuracy: 0.1593 - auc: 0.7503 - loss: 0.062