# Deep Knowledge Tracing using Transformer model

Dataset: Assistments 2017

# Data Layer

Import Dataset from Drive

In [None]:
from google.colab import drive
import pandas as pd  # imported here because no earlier cell brings pandas into scope

# Make the Drive contents available under /content/drive.
drive.mount('/content/drive')

# Read the ASSISTments 2017 interaction log from the mounted Drive.
assistments = pd.read_csv('/content/drive/MyDrive/DeepKT/assistments_2017.csv')

Mounted at /content/drive


  assistments = pd.read_csv('/content/drive/MyDrive/DeepKT/assistments_2017.csv')


**Assistments 2017**

We will mainly use two columns from the dataframe, `skill` and `correct`; the other two columns (`studentId` and `action_num`) are used only to aid preprocessing.

In [None]:
# Preview the student id, skill, correctness and action-number columns for the first 15,000 rows.
assistments[['studentId', 'skill', 'correct', 'action_num']].head(15000)

Unnamed: 0,studentId,skill,correct,action_num
0,8,properties-of-geometric-figures,0,9950
1,8,properties-of-geometric-figures,1,9951
2,8,sum-of-interior-angles-more-than-3-sides,0,9952
3,8,sum-of-interior-angles-more-than-3-sides,0,9953
4,8,sum-of-interior-angles-more-than-3-sides,1,9954
...,...,...,...,...
14995,337,interpreting-numberline,1,269095
14996,337,interpreting-numberline,1,269096
14997,337,interpreting-numberline,0,269097
14998,337,inequality-solving,0,269098


# Preprocess

In [None]:
import pandas as pd
import numpy as np

from dataclasses import dataclass
from typing import Tuple, List, Dict

@dataclass
class SequenceConfig:
  """Parameters that control how interaction logs are cut into windows.

  Raises:
    ValueError: if `seq_length` or `sliding_window_step` is smaller than 1,
      or if `max_students` is negative.
  """
  # Number of interactions contained in every window.
  seq_length: int
  # Offset between the start positions of two consecutive windows.
  sliding_window_step: int = 1
  # Upper bound on the number of distinct students kept when loading the data.
  max_students: int = 100

  def __post_init__(self):
    # Fail early with a clear message: a step of 0 would otherwise only surface
    # as a ValueError from range(), and a negative max_students would silently
    # slice from the end of the student list.
    if self.seq_length < 1:
      raise ValueError(f"seq_length must be >= 1, got {self.seq_length}")
    if self.sliding_window_step < 1:
      raise ValueError(f"sliding_window_step must be >= 1, got {self.sliding_window_step}")
    if self.max_students < 0:
      raise ValueError(f"max_students must be >= 0, got {self.max_students}")

class SequenceGenerator:
  """Turns a per-interaction student log into fixed-length training windows.

  The log is expected to provide the columns `studentId`, `action_num`,
  `skill` and `correct`. Every interaction is encoded as a single integer
  (see `encode_interaction`) and each student's history is sliced into
  windows of `config.seq_length` interactions.
  """

  def __init__(self, config: "SequenceConfig", skill_to_id: Dict):
    """Store the configuration and an optional pre-built skill mapping.

    Args:
      config: object exposing `seq_length`, `sliding_window_step` and
        `max_students`.
      skill_to_id: existing skill-name -> integer-id mapping; pass `{}` to
        start empty. The mapping is copied so the caller's dict is never
        mutated. `load_and_process` replaces it with a freshly built one.
    """
    self.config = config
    # The argument used to be ignored; honour it, but work on a private copy.
    self.skill_to_id = dict(skill_to_id) if skill_to_id else {}

  def load_and_process(self, file_path: str) -> Tuple[pd.DataFrame, int]:
    """Read the CSV, order it chronologically per student and build the skill map.

    Args:
      file_path: location of the CSV file.

    Returns:
      `(data, num_skills)`: the filtered, sorted frame and the number of
      distinct skills.

    NOTE: `num_skills` is counted over the whole file, before the student
    filter is applied, so it can be larger than `len(self.skill_to_id)`.
    """
    data = pd.read_csv(file_path)

    num_skills = data['skill'].nunique()

    # sort_values returns a new frame; the result must be kept, otherwise the
    # windows are built from whatever row order the file happens to have.
    data = data.sort_values(by=['studentId', 'action_num'])

    # After sorting, the first `max_students` unique ids are the smallest ids.
    selected_students = data['studentId'].unique()[:self.config.max_students]
    data = data[data['studentId'].isin(selected_students)]

    self.skill_to_id = self.skill_map(data)

    return data, num_skills

  def skill_map(self, data: pd.DataFrame) -> Dict[str, int]:
    """Assign consecutive ids (0, 1, 2, ...) to skills in order of first appearance."""
    return {skill: idx for idx, skill in enumerate(data['skill'].unique())}

  def encode_interaction(self, skill: int, correctness: int) -> int:
    """Encode a (skill id, correctness) pair as the single integer `2 * skill + correctness`."""
    return 2 * skill + correctness

  def generate_label(self, num_skills: int, skill: int, correctness: int) -> np.ndarray:
    """Build a length-`num_skills` vector that is zero except for `correctness` at index `skill`.

    Returns:
      A float NumPy array (not a Python list).
    """
    label = np.zeros(num_skills)
    label[skill] = correctness
    return label

  def prepare_student_sequences(self, student_data: pd.DataFrame, num_skills: int) -> Tuple[List, List]:
    """Slice one student's interactions into encoded windows and matching labels.

    Args:
      student_data: rows of a single student, already in the desired order.
      num_skills: accepted for interface compatibility; not used here.

    Returns:
      `(sequences, labels)`: each sequence is a list of `seq_length` encoded
      interactions; the label at the same position is the list of `correct`
      values of that same window.

    Raises:
      KeyError: if a skill inside a window is missing from `self.skill_to_id`.
    """
    seq_length = self.config.seq_length

    if len(student_data) < seq_length:
      return [], []

    # NOTE: the range stops one position before the last full window, exactly
    # as before, so the number of generated sequences is unchanged.
    starts = range(0, len(student_data) - seq_length, self.config.sliding_window_step)
    if not starts:
      return [], []

    # Encode every row that takes part in some window exactly once instead of
    # re-encoding it for each window it appears in.
    used_rows = starts[-1] + seq_length
    skills = student_data['skill'].tolist()[:used_rows]
    correct = student_data['correct'].tolist()[:used_rows]
    encoded = [
      self.encode_interaction(self.skill_to_id[skill], answer)
      for skill, answer in zip(skills, correct)
    ]

    sequences = [encoded[start:start + seq_length] for start in starts]
    labels = [correct[start:start + seq_length] for start in starts]

    return sequences, labels

  def prepare_sequences(self, df: pd.DataFrame, num_skills: int) -> Tuple[List, List]:
    """Concatenate the windows of every student found in `df`.

    Students are processed in order of first appearance in `df`.

    Returns:
      `(all_sequences, all_labels)` as flat lists over all students.
    """
    all_sequences = []
    all_labels = []

    # One pass over the frame instead of one boolean filter per student.
    for _, student_data in df.groupby('studentId', sort=False):
      student_seq, student_lab = self.prepare_student_sequences(student_data, num_skills)

      all_sequences.extend(student_seq)
      all_labels.extend(student_lab)

    return all_sequences, all_labels

# Windows of 15 interactions, a stride of 1, and at most 100 students.
gen = SequenceGenerator(
    SequenceConfig(seq_length=15, sliding_window_step=1, max_students=100),
    {},
)

df, num_skills = gen.load_and_process(
    '/content/drive/MyDrive/DeepKT/assistments_2017.csv'
)

# Evaluated but not displayed: a notebook only renders a cell's last expression.
df.head(100)

seq, lab = gen.prepare_sequences(df, num_skills)


  data = pd.read_csv(file_path)


Checkpoint 0       8
1       8
2       8
3       8
4       8
       ..
1051    8
1052    8
1053    8
1054    8
1055    8
Name: studentId, Length: 1056, dtype: int64
Checkpoint 1056    35
1057    35
1058    35
1059    35
1060    35
        ..
2044    35
2045    35
2046    35
2047    35
2048    35
Name: studentId, Length: 993, dtype: int64
Checkpoint 2049    39
2050    39
2051    39
2052    39
2053    39
        ..
2462    39
2463    39
2464    39
2465    39
2466    39
Name: studentId, Length: 418, dtype: int64
Checkpoint 2467    64
2468    64
2469    64
2470    64
2471    64
        ..
3881    64
3882    64
3883    64
3884    64
3885    64
Name: studentId, Length: 1419, dtype: int64
Checkpoint 3886    77
3887    77
3888    77
3889    77
3890    77
        ..
4198    77
4199    77
4200    77
4201    77
4202    77
Name: studentId, Length: 317, dtype: int64
Checkpoint 4203    126
4204    126
4205    126
4206    126
4207    126
       ... 
4604    126
4605    126
4606    126
4607    126
460

# **Save and Load Functions**

In [None]:
import pickle
import json
import os

def save_preprocessed_data(sequences, labels, skill_to_id, config, save_dir='/content/drive/MyDrive/DeepKT/preprocessed_data'):
    """Save preprocessed data to Google Drive (or to any local directory).

    Writes three files into `save_dir`: `sequences.npy`, `labels.npy` and
    `metadata.json` (skill mapping, configuration and basic dataset stats).

    Args:
        sequences: list or NumPy array of encoded windows.
        labels: list or NumPy array of label rows, one per window.
        skill_to_id: skill-name -> id mapping; must be JSON serialisable.
        config: object exposing `seq_length`, `sliding_window_step` and
            `max_students` (stored under the key `num_students`).
        save_dir: target directory; created if it does not exist.
    """
    drive_root = '/content/drive'
    target = os.path.abspath(save_dir)
    lives_on_drive = target == drive_root or target.startswith(drive_root + os.sep)

    # Mount Google Drive only when the target is on Drive and it is not
    # mounted yet, so the function also works for local paths outside Colab.
    if lives_on_drive and not os.path.exists(drive_root):
        from google.colab import drive  # local import: this cell never imported it
        drive.mount(drive_root)

    os.makedirs(save_dir, exist_ok=True)

    sequences_path = os.path.join(save_dir, 'sequences.npy')
    labels_path = os.path.join(save_dir, 'labels.npy')
    metadata_path = os.path.join(save_dir, 'metadata.json')

    # Save sequences and labels
    np.save(sequences_path, np.array(sequences))
    np.save(labels_path, np.array(labels))

    # len() instead of truthiness: `if sequences` raises for NumPy arrays.
    num_sequences = len(sequences)

    # Save skill mapping and configuration
    metadata = {
        'skill_to_id': skill_to_id,
        'config': {
            'seq_length': config.seq_length,
            'sliding_window_step': config.sliding_window_step,
            'num_students': config.max_students
        },
        'dataset_stats': {
            'num_sequences': num_sequences,
            'sequence_length': len(sequences[0]) if num_sequences else 0,
            'num_skills': len(skill_to_id)
        }
    }

    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"Data saved successfully to {save_dir}")
    print("Files saved:")
    print(f"- sequences.npy: {os.path.getsize(sequences_path)/1024/1024:.2f} MB")
    print(f"- labels.npy: {os.path.getsize(labels_path)/1024/1024:.2f} MB")
    print(f"- metadata.json: {os.path.getsize(metadata_path)/1024:.2f} KB")

# # Save Preprocessed Data:
# save_preprocessed_data(seq, lab, gen.skill_to_id, gen.config)

# **If the data has already been preprocessed, run the imports below and start here**

In [None]:
import pickle
import json
import os
from google.colab import drive
import numpy as np

def load_preprocessed_data(load_dir='/content/drive/MyDrive/DeepKT/preprocessed_data'):
    """Load preprocessed data from Google Drive (or from any local directory).

    Reads `sequences.npy`, `labels.npy` and `metadata.json` from `load_dir`
    and prints a short summary taken from the metadata.

    Args:
        load_dir: directory containing the three files.

    Returns:
        `(sequences, labels, metadata)`: two NumPy arrays and the decoded
        metadata dictionary.
    """
    drive_root = '/content/drive'
    target = os.path.abspath(load_dir)
    lives_on_drive = target == drive_root or target.startswith(drive_root + os.sep)

    # Mount Google Drive only when the data lives on Drive and it is not
    # mounted yet, so local directories can be loaded outside Colab as well.
    if lives_on_drive and not os.path.exists(drive_root):
        from google.colab import drive
        drive.mount(drive_root)

    # Load sequences and labels
    sequences = np.load(os.path.join(load_dir, 'sequences.npy'))
    labels = np.load(os.path.join(load_dir, 'labels.npy'))

    # Load metadata
    with open(os.path.join(load_dir, 'metadata.json'), 'r') as f:
        metadata = json.load(f)

    stats = metadata['dataset_stats']
    print("Data loaded successfully")
    print(f"Loaded {stats['num_sequences']} sequences")
    print(f"Sequence length: {stats['sequence_length']}")
    print(f"Number of skills: {stats['num_skills']}")

    return sequences, labels, metadata

# Restore the arrays and metadata written by the preprocessing step.
sequences, labels, metadata = load_preprocessed_data()

# Show the first 50 rows of each array.
for _preview in (sequences, labels):
    print(_preview[:50])

Mounted at /content/drive
Data loaded successfully
Loaded 80887 sequences
Sequence length: 15
Number of skills: 89
[[ 0  1  2  2  3  2  3  4  6  6  7  7  4  5  4]
 [ 1  2  2  3  2  3  4  6  6  7  7  4  5  4  5]
 [ 2  2  3  2  3  4  6  6  7  7  4  5  4  5  9]
 [ 2  3  2  3  4  6  6  7  7  4  5  4  5  9 10]
 [ 3  2  3  4  6  6  7  7  4  5  4  5  9 10 10]
 [ 2  3  4  6  6  7  7  4  5  4  5  9 10 10 11]
 [ 3  4  6  6  7  7  4  5  4  5  9 10 10 11 13]
 [ 4  6  6  7  7  4  5  4  5  9 10 10 11 13 13]
 [ 6  6  7  7  4  5  4  5  9 10 10 11 13 13 10]
 [ 6  7  7  4  5  4  5  9 10 10 11 13 13 10 11]
 [ 7  7  4  5  4  5  9 10 10 11 13 13 10 11 13]
 [ 7  4  5  4  5  9 10 10 11 13 13 10 11 13 10]
 [ 4  5  4  5  9 10 10 11 13 13 10 11 13 10 10]
 [ 5  4  5  9 10 10 11 13 13 10 11 13 10 10 11]
 [ 4  5  9 10 10 11 13 13 10 11 13 10 10 11 11]
 [ 5  9 10 10 11 13 13 10 11 13 10 10 11 11 14]
 [ 9 10 10 11 13 13 10 11 13 10 10 11 11 14 14]
 [10 10 11 13 13 10 11 13 10 10 11 11 14 14 15]
 [10 11 13 13 10 11 1

# Data Transformation

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

def prepare_data(sequences, labels, batch_size = 64, train_ratio = 0.7, val_ratio = 0.15):
  """Split the windows into train/validation/test sets and wrap them as batched tf.data datasets.

  Args:
    sequences: array-like of encoded windows; converted to int32.
    labels: array-like of label rows aligned with `sequences`; converted to float32.
    batch_size: number of examples per batch.
    train_ratio: fraction of all examples used for training.
    val_ratio: fraction of all examples used for validation; the remainder
      (1 - train_ratio - val_ratio) becomes the test set.

  Returns:
    `(train_dataset, val_dataset, test_dataset)`. Only the training dataset
    is shuffled and prefetched.

  Raises:
    ValueError: if the ratios are not in (0, 1) or leave no data for the test set.
  """
  if not (0 < train_ratio < 1 and 0 < val_ratio < 1 and train_ratio + val_ratio < 1):
    raise ValueError(
      "train_ratio and val_ratio must each be in (0, 1) and sum to less than 1, "
      f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
    )

  sequences = np.asarray(sequences, dtype=np.int32)
  # Cast the labels themselves; casting `sequences` here would make the
  # targets a copy of the inputs.
  labels = np.asarray(labels, dtype=np.float32)

  train_sequences, temp_sequences, train_labels, temp_labels = train_test_split(sequences, labels, train_size=train_ratio, random_state=42)

  # Express the validation share relative to what is left after the train split.
  val_ratio_adjusted = val_ratio / (1 - train_ratio)

  val_sequences, test_sequences, val_labels, test_labels = train_test_split(temp_sequences, temp_labels, train_size=val_ratio_adjusted, random_state=42)

  def create_dataset(sequences, labels, batch_size, training=False):
    dataset = tf.data.Dataset.from_tensor_slices((sequences, labels))

    if training:
      dataset = dataset.shuffle(len(sequences)) # Buffer covers the whole split

    dataset = dataset.batch(batch_size)

    if training:
      dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) # Overlap input preparation with training

    return dataset

  # training=True enables the shuffle/prefetch branch, which was never reached before.
  train_dataset = create_dataset(train_sequences, train_labels, batch_size, training=True)
  val_dataset = create_dataset(val_sequences, val_labels, batch_size)
  test_dataset = create_dataset(test_sequences, test_labels, batch_size)

  return train_dataset, val_dataset, test_dataset

# Build the batched train/validation/test datasets with prepare_data's default batch size and split ratios.
train_dataset, val_dataset, test_dataset = prepare_data(sequences, labels)

def inspect_dataset(dataset, name="Dataset"):
    """Print shape, dtype and the first example of the first batch taken from *dataset*.

    `dataset` must provide `take(1)` yielding `(sequences, labels)` pairs.
    """
    for batch_inputs, batch_targets in dataset.take(1):
        summary_lines = [
            f"\n{name} inspection:",
            f"Sequences shape: {batch_inputs.shape}",
            f"Labels shape: {batch_targets.shape}",
            f"Sequences dtype: {batch_inputs.dtype}",
            f"Labels dtype: {batch_targets.dtype}",
            "\nSample sequence (first in batch):",
        ]
        print("\n".join(summary_lines))
        # Passed as print arguments (not f-strings) so the values are rendered via str().
        print("Encoded interactions:", batch_inputs[0])
        print("Correctness labels:", batch_targets[0])

# Print one batch summary per split, in train / validation / test order.
for split_name, split_dataset in (
    ("Training", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    inspect_dataset(split_dataset, split_name)


Training inspection:
Sequences shape: (64, 15)
Labels shape: (64, 15)
Sequences dtype: <dtype: 'int32'>
Labels dtype: <dtype: 'float32'>

Sample sequence (first in batch):
Encoded interactions: tf.Tensor([ 10  11  10  10  10  11 142 142 143 104 120 120 121 120 120], shape=(15,), dtype=int32)
Correctness labels: tf.Tensor(
[ 10.  11.  10.  10.  10.  11. 142. 142. 143. 104. 120. 120. 121. 120.
 120.], shape=(15,), dtype=float32)

Validation inspection:
Sequences shape: (64, 15)
Labels shape: (64, 15)
Sequences dtype: <dtype: 'int32'>
Labels dtype: <dtype: 'float32'>

Sample sequence (first in batch):
Encoded interactions: tf.Tensor([132 132 132 133 132 132 132 132 132 133 127 126 126 126 126], shape=(15,), dtype=int32)
Correctness labels: tf.Tensor(
[132. 132. 132. 133. 132. 132. 132. 132. 132. 133. 127. 126. 126. 126.
 126.], shape=(15,), dtype=float32)

Test inspection:
Sequences shape: (64, 15)
Labels shape: (64, 15)
Sequences dtype: <dtype: 'int32'>
Labels dtype: <dtype: 'float32'>


# Transformer Implementation

In [None]:
# tensorflow sublibraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, MultiHeadAttention, LayerNormalization, Embedding
from tensorflow.keras.metrics import AUC
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

class TransformerBlock:

class
