In [1]:
# Load standard imports for the rest of the notebook.
import seaborn as sns
import pandas as pd
import numpy as np
import scipy as sc
import torch
from pytorch_utils import SkillDataSet, pad_collate, LSTM, get_device

# In this demo, we use a lot of SciKit-Learn functions, as imported below.
from sklearn import feature_extraction, model_selection
from sklearn.metrics import mean_squared_error, roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.preprocessing import MinMaxScaler

# Directory holding the input data files, relative to the notebook's working directory.
# The trailing slash matters: file paths are built by plain string concatenation.
DATA_DIR = "./../../data/"

In [2]:
def create_iterator(data, train_size=0.8, test_size=0.2, random_state=0, group_col='user_id'):
    '''
    Create an iterator to split interactions in data into train and test, with the same student never appearing in both splits.
    :param data:            Dataframe with students' interactions; must contain the column named by group_col.
    :param train_size:      Proportion of groups (students) assigned to the train split.
    :param test_size:       Proportion of groups (students) assigned to the test split.
    :param random_state:    Seed of the shuffle, so that the split is reproducible.
    :param group_col:       Column identifying the group (student) each interaction belongs to.
    :return:                An iterator yielding a single (train_index, test_index) pair of positional row indexes
                            (use them with .iloc, not .loc).
    '''
    # Passing an array of row positions works just as well as passing the raw data matrix
    X = np.arange(len(data.index))
    # Interactions are grouped by student so that one student's rows all land in the same split
    groups = data[group_col].values
    splitter = model_selection.GroupShuffleSplit(
        n_splits=1,
        train_size=train_size,
        test_size=test_size,
        random_state=random_state,
    )
    return splitter.split(X, groups=groups)

In [3]:
# Read the interaction log and discard every row that has a missing value.
csv_path = DATA_DIR + 'assistments.csv'
data = pd.read_csv(csv_path, low_memory=False)
data = data.dropna()
data.head()

Unnamed: 0,user_id,order_id,skill_name,correct
0,64525,33022537,Box and Whisker,1
1,64525,33022709,Box and Whisker,1
2,70363,35450204,Box and Whisker,0
3,70363,35450295,Box and Whisker,1
4,70363,35450311,Box and Whisker,0


In [4]:
# Count distinct students and skills (the frame has no missing values after dropna()).
student_count = data['user_id'].nunique()
skill_count = data['skill_name'].nunique()
print("Number of unique students in the dataset:", student_count)
print("Number of unique skills in the dataset:", skill_count)

Number of unique students in the dataset: 4151
Number of unique skills in the dataset: 110


In [5]:
# Map every skill name to an integer id; sort=True numbers the ids in the sorted order of the unique names.
skill_ids, skill_codes = pd.factorize(data['skill_name'], sort=True)
data['skill'] = skill_ids
# Synthetic feature crossing skill id and answer: 2 * skill for correct == 0, 2 * skill + 1 for correct == 1.
data['skill_with_answer'] = 2 * data['skill'] + data['correct']

In [6]:
# Preview the first rows to check the newly added 'skill' and 'skill_with_answer' columns.
data.head()

Unnamed: 0,user_id,order_id,skill_name,correct,skill,skill_with_answer
0,64525,33022537,Box and Whisker,1,15,31
1,64525,33022709,Box and Whisker,1,15,31
2,70363,35450204,Box and Whisker,0,15,30
3,70363,35450295,Box and Whisker,1,15,31
4,70363,35450311,Box and Whisker,0,15,30


In [7]:
# Two-stage split. create_iterator groups rows by user_id, so a student's interactions
# never end up in more than one of the resulting sets.

# Stage 1: hold out the test students.
train_index, test_index = next(create_iterator(data))
test_data = data.iloc[test_index]
train_val_data = data.iloc[train_index]

# Stage 2: carve the validation students out of the remaining ones.
# The indexes are positions within train_val_data, hence .iloc on that frame.
train_val_index, val_index = next(create_iterator(train_val_data))
val_data = train_val_data.iloc[val_index]
train_data = train_val_data.iloc[train_val_index]

In [8]:
# Preview the training split; the row labels are the ones inherited from `data`, so gaps are expected.
train_data.head()

Unnamed: 0,user_id,order_id,skill_name,correct,skill,skill_with_answer
0,64525,33022537,Box and Whisker,1,15,31
1,64525,33022709,Box and Whisker,1,15,31
8,70677,33140811,Box and Whisker,1,15,31
9,70677,33140919,Box and Whisker,1,15,31
10,70695,33275447,Box and Whisker,1,15,31


In [9]:
from torch.utils.data import Dataset, DataLoader

# Sizes come from the fitted skill vocabulary instead of hard-coded literals
# (110 skills -> 220 skill/answer combinations for this dataset).
# NOTE(review): assumes SkillDataSet takes (frame, number of skill/answer combinations,
# number of skills) -- confirm against pytorch_utils.
num_skills = len(skill_codes)
num_skill_answer_pairs = 2 * num_skills
batch_size = 32

train_dataset = SkillDataSet(train_data, num_skill_answer_pairs, num_skills)
val_dataset = SkillDataSet(val_data, num_skill_answer_pairs, num_skills)
test_dataset = SkillDataSet(test_data, num_skill_answer_pairs, num_skills)

# Only the training batches are shuffled. Shuffling the evaluation sets brings no benefit
# and makes the order of their batches differ from run to run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=6, collate_fn=pad_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=6, collate_fn=pad_collate)

In [10]:
# Seed PyTorch before building the model so that weight initialisation (and the
# shuffling of training batches that follows) is repeatable, like the seeded data split.
torch.manual_seed(0)

# Input size is one slot per skill/answer combination, output size one slot per skill
# (220 and 110 for this dataset); 64 is the middle size argument passed to LSTM.
# NOTE(review): argument meaning assumed to be (input, hidden, output) -- confirm against pytorch_utils.
num_skills = len(skill_codes)
device = get_device()
model = LSTM(2 * num_skills, 64, num_skills).to(device)
learning_rate = 0.0001

# Binary cross-entropy on probabilities: the model output is fed to BCELoss as-is.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Count only the parameters that will actually be updated by the optimizer.
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 80,366 trainable parameters


In [11]:
from tqdm.notebook import tqdm as visual_progress

num_epochs = 50
device = get_device()
# len() of a DataLoader is the real number of batches (the last, smaller batch included),
# so the progress bar total is exact and does not repeat the batch size literal.
train_len = len(train_loader)

for epoch in range(num_epochs):
    model.train()
    # Sum of the per-batch mean losses over the epoch.
    epoch_loss = 0
    for inputs, targets in visual_progress(train_loader,
                                           total=train_len,
                                           leave=False,
                                           desc="Batch progress"):
        inputs, targets = inputs.to(device), targets.to(device)

        # Next-step prediction: the input at step t is scored against the target at step t + 1.
        inputs = inputs[:, :-1, :]
        targets = targets[:, 1:, :]

        # The last channel of the target is the label; the remaining channels identify the skill.
        # NOTE(review): skill is presumably a one-hot vector -- confirm against SkillDataSet.
        label = targets[:, :, -1]
        skill = targets[:, :, :-1]

        # Padded steps carry the label -1 and must not contribute to the loss.
        mask = label != -1  # shape: batch_size x time_steps
        if not mask.any():
            # Nothing to learn from: BCELoss on an empty selection would yield NaN
            # and poison the running epoch loss.
            continue

        optimizer.zero_grad()
        outputs = model(inputs)
        # Keep, for every step, only the prediction of the skill that was actually answered.
        outputs = (outputs * skill).sum(dim=2)

        loss = criterion(outputs[mask], label[mask])
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Epochs are reported 1-based so that the last line reads [num_epochs/num_epochs].
    print(f'Epoch [{epoch + 1}/{num_epochs}] | Train Loss: {epoch_loss:.5f}')

In [None]:
    # NOTE(review): disabled draft of a validation pass. Its indentation suggests it was meant
    # to run inside the epoch loop, after the training batches. Issues to resolve before
    # enabling it:
    #   - the padding test below checks channel 0 of the target, while the mask is built from
    #     the last channel (-1); the two should agree;
    #   - the inner loops reuse the names i and j, and j is also the batch counter of the
    #     outer loop;
    #   - val_loss is overwritten by every batch instead of being accumulated;
    #   - current_skill is computed but never used;
    #   - the tensors built for the loss do not need requires_grad=True under torch.no_grad().
    # find the accuracy on the validation set
    # model.eval()
    # val_loss = 0
    # with torch.no_grad():
    #     for j, (val_inputs, val_targets) in enumerate(val_loader):
    #         val_inputs, val_targets = val_inputs.to(get_device()), val_targets.to(get_device())
    #         val_inputs = val_inputs[:, :-1, :]
    #         val_targets = val_targets[:, 1:, :]
    #         val_outputs = model(val_inputs)
    #         val_mask = (val_targets[:, :, -1] != -1)
    #         val_label = val_targets[:, :, -1]
    #         val_skill = val_targets[:, :, :-1]
    #         val_outputs = val_outputs * val_skill
    #         val_predicted_labels = []
    #         val_actual_labels = []
    #         for i, batch in enumerate(val_outputs):
    #             for j, time_step in enumerate(batch):
    #                 if val_targets[i, j, 0] == -1: continue
    #                 val_actual_label = val_label[i, j]
    #                 current_skill = torch.argmax(val_skill[i, j]).item()
    #                 val_predicted_labels.append(time_step.sum())
    #                 val_actual_labels.append(val_actual_label)
    #         val_predicted_labels = torch.tensor(val_predicted_labels, requires_grad=True)
    #         val_actual_labels = torch.tensor(val_actual_labels, requires_grad=True)
    #         val_loss = criterion(val_predicted_labels, val_actual_labels)