In [139]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.vocab import vocab
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from collections import Counter

from zipfile import ZipFile
from urllib.request import urlretrieve

import pandas as pd
import numpy as np

import time

# 1. Data Preprocessing
In this section, we'll start by loading the problemLens dataset. We will then construct vocabularies for problem IDs and user IDs, and create sequences of user interactions. These steps lay the groundwork for our recommendation model, converting the data into a format that our model can utilize effectively.
## 1.1 Loading Dataset
At first we will download our dataset to generate our sequences and vocabularies. Then user_id and problem_id values are processesed to fix their data types.

In [140]:
# urlretrieve("http://files.grouplens.org/datasets/problemlens/ml-1m.zip", "problemlens.zip")
# ZipFile("problemlens.zip", "r").extractall()

In [141]:
interactions = pd.read_csv("/kaggle/input/cm-to-m/user_problem.csv")

df_user_features = pd.read_csv("/kaggle/input/cm-to-m/user_tags.csv")
df_user_tags = pd.read_csv("/kaggle/input/cm-to-m/user_ratings.csv")


In [142]:
interactions

Unnamed: 0,user_handle,problem_id,timestamp,problem_rating,problem_tags
0,maspy,1538:D,1626961617,1700.0,"['constructive algorithms', 'math', 'number th..."
1,maspy,1538:C,1626959972,1300.0,"['binary search', 'data structures', 'math', '..."
2,maspy,1538:B,1626959819,800.0,"['greedy', 'math']"
3,maspy,1538:A,1626959622,800.0,"['brute force', 'dp', 'greedy']"
4,maspy,1520:G,1626959357,2200.0,"['brute force', 'dfs and similar', 'graphs', '..."
...,...,...,...,...,...
27172,celestialcoder,1360:A,1594058431,800.0,"['greedy', 'math']"
27173,celestialcoder,1284:A,1594058264,800.0,"['implementation', 'strings']"
27174,celestialcoder,1375:G,1593962149,2800.0,"['brute force', 'constructive algorithms', 'df..."
27175,celestialcoder,1375:F,1593898420,2600.0,"['constructive algorithms', 'games', 'interact..."


In [143]:
# Drop one or more columns from the DataFrame
interactions.drop(columns=['problem_rating', 'problem_tags'], inplace=True)
interactions

Unnamed: 0,user_handle,problem_id,timestamp
0,maspy,1538:D,1626961617
1,maspy,1538:C,1626959972
2,maspy,1538:B,1626959819
3,maspy,1538:A,1626959622
4,maspy,1520:G,1626959357
...,...,...,...
27172,celestialcoder,1360:A,1594058431
27173,celestialcoder,1284:A,1594058264
27174,celestialcoder,1375:G,1593962149
27175,celestialcoder,1375:F,1593898420


In [144]:
df_user_features.head()

Unnamed: 0,*special,0user_handle,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,data structures,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,,maspy,,32.0,11.0,56,,6.0,45,34,...,15.0,,,12.0,55,,24.0,,18.0,21.0
1,,wsyear,,25.0,15.0,47,,18.0,61,46,...,17.0,3.0,,3.0,24,,11.0,2.0,24.0,15.0
2,,LXH-cat,3.0,51.0,23.0,84,,51.0,118,94,...,43.0,18.0,,13.0,54,4.0,24.0,4.0,46.0,29.0
3,1.0,skittles1412,,3.0,1.0,5,,,8,1,...,,,,,4,,8.0,,1.0,1.0
4,,PurpleCrayon,4.0,30.0,28.0,77,,18.0,101,68,...,36.0,4.0,1.0,9.0,60,2.0,41.0,3.0,38.0,26.0


In [145]:
# Replace all NaNs in the DataFrame with zeroes
df_user_features.fillna(0, inplace=True)
df_user_features.head()

Unnamed: 0,*special,0user_handle,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,data structures,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,0.0,maspy,0.0,32.0,11.0,56,0.0,6.0,45,34,...,15.0,0.0,0.0,12.0,55,0.0,24.0,0.0,18.0,21.0
1,0.0,wsyear,0.0,25.0,15.0,47,0.0,18.0,61,46,...,17.0,3.0,0.0,3.0,24,0.0,11.0,2.0,24.0,15.0
2,0.0,LXH-cat,3.0,51.0,23.0,84,0.0,51.0,118,94,...,43.0,18.0,0.0,13.0,54,4.0,24.0,4.0,46.0,29.0
3,1.0,skittles1412,0.0,3.0,1.0,5,0.0,0.0,8,1,...,0.0,0.0,0.0,0.0,4,0.0,8.0,0.0,1.0,1.0
4,0.0,PurpleCrayon,4.0,30.0,28.0,77,0.0,18.0,101,68,...,36.0,4.0,1.0,9.0,60,2.0,41.0,3.0,38.0,26.0


In [146]:
# Rename a column in the DataFrame
df_user_features.rename(columns={'0user_handle': 'user_id'}, inplace=True)
interactions.rename(columns={'user_handle': 'user_id'}, inplace=True)
df_user_features.head()

Unnamed: 0,*special,user_id,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,data structures,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,0.0,maspy,0.0,32.0,11.0,56,0.0,6.0,45,34,...,15.0,0.0,0.0,12.0,55,0.0,24.0,0.0,18.0,21.0
1,0.0,wsyear,0.0,25.0,15.0,47,0.0,18.0,61,46,...,17.0,3.0,0.0,3.0,24,0.0,11.0,2.0,24.0,15.0
2,0.0,LXH-cat,3.0,51.0,23.0,84,0.0,51.0,118,94,...,43.0,18.0,0.0,13.0,54,4.0,24.0,4.0,46.0,29.0
3,1.0,skittles1412,0.0,3.0,1.0,5,0.0,0.0,8,1,...,0.0,0.0,0.0,0.0,4,0.0,8.0,0.0,1.0,1.0
4,0.0,PurpleCrayon,4.0,30.0,28.0,77,0.0,18.0,101,68,...,36.0,4.0,1.0,9.0,60,2.0,41.0,3.0,38.0,26.0


In [147]:
print(df_user_features.shape)
print(df_user_features["user_id"].nunique())
print(interactions["user_id"].nunique())

(185, 38)
185
185


In [148]:
# interactions = pd.merge(interactions, df_user_features, on="user_id")
# print(interactions["user_id"].nunique())
# print(interactions.head())


In [149]:
# Preventing ids to be written as integer or float data type

interactions["problem_id"] = interactions["problem_id"].apply(lambda x: f"problem_{x}")
interactions["user_id"] = interactions["user_id"].apply(lambda x: f"user_{x}")
df_user_features["user_id"] = df_user_features["user_id"].apply(lambda x: f"user_{x}")

In [150]:
interactions

Unnamed: 0,user_id,problem_id,timestamp
0,user_maspy,problem_1538:D,1626961617
1,user_maspy,problem_1538:C,1626959972
2,user_maspy,problem_1538:B,1626959819
3,user_maspy,problem_1538:A,1626959622
4,user_maspy,problem_1520:G,1626959357
...,...,...,...
27172,user_celestialcoder,problem_1360:A,1594058431
27173,user_celestialcoder,problem_1284:A,1594058264
27174,user_celestialcoder,problem_1375:G,1593962149
27175,user_celestialcoder,problem_1375:F,1593898420


In [151]:
print(interactions.shape)

(27177, 3)


In [152]:
df_user_features

Unnamed: 0,*special,user_id,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,data structures,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,0.0,user_maspy,0.0,32.0,11.0,56,0.0,6.0,45,34,...,15.0,0.0,0.0,12.0,55,0.0,24.0,0.0,18.0,21.0
1,0.0,user_wsyear,0.0,25.0,15.0,47,0.0,18.0,61,46,...,17.0,3.0,0.0,3.0,24,0.0,11.0,2.0,24.0,15.0
2,0.0,user_LXH-cat,3.0,51.0,23.0,84,0.0,51.0,118,94,...,43.0,18.0,0.0,13.0,54,4.0,24.0,4.0,46.0,29.0
3,1.0,user_skittles1412,0.0,3.0,1.0,5,0.0,0.0,8,1,...,0.0,0.0,0.0,0.0,4,0.0,8.0,0.0,1.0,1.0
4,0.0,user_PurpleCrayon,4.0,30.0,28.0,77,0.0,18.0,101,68,...,36.0,4.0,1.0,9.0,60,2.0,41.0,3.0,38.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,0.0,user_Osmabnlden,0.0,28.0,15.0,51,2.0,25.0,49,29,...,30.0,2.0,0.0,2.0,30,0.0,26.0,2.0,12.0,18.0
181,4.0,user_Carmel_Ab1,1.0,100.0,49.0,226,2.0,40.0,181,104,...,99.0,6.0,0.0,15.0,158,3.0,106.0,6.0,28.0,60.0
182,26.0,user_valavshonok,0.0,42.0,23.0,106,0.0,29.0,86,40,...,39.0,8.0,5.0,10.0,72,0.0,69.0,1.0,10.0,30.0
183,0.0,user_Erinyes,2.0,33.0,20.0,54,2.0,8.0,77,26,...,18.0,2.0,1.0,9.0,42,0.0,11.0,4.0,14.0,21.0


## 1.2 Creating Vocabulary
Now that we have our data ready, it's time to prepare our vocabularies for user IDs and problem IDs. This step will convert the unique IDs into numerical indices that our model can use. The following code snippet accomplishes this task.

In [153]:
np.random.seed(42)
# Generating a list of unique problem ids
problem_ids = interactions.problem_id.unique()

# Counter is used to feed problems to movive_vocab
problem_counter = Counter(problem_ids)

# Genarting vocabulary
problem_vocab = vocab(problem_counter, specials=['<unk>'])

# For indexing input ids
problem_vocab_stoi = problem_vocab.get_stoi()

# problem to title mapping dictionary
# problem_title_dict = dict(zip(problems.problem_id, problems.title))

# Similarly generating a vocabulary for user ids
user_ids = interactions.user_id.unique()
user_counter = Counter(user_ids)
user_vocab = vocab(user_counter, specials=['<unk>'])
user_vocab_stoi = user_vocab.get_stoi()

## 1.3 Generating Sequences
All interactions of users are first sorted by their interaction timestamp and then divided into sub sequences to train our model.

In [154]:
# Group ratings by user_id in order of increasing unix_timestamp.
ratings_group = interactions.sort_values(by=["timestamp"]).groupby("user_id")

interactions_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "problem_ids": list(ratings_group.problem_id.apply(list)),
        "timestamps": list(ratings_group.timestamp.apply(list)),
    }
)

In [155]:
interactions_data

Unnamed: 0,user_id,problem_ids,timestamps
0,user_21cs01033,"[problem_1719:C, problem_1715:B, problem_1624:...","[1677832155, 1678090610, 1678288460, 167834644..."
1,user_2497201210,"[problem_1921:F, problem_1921:G, problem_1837:...","[1705376753, 1705386035, 1705460611, 170550065..."
2,user_36champ,"[problem_1686:A, problem_1686:B, problem_1686:...","[1657093017, 1657093549, 1657094326, 165761486..."
3,user_874641984,"[problem_1914:G1, problem_1914:G2, problem_827...","[1703042317, 1703140815, 1703165485, 170331179..."
4,user_A_cat_with_a_hat,"[problem_1579:D, problem_1829:G, problem_1833:...","[1684523408, 1684678075, 1684956695, 168683971..."
...,...,...,...
180,user_yuanyuxuan,"[problem_1393:A, problem_1393:B, problem_1393:...","[1596860896, 1596860923, 1596860954, 159686096..."
181,user_zeemanz,"[problem_1225:D, problem_1208:D, problem_1648:...","[1698036624, 1698049348, 1698063638, 169807262..."
182,user_zhenghanyun,"[problem_1806:C, problem_1561:D1, problem_665:...","[1679204602, 1679207180, 1679216742, 167928868..."
183,user_zjjws,"[problem_1466:A, problem_1466:B, problem_1466:...","[1609565701, 1609565720, 1609565752, 160956576..."


In [156]:
# Sequence length, min history count and window slide size
sequence_length = 20
min_history = 1
step_size = 2

# Creating sequences from lists with sliding window
def create_sequences(values, window_size, step_size, min_history):
  sequences = []
  start_index = 0
  while len(values[start_index:]) > min_history:
    seq = values[start_index : start_index + window_size]
    sequences.append(seq)
    start_index += step_size
  return sequences

interactions_data.problem_ids = interactions_data.problem_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size, min_history)
)


del interactions_data["timestamps"]

In [157]:
interactions_data

Unnamed: 0,user_id,problem_ids
0,user_21cs01033,"[[problem_1719:C, problem_1715:B, problem_1624..."
1,user_2497201210,"[[problem_1921:F, problem_1921:G, problem_1837..."
2,user_36champ,"[[problem_1686:A, problem_1686:B, problem_1686..."
3,user_874641984,"[[problem_1914:G1, problem_1914:G2, problem_82..."
4,user_A_cat_with_a_hat,"[[problem_1579:D, problem_1829:G, problem_1833..."
...,...,...
180,user_yuanyuxuan,"[[problem_1393:A, problem_1393:B, problem_1393..."
181,user_zeemanz,"[[problem_1225:D, problem_1208:D, problem_1648..."
182,user_zhenghanyun,"[[problem_1806:C, problem_1561:D1, problem_665..."
183,user_zjjws,"[[problem_1466:A, problem_1466:B, problem_1466..."


In [158]:
# Sub-sequences are exploded.
# Since there might be more than one sequence for each user.
interactions_data_transformed = interactions_data[["user_id", "problem_ids"]].explode(
    "problem_ids", ignore_index=True
)

interactions_data_transformed.rename(
    columns={"problem_ids": "sequence_problem_ids"},
    inplace=True,
)

In [159]:
print(interactions_data_transformed.sample(frac=1).reset_index(drop=True).head())
print(interactions_data_transformed.shape)

              user_id                               sequence_problem_ids
0     user_Carmel_Ab1  [problem_1096:C, problem_1661:C, problem_1661:...
1         user_zltzlt  [problem_1486:C1, problem_1451:E2, problem_145...
2  user_ventusliberum  [problem_1827:B2, problem_1828:D2, problem_183...
3       user_geospiza  [problem_1606:A, problem_1606:C, problem_1606:...
4   user_aniket_kundu  [problem_1304:E, problem_1301:D, problem_1371:...
(13544, 2)


In [160]:
interactions_data_transformed

Unnamed: 0,user_id,sequence_problem_ids
0,user_21cs01033,"[problem_1719:C, problem_1715:B, problem_1624:..."
1,user_21cs01033,"[problem_1624:A, problem_1742:E, problem_1742:..."
2,user_21cs01033,"[problem_1742:B, problem_1802:A, problem_1802:..."
3,user_21cs01033,"[problem_1802:B, problem_1779:A, problem_1607:..."
4,user_21cs01033,"[problem_1607:E, problem_1804:D, problem_1758:..."
...,...,...
13539,user_zltzlt,"[problem_1422:C, problem_1422:D, problem_1422:..."
13540,user_zltzlt,"[problem_1422:A, problem_1422:B, problem_1421:..."
13541,user_zltzlt,"[problem_1421:D, problem_1624:D, problem_1624:..."
13542,user_zltzlt,"[problem_1624:F, problem_1624:G, problem_1624:..."


In [161]:
interactions_data_transformed = pd.merge(interactions_data_transformed,df_user_features, on="user_id")
interactions_data_transformed

Unnamed: 0,user_id,sequence_problem_ids,*special,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,user_21cs01033,"[problem_1719:C, problem_1715:B, problem_1624:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
1,user_21cs01033,"[problem_1624:A, problem_1742:E, problem_1742:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
2,user_21cs01033,"[problem_1742:B, problem_1802:A, problem_1802:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
3,user_21cs01033,"[problem_1802:B, problem_1779:A, problem_1607:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
4,user_21cs01033,"[problem_1607:E, problem_1804:D, problem_1758:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13539,user_zltzlt,"[problem_1422:C, problem_1422:D, problem_1422:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13540,user_zltzlt,"[problem_1422:A, problem_1422:B, problem_1421:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13541,user_zltzlt,"[problem_1421:D, problem_1624:D, problem_1624:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13542,user_zltzlt,"[problem_1624:F, problem_1624:G, problem_1624:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0


In [162]:
print(interactions_data_transformed.shape)

(13544, 39)


## 1.4 Train Test Split
The data is split into training and testing sets. Although considering timestamps could potentially provide a more refined split, for the sake of simplicity, we opt for a random indexing approach.

In [163]:
# Random indexing
random_selection = np.random.rand(len(interactions_data_transformed.index)) <= 0.85

# Split train data
df_train_data = interactions_data_transformed[random_selection]


# Split test data
df_test_data = interactions_data_transformed[~random_selection]


In [164]:
# from sklearn.preprocessing import MinMaxScaler

# # Get the numerical columns by excluding "user_id" and "sequence_problem_ids"
# numerical_columns = df_train_data.drop(columns=['user_id', 'sequence_problem_ids']).columns.tolist()

# # Create MinMaxScaler objects
# scaler_train = MinMaxScaler()

# # Fit and transform the training data
# df_train_data.loc[:, numerical_columns] = scaler_train.fit_transform(df_train_data[numerical_columns])

# # Transform the testing data (using parameters learned from training data)
# df_test_data.loc[:, numerical_columns] = scaler_train.transform(df_test_data[numerical_columns])


In [165]:
train_data_raw = df_train_data.values
test_data_raw = df_test_data.values

In [166]:
df_train_data

Unnamed: 0,user_id,sequence_problem_ids,*special,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
0,user_21cs01033,"[problem_1719:C, problem_1715:B, problem_1624:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
1,user_21cs01033,"[problem_1624:A, problem_1742:E, problem_1742:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
2,user_21cs01033,"[problem_1742:B, problem_1802:A, problem_1802:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
3,user_21cs01033,"[problem_1802:B, problem_1779:A, problem_1607:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
5,user_21cs01033,"[problem_1758:A, problem_1758:B, problem_1800:...",2.0,0.0,21.0,2.0,43,0.0,7.0,53,...,13.0,2.0,1.0,1.0,33,0.0,26.0,5.0,1.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13534,user_zltzlt,"[problem_1528:D, problem_538:B, problem_359:B,...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13538,user_zltzlt,"[problem_1397:B, problem_50:D, problem_1422:C,...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13539,user_zltzlt,"[problem_1422:C, problem_1422:D, problem_1422:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0
13540,user_zltzlt,"[problem_1422:A, problem_1422:B, problem_1421:...",1.0,0.0,36.0,34.0,57,0.0,22.0,44,...,25.0,2.0,0.0,11.0,40,3.0,20.0,3.0,46.0,17.0


In [167]:
df_train_data.describe()

Unnamed: 0,*special,2-sat,binary search,bitmasks,brute force,chinese remainder theorem,combinatorics,constructive algorithms,data structures,dfs and similar,...,number theory,probabilities,schedules,shortest paths,sortings,string suffix structures,strings,ternary search,trees,two pointers
count,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,...,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0,11510.0
mean,3.387055,0.827889,41.143788,20.479235,77.351347,0.528497,18.875065,76.246481,51.832841,30.046308,...,33.813032,3.338749,0.385317,9.433449,55.623371,1.480104,36.75821,1.564639,19.014944,24.044657
std,5.314561,1.723326,45.978338,20.892296,73.539887,1.04658,19.733837,63.549026,66.966483,41.777531,...,32.933952,5.034007,0.811557,14.834583,52.439205,3.448804,36.04365,2.13522,27.605008,24.913539
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,15.0,7.0,30.0,0.0,6.0,36.0,17.0,7.0,...,13.0,0.0,0.0,2.0,21.0,0.0,13.0,0.0,4.0,9.0
50%,1.0,0.0,26.0,14.0,51.0,0.0,11.0,53.0,28.0,17.0,...,22.0,2.0,0.0,5.0,40.0,0.0,22.0,1.0,9.0,16.0
75%,4.0,1.0,45.0,26.0,99.0,1.0,25.0,102.0,67.0,32.0,...,47.0,4.0,1.0,11.0,72.0,1.0,55.0,2.0,22.0,28.0
max,30.0,9.0,248.0,120.0,393.0,9.0,93.0,325.0,413.0,243.0,...,190.0,26.0,5.0,92.0,300.0,20.0,202.0,9.0,156.0,127.0


In [168]:
train_data_raw

array([['user_21cs01033',
        list(['problem_1719:C', 'problem_1715:B', 'problem_1624:A', 'problem_1742:E', 'problem_1742:B', 'problem_1802:A', 'problem_1802:B', 'problem_1779:A', 'problem_1607:E', 'problem_1804:D', 'problem_1758:A', 'problem_1758:B', 'problem_1800:B', 'problem_56:E', 'problem_1760:B', 'problem_1806:C', 'problem_1807:G2', 'problem_1807:G1', 'problem_1733:C', 'problem_1748:B']),
        2.0, ..., 5.0, 1.0, 13.0],
       ['user_21cs01033',
        list(['problem_1624:A', 'problem_1742:E', 'problem_1742:B', 'problem_1802:A', 'problem_1802:B', 'problem_1779:A', 'problem_1607:E', 'problem_1804:D', 'problem_1758:A', 'problem_1758:B', 'problem_1800:B', 'problem_56:E', 'problem_1760:B', 'problem_1806:C', 'problem_1807:G2', 'problem_1807:G1', 'problem_1733:C', 'problem_1748:B', 'problem_1721:C', 'problem_1742:G']),
        2.0, ..., 5.0, 1.0, 13.0],
       ['user_21cs01033',
        list(['problem_1742:B', 'problem_1802:A', 'problem_1802:B', 'problem_1779:A', 'problem_1607:

DataLoader is defined to be used for training and evaluation as final pre-processing step.

In [169]:
# Pytorch Dataset for user interactions
class problemSeqDataset(Dataset):
    # Initialize dataset
    def __init__(self, data, problem_vocab_stoi, user_vocab_stoi):
        self.data = data
        self.problem_vocab_stoi = problem_vocab_stoi
        self.user_vocab_stoi = user_vocab_stoi


    def __len__(self):
        return len(self.data)

    # Fetch data from the dataset
    def __getitem__(self, idx):
        user = self.data[idx][0]
        problem_sequence = self.data[idx][1]
        # Directly index into the vocabularies
        problem_data = [self.problem_vocab_stoi[item] for item in problem_sequence]
        user_data = self.user_vocab_stoi[user]
        
        # Create a dictionary to hold all features
        encoded_features = {
            'problem_data': torch.tensor(problem_data),
            'user_data': torch.tensor(user_data)
        }
        
        # Add other features to the dictionary
        for i, feature in enumerate(self.data[idx][2:]):
            feature_name = f'feature_{i+1}'
            encoded_features[feature_name] = torch.tensor(feature).int()
            
        return encoded_features


# Collate function and padding
def collate_batch(batch):
    # Extract tensors for problem and user data
    problem_list = [item['problem_data'] for item in batch]
    user_list = [item['user_data'] for item in batch]
    
    # Pad problem sequences
    padded_problem = pad_sequence(problem_list, padding_value=problem_vocab_stoi['<unk>'], batch_first=True)
    
    # Stack user data
    user_data = torch.stack(user_list)
#     print("HELOOOO", user_data.shape)
    
    # Prepare a dictionary to hold all features
    collated_batch = {
        'problem_data': padded_problem,
        'user_data': user_data,
    }
    
    # Add other features to the collated batch
    for i in range(1, len(batch[0].keys()) - 1): 
        feature_name = f'feature_{i}'
        feature_list = [item[feature_name] for item in batch]
        collated_batch[feature_name] = torch.stack(feature_list)
    
    return collated_batch


BATCH_SIZE = 256
# Create instances of your Dataset for each set
train_dataset = problemSeqDataset(train_data_raw, problem_vocab_stoi, user_vocab_stoi)
val_dataset = problemSeqDataset(test_data_raw, problem_vocab_stoi, user_vocab_stoi)

# Create DataLoaders
train_iter = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=collate_batch)
val_iter = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                      shuffle=False, collate_fn=collate_batch)


In [170]:
print(train_dataset[0])

{'problem_data': tensor([3893, 4777, 1152, 1021, 1028, 5324, 5323, 5854,  256, 2948, 3548, 3547,
        1007,  359, 3828, 2952,  989,  990, 3867, 3550]), 'user_data': tensor(96), 'feature_1': tensor(2, dtype=torch.int32), 'feature_2': tensor(0, dtype=torch.int32), 'feature_3': tensor(21, dtype=torch.int32), 'feature_4': tensor(2, dtype=torch.int32), 'feature_5': tensor(43, dtype=torch.int32), 'feature_6': tensor(0, dtype=torch.int32), 'feature_7': tensor(7, dtype=torch.int32), 'feature_8': tensor(53, dtype=torch.int32), 'feature_9': tensor(17, dtype=torch.int32), 'feature_10': tensor(3, dtype=torch.int32), 'feature_11': tensor(1, dtype=torch.int32), 'feature_12': tensor(20, dtype=torch.int32), 'feature_13': tensor(2, dtype=torch.int32), 'feature_14': tensor(0, dtype=torch.int32), 'feature_15': tensor(0, dtype=torch.int32), 'feature_16': tensor(0, dtype=torch.int32), 'feature_17': tensor(9, dtype=torch.int32), 'feature_18': tensor(10, dtype=torch.int32), 'feature_19': tensor(0, dtype=t

# 2. Model Definition
In this section we will define and initialize our model. Then the model will be trained with our previously generated dataset.
## 2.1 Positional Encoder
We start by defining the positional encoder, which is crucial for sequence-based models like the Transformer. This encoder will capture the positions of problem interactions in our sequences, thus embedding the order information that the Transformer model needs.

In [171]:
# class PositionalEncoding(nn.Module):

#     def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
#         super().__init__()
#         self.dropout = nn.Dropout(p=dropout)

#         position = torch.arange(max_len).unsqueeze(1)

#         # `div_term` is used in the calculation of the sinusoidal values.
#         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

#         # Initializing positional encoding matrix with zeros.
#         pe = torch.zeros(max_len, 1, d_model)

#         # Calculating the positional encodings.
#         pe[:, 0, 0::2] = torch.sin(position * div_term)
#         pe[:, 0, 1::2] = torch.cos(position * div_term)
#         self.register_buffer('pe', pe)

#     def forward(self, x: Tensor) -> Tensor:
#         """
#         Arguments:
#             x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
#         """
#         x = x + self.pe[:x.size(0)]
#         return self.dropout(x)

## 2.2 Transformer Model
Following the definition of our positional encoder, we then establish our transformer model. This model takes both the user id and the problem id sequence as input, and it is responsible for generating the output problem predictions.

In [172]:
# class TransformerModel(nn.Module):
#     def __init__(self, ntoken: int, nuser: int, d_model: int, nhead: int, d_hid: int,
#                  nlayers: int, dropout: float = 0.5):
#         super().__init__()
#         self.model_type = 'Transformer'
#         # positional encoder
#         self.pos_encoder = PositionalEncoding(d_model, dropout)

#         # Multihead attention mechanism.
#         encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
#         self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

#         # Embedding layers
#         self.problem_embedding = nn.Embedding(ntoken, d_model)
#         self.user_embedding = nn.Embedding(nuser, d_model)
#         self.feature_embedding = nn.Embedding(1000, d_model)

#         # Defining the size of the input to the model.
#         self.d_model = d_model

#         # Linear layer to map the output toproblem vocabulary.
#         self.linear = nn.Linear(31*d_model, ntoken)

#         self.init_weights()

#     def init_weights(self) -> None:
#         # Initializing the weights of the embedding and linear layers.
#         initrange = 0.1
#         self.problem_embedding.weight.data.uniform_(-initrange, initrange)
#         self.user_embedding.weight.data.uniform_(-initrange, initrange)
#         self.linear.bias.data.zero_()
#         self.linear.weight.data.uniform_(-initrange, initrange)

#     def forward(self, src: Tensor, user: Tensor, other_features_batch, src_mask: Tensor = None) -> Tensor:
#         # Embedding problem ids and userid
#         problem_embed = self.problem_embedding(src) * math.sqrt(self.d_model)
#         user_embed = self.user_embedding(user) * math.sqrt(self.d_model)
        
# #         print("problem_embed shape:", problem_embed.shape)
# #         print("user_embed shape:", user_embed.shape)

#         # positional encoding
#         problem_embed = self.pos_encoder(problem_embed)

#         # generating output with final layers
#         output = self.transformer_encoder(problem_embed, src_mask)
        
# #         print("lstm_output", output.shape)

#         # Expand user_embed tensor along the sequence length dimension
#         user_embed = user_embed.expand(-1, output.size(1), -1)
#         other_features_embed = []
#         for i, (feature_name, feature_tensor) in enumerate(other_features_batch.items()):
#             other_feature_embedding = self.feature_embedding(feature_tensor)* math.sqrt(self.d_model)
#             other_features_embed.append(other_feature_embedding)
        
#         other_features_embed = torch.cat(other_features_embed, dim=-1)
#         other_features_embed = other_features_embed.expand(-1, output.size(1), -1)
        
# #         print("user_embed", user_embed.shape)

#         # Concatenate user embeddings with transformer output
#         output = torch.cat((output, user_embed, other_features_embed), dim=-1)
        
# #         print("hello", output.shape)

#         output = self.linear(output)
#         return output


In [173]:
class LSTMModel(nn.Module):
    def __init__(self, ntoken: int, nuser: int, d_model: int, d_hid: int, nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'LSTM'

        # Embedding layers
        self.problem_embedding = nn.Embedding(ntoken, d_model)
        self.user_embedding = nn.Embedding(nuser, d_model)
        self.feature_embedding = nn.Embedding(1000, d_model)

        # LSTM layers
        self.lstm = nn.GRU(d_model, d_hid, nlayers, batch_first=True, dropout=dropout)

        self.d_model = d_model

        # Linear layer to map the LSTM output to problem vocabulary
        self.linear = nn.Linear(39*d_hid, ntoken)
#         self.linear = nn.Sequential(
#             nn.Linear(31*d_hid,31*d_hid//2),
#             nn.Linear(31*d_hid//2, 31*d_hid//4),
#             nn.Dropout(0.2),
#             nn.Linear(31*d_hid//4, 31*d_hid//8),
#             nn.Linear(31*d_hid//8, ntoken)
#         )

        self.init_weights()

    def init_weights(self) -> None:
        # Initializing the weights of the embedding and linear layers
        initrange = 0.1
        self.problem_embedding.weight.data.uniform_(-initrange, initrange)
        self.user_embedding.weight.data.uniform_(-initrange, initrange)
        self.feature_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, user: Tensor, other_features_batch) -> Tensor:
        # Embedding problem ids and user id
        
#         print("user shape",user.shape)
        problem_embed = self.problem_embedding(src)* math.sqrt(self.d_model)
        user_embed = self.user_embedding(user)* math.sqrt(self.d_model)
        
        # print("problem_embed shape:", problem_embed.shape)
#         print("user_embed shape:", user_embed.shape)

        # Pass the combined embeddings through LSTM layers
        lstm_output, _ = self.lstm(problem_embed)
        
#         print("lstm_output", lstm_output.shape)
        
        other_features_embed = []
        for i, (feature_name, feature_tensor) in enumerate(other_features_batch.items()):
#             print(f"Processing other feature '{feature_name}'")
#             print("Feature tensor shape:", feature_tensor.shape)
            other_feature_embedding = self.feature_embedding(feature_tensor)* math.sqrt(self.d_model)
            other_features_embed.append(other_feature_embedding)
#             print("Other feature embedding shape:", other_feature_embedding.shape)
        
        
        other_features_embed = torch.cat(other_features_embed, dim=-1)
        other_features_embed = other_features_embed.expand(-1, lstm_output.size(1), -1)

        user_embed = user_embed.expand(-1, lstm_output.size(1), -1)
#         print("user_embed after expansion", user_embed.shape)

        output = torch.cat((lstm_output, user_embed, other_features_embed), dim=-1)

        # print("hello", output.shape)

        # Apply linear layer to obtain the output logits
        output = self.linear(output)

        return output


Following the model definitions, we proceed to initialize our model using a set of arbitrarily selected hyperparameters.

In [174]:
ntokens = len(problem_vocab)  # size of vocabulary
nusers = len(user_vocab)
d_model = 128  # embedding dimension (maybe 512?)
d_hid = 128  # dimension of the LSTM hidden states
nlayers = 2  # number of LSTM layers
dropout = 0.2  # dropout probability

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMModel(ntokens, nusers, d_model, d_hid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 1.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


In [175]:
# ntokens = len(problem_vocab)  # size of vocabulary
# nusers = len(user_vocab)
# emsize = 128  # embedding dimension
# d_hid = 128  # dimension of the feedforward network model
# nlayers = 2  # number of ``nn.TransformerEncoderLayer``
# nhead = 2  # number of heads in ``nn.MultiheadAttention``
# dropout = 0.2  # dropout probability

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = TransformerModel(ntokens, nusers, emsize, nhead, d_hid, nlayers, dropout).to(device)

# criterion = nn.CrossEntropyLoss()
# lr = 1.0  # learning rate
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

# 3. Train & Evaluation
We're now ready to kick off the training process with our model, where it will learn from the dataset we've prepared. Following the training phase, we'll evaluate how well our model performs on unseen data to check its effectiveness.
## 3.1 Train Function

In [176]:
def train(model: nn.Module, train_iter, epoch) -> None:
    # Switch to training mode
    model.train()
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    for i, batch in enumerate(train_iter):
        # Unpack the batch
        problem_data_batch = batch['problem_data']
        user_data_batch = batch['user_data']
        other_features_batch = {k: v for k, v in batch.items() if k not in ['problem_data', 'user_data']}
        
#         print("user_data_batch.shape", user_data_batch.shape)
#         user_data_batch = user_data_batch.unsqueeze(1)
        user_data_batch = user_data_batch.reshape(-1, 1)
        
#         print("user_data_batch.shape after squeeze", user_data_batch.shape)
        
        # Move tensors to the appropriate device
        problem_data_batch = problem_data_batch.to(device)
        user_data_batch = user_data_batch.to(device)
        for k, v in other_features_batch.items():
            v=v.unsqueeze(1)
            other_features_batch[k] = v.to(device)
        
        # Split problem sequence to inputs and targets
        inputs, targets = problem_data_batch[:, :-1], problem_data_batch[:, 1:]
        targets_flat = targets.reshape(-1)
        
#         if (i==0):
#             print(problem_data_batch[0])
#             print('\n',inputs[0])
#             print('\n',targets[0])
#             print("inputs",inputs.shape)
#             print("user_data_batch", user_data_batch.shape)
#             for k, v in other_features_batch.items():
#                 print('\n', k, v.shape)
#             print("targets",targets.shape)
#             print("targets_flat",targets_flat.shape)
            

        # Predict problems
        output = model(inputs, user_data_batch, other_features_batch)

        # Compute the loss
        loss = criterion(output.view(-1, ntokens), targets_flat)
        
#         if (i==0):
#             print("output_shape", output.shape)
#             print("output_shape_2",output.view(-1, ntokens).shape)

        # Perform backpropagation
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()

        # Print training progress
        if i % log_interval == 0 and i > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()


In [177]:
# def train(model: nn.Module, train_iter, epoch) -> None:
#     # Switch to training mode
#     model.train()
#     total_loss = 0.
#     log_interval = 200
#     start_time = time.time()

#     for i, batch in enumerate(train_iter):
#         # Unpack the batch
#         problem_data_batch = batch['problem_data']
#         user_data_batch = batch['user_data']
#         other_features_batch = {k: v for k, v in batch.items() if k not in ['problem_data', 'user_data']}
        
# #         print("user_data_batch.shape", user_data_batch.shape)
# #         user_data_batch = user_data_batch.unsqueeze(1)
#         user_data_batch = user_data_batch.reshape(-1, 1)
        
# #         print("user_data_batch.shape after squeeze", user_data_batch.shape)
        
#         # Move tensors to the appropriate device
#         problem_data_batch = problem_data_batch.to(device)
#         user_data_batch = user_data_batch.to(device)
#         for k, v in other_features_batch.items():
#             v=v.unsqueeze(1)
#             other_features_batch[k] = v.to(device)
        
#         # Split problem sequence to inputs and targets
#         inputs, targets = problem_data_batch[:, :-1], problem_data_batch[:, 1:]
#         targets_flat = targets.reshape(-1)
        
# #         if (i==0):
# #             print(problem_data_batch[0])
# #             print('\n',inputs[0])
# #             print('\n',targets[0])
# #             print("inputs",inputs.shape)
# #             print("user_data_batch", user_data_batch.shape)
# #             for k, v in other_features_batch.items():
# #                 print('\n', k, v.shape)
# #             print("targets",targets.shape)
# #             print("targets_flat",targets_flat.shape)
            

#         # Predict problems
#         output = model(inputs, user_data_batch, other_features_batch)

#         # Compute the loss
#         loss = criterion(output.view(-1, ntokens), targets_flat)
        
# #         if (i==0):
# #             print("output_shape", output.shape)
# #             print("output_shape_2",output.view(-1, ntokens).shape)

#         # Perform backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
#         optimizer.step()

#         total_loss += loss.item()

#         # Print training progress
#         if i % log_interval == 0 and i > 0:
#             lr = scheduler.get_last_lr()[0]
#             ms_per_batch = (time.time() - start_time) * 1000 / log_interval
#             cur_loss = total_loss / log_interval
#             ppl = math.exp(cur_loss)
#             print(f'| epoch {epoch:3d} '
#                   f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
#                   f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
#             total_loss = 0
#             start_time = time.time()


## 3.2 Evaluation Function

In [178]:
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    # Switch the model to evaluation mode.
    # This is necessary for layers like dropout,
    model.eval()
    total_loss = 0.

    with torch.no_grad():
        for i, batch in enumerate(train_iter):
        # Unpack the batch
            problem_data_batch = batch['problem_data']
            user_data_batch = batch['user_data']
            other_features_batch = {k: v for k, v in batch.items() if k not in ['problem_data', 'user_data']}

            # Move tensors to the appropriate device
            problem_data_batch = problem_data_batch.to(device)
            user_data_batch = user_data_batch.to(device)
            
            user_data_batch = user_data_batch.reshape(-1, 1)
            
            for k, v in other_features_batch.items():
                v = v.unsqueeze(1)
                other_features_batch[k] = v.to(device)
                
            # Split problem sequence to inputs and targets
            inputs, targets = problem_data_batch[:, :-1], problem_data_batch[:, 1:]
            targets_flat = targets.reshape(-1)

            # Predict problems
            output = model(inputs, user_data_batch,other_features_batch)

            # Calculate loss
            loss = criterion(output.view(-1, ntokens), targets_flat)  # Reshape output for loss calculation
            total_loss += loss.item()

    # Return average loss over all batches
    return total_loss / len(eval_data)


In [179]:
# def evaluate(model: nn.Module, eval_data: Tensor) -> float:
#     # Switch the model to evaluation mode.
#     # This is necessary for layers like dropout,
#     model.eval()
#     total_loss = 0.

#     with torch.no_grad():
#         for i, batch in enumerate(train_iter):
#         # Unpack the batch
#             problem_data_batch = batch['problem_data']
#             user_data_batch = batch['user_data']
#             other_features_batch = {k: v for k, v in batch.items() if k not in ['problem_data', 'user_data']}

#             # Move tensors to the appropriate device
#             problem_data_batch = problem_data_batch.to(device)
#             user_data_batch = user_data_batch.to(device)
            
#             user_data_batch = user_data_batch.reshape(-1, 1)
            
#             for k, v in other_features_batch.items():
#                 v = v.unsqueeze(1)
#                 other_features_batch[k] = v.to(device)
                
#             # Split problem sequence to inputs and targets
#             inputs, targets = problem_data_batch[:, :-1], problem_data_batch[:, 1:]
#             targets_flat = targets.reshape(-1)

#             # Predict problems
#             output = model(inputs, user_data_batch,other_features_batch)

#             # Calculate loss
#             loss = criterion(output.view(-1, ntokens), targets_flat)  # Reshape output for loss calculation
#             total_loss += loss.item()

#     # Return average loss over all batches
#     return total_loss / len(eval_data)


## 3.3 Train & Evaluation Loop

In [180]:
best_val_loss = float('inf')
epochs = 30

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()

        # Training
        train(model, train_iter, epoch)

        # Evaluation
        val_loss = evaluate(model, val_iter)
        
        elapsed = time.time() - epoch_start_time

        # Results
        print('-' * 89)
        print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
            f'valid loss {val_loss:5.2f}')
        print('-' * 89)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()

    # After training, load the best model parameters
    model.load_state_dict(torch.load(best_model_params_path))


-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 26.73s | valid loss 47.02
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 26.65s | valid loss 44.01
-----------------------------------------------------------------------------------------


## 3.4 Generating Popular problem Recommendations as Baseline
In order to compare our model success a baseline recommendation method is required. One of the easiest recommendation method is popular problem recommendation which is obtained by most frequent and highly rated problems.

In [181]:
# def get_popular_problems(interactions):
#   # Calculate the number of ratings for each problem
#   rating_counts = interactions['problem_id'].value_counts().reset_index()
#   rating_counts.columns = ['problem_id', 'rating_count']

#   # Get the most frequently rated problems
#   min_ratings_threshold = rating_counts['rating_count'].quantile(0.95)

#   # Filter problems based on the minimum number of ratings
#   popular_problems = interactions.merge(rating_counts, on='problem_id')
#   popular_problems = popular_problems[popular_problems['rating_count'] >= min_ratings_threshold]


#   # Calculate the average rating for each problem
#   average_ratings = popular_problems.groupby('problem_id')['rating'].mean().reset_index()
#   # Get the top 10 rated problems
#   top_10_problems = list(average_ratings.sort_values('rating', ascending=False).head(10).problem_id.values)
#   return top_10_problems

In [182]:
# top_10_problems = get_popular_problems(interactions)
# [problem_title_dict[problem] for problem in top_10_problems]

## 3.5 Recommendations Result Comparison
Like the evaluation function we will iterate our validation dataset and store recommendation results in lists to compare them with normalized discounted gain(NDCG) metric.

In [183]:
# # problem id decoder
# problem_vocab_itos = problem_vocab.get_itos()

# # A placeholders to store results of recommendations
# transformer_reco_results = list()
# popular_reco_results = list()

# # Get top 10 problems
# k = 10
# # Iterate over the validation data
# for i, (problem_data, user_data) in enumerate(val_iter):
#     # Feed the input and get the outputs
#     problem_data, user_data = problem_data.to(device), user_data.to(device)
#     user_data = user_data.reshape(-1, 1)
#     inputs, targets = problem_data[:, :-1], problem_data[:, 1:]
#     output = model(inputs, user_data)
#     output_flat = output.reshape(-1, ntokens)
#     targets_flat = targets.reshape(-1)

#     # Reshape the output_flat to get top predictions
#     outputs = output_flat.reshape(output_flat.shape[0] // inputs.shape[1],
#                                   inputs.shape[1],
#                                   output_flat.shape[1])[: , -1, :]
#     # k + len(inputs) = 13 problems obtained
#     # In order to prevent to recommend already watched problems
#     values, indices = outputs.topk(k + inputs.shape[1], dim=-1)

#     for sub_sequence, sub_indice_org in zip(problem_data, indices):
#         sub_indice_org = sub_indice_org.cpu().detach().numpy()
#         sub_sequence = sub_sequence.cpu().detach().numpy()

#         # Generate mask array to eliminate already watched problems
#         mask = np.isin(sub_indice_org, sub_sequence[:-1], invert=True)

#         # After masking get top k problems
#         sub_indice = sub_indice_org[mask][:k]

#         # Generate results array
#         transformer_reco_result = np.isin(sub_indice, sub_sequence[-1]).astype(int)

#         # Decode problem to search in popular problems
#         target_problem_decoded = problem_vocab_itos[sub_sequence[-1]]
#         popular_reco_result = np.isin(top_10_problems, target_problem_decoded).astype(int)

#         transformer_reco_results.append(transformer_reco_result)
#         popular_reco_results.append(popular_reco_result)

After generating result for each recommendation now time to compare baseline method vs transformer model.

In [184]:
# from sklearn.metrics import ndcg_score

# # Since we have already sorted our recommendations
# # An array that represent our recommendation scores is used.
# representative_array = [[i for i in range(k, 0, -1)]] * len(transformer_reco_results)

# for k in [3, 5, 10]:
#   transformer_result = ndcg_score(transformer_reco_results,
#                                   representative_array, k=k)
#   popular_result = ndcg_score(popular_reco_results,
#                               representative_array, k=k)

#   print(f"Transformer NDCG result at top {k}: {round(transformer_result, 4)}")
#   print(f"Popular recommendation NDCG result at top {k}: {round(popular_result, 4)}\n\n")


Here we have seen our model results are approximately 10 times better than popular problem recommendation at NDCG metric. A function to generate recommendation for single data is given below.

In [185]:
def generate_recommendation(data, k=5):
    model.eval()
    user_id = data[0];
    problem_sequence = data[1];
    input_sequence = problem_sequence[:-1]
    # Tokenize and numerically encode the user id and problem sequence
    user_tensor = torch.tensor(user_vocab_stoi[user_id])
    problem_tensor = torch.tensor([[problem_vocab_stoi[problem_id]] for problem_id in input_sequence])
    # Shape: [1, 1]
    user_tensor = user_tensor.unsqueeze(0).to(device)
    user_tensor = user_tensor.view(user_tensor.shape[0], 1)

    # Shape: [1, seq_length]
    problem_tensor = problem_tensor.unsqueeze(0).to(device)[0]
    problem_tensor = problem_tensor.view(1, problem_tensor.shape[0])
    
    other_features_batch={}
    
    for i, feature in enumerate(data[2:]):
            feature_name = f'feature_{i+1}'
            other_features_batch[feature_name] = torch.tensor(feature).int()
            other_features_batch[feature_name] = other_features_batch[feature_name].unsqueeze(0).to(device)
            other_features_batch[feature_name] = other_features_batch[feature_name].view(other_features_batch[feature_name].shape[0], 1)
    
#     print("user_tensor", user_tensor.shape)
#     print("problem_tensor", problem_tensor.shape)

    # Pass the tensors through the model
    with torch.no_grad():
        predictions = model(problem_tensor, user_tensor, other_features_batch)

    # The output is a probability distribution over the next problem.
    # Topk to get most probable problems
    values, indices = predictions.topk(k + len(input_sequence), dim=-1)

    # Eliminate already watched problems
    indices = [indice for indice in indices[-1, :][0] if indice not in problem_tensor][:k]
    predicted_problems = [problem_vocab.get_itos()[problem] for problem in indices]
    return predicted_problems

In [186]:
len(test_data_raw)

2034

In [187]:
row_iter = test_data_raw[970]
print("Input Sequence:")
print("-" + "\n-".join([ea_problem for ea_problem in row_iter[1][:-1]]))
recos = '\n-'.join(generate_recommendation(row_iter))

print(f"Recomendations:\n-{recos}")

Input Sequence:
-problem_1355:B
-problem_1256:D
-problem_1349:C
-problem_1551:E
-problem_1207:D
-problem_1699:D
-problem_1242:A
-problem_1450:C1
-problem_1420:A
-problem_1213:A
-problem_1182:A
-problem_1206:B
-problem_1146:B
-problem_1417:B
-problem_1354:B
-problem_1208:A
-problem_1339:A
-problem_1374:C
-problem_1254:A
Recomendations:
-problem_1132:F
-problem_1228:D
-problem_1490:G
-problem_1613:E
-problem_1430:E


In [188]:
row_iter

array(['user_aniket_kundu',
       list(['problem_1355:B', 'problem_1256:D', 'problem_1349:C', 'problem_1551:E', 'problem_1207:D', 'problem_1699:D', 'problem_1242:A', 'problem_1450:C1', 'problem_1420:A', 'problem_1213:A', 'problem_1182:A', 'problem_1206:B', 'problem_1146:B', 'problem_1417:B', 'problem_1354:B', 'problem_1208:A', 'problem_1339:A', 'problem_1374:C', 'problem_1254:A', 'problem_1209:C']),
       4.0, 3.0, 77.0, 54.0, 155, 1.0, 37.0, 181, 91, 31.0, 5.0, 131,
       15.0, 0.0, 1.0, 4.0, 10.0, 15.0, 2.0, 43.0, 311, 13.0, 205, 19.0,
       287, 5.0, 1.0, 73.0, 3.0, 1.0, 14.0, 98, 4.0, 58.0, 2.0, 21.0,
       53.0], dtype=object)

In [189]:
row_iter[0] = '<unk>'

In [190]:
row_iter

array(['<unk>',
       list(['problem_1355:B', 'problem_1256:D', 'problem_1349:C', 'problem_1551:E', 'problem_1207:D', 'problem_1699:D', 'problem_1242:A', 'problem_1450:C1', 'problem_1420:A', 'problem_1213:A', 'problem_1182:A', 'problem_1206:B', 'problem_1146:B', 'problem_1417:B', 'problem_1354:B', 'problem_1208:A', 'problem_1339:A', 'problem_1374:C', 'problem_1254:A', 'problem_1209:C']),
       4.0, 3.0, 77.0, 54.0, 155, 1.0, 37.0, 181, 91, 31.0, 5.0, 131,
       15.0, 0.0, 1.0, 4.0, 10.0, 15.0, 2.0, 43.0, 311, 13.0, 205, 19.0,
       287, 5.0, 1.0, 73.0, 3.0, 1.0, 14.0, 98, 4.0, 58.0, 2.0, 21.0,
       53.0], dtype=object)

In [191]:
# user_vocab_stoi

In [193]:
generate_recommendation(row_iter)

['problem_1433:F',
 'problem_933:A',
 'problem_999:E',
 'problem_1490:G',
 'problem_1523:A']