In [1]:
from torch.utils.data import DataLoader
import numpy as np
import torch
from tqdm.notebook import tqdm
import time
import datetime
import os
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from pathlib import Path
import torch
from torch import nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_joke_df = pd.read_csv(r'data\recsys-in-practice\train_joke_df.csv')
sample_submission = pd.read_csv(r'data\recsys-in-practice\sample_submission.csv')
test_joke_df_nofactrating = pd.read_csv(r'data\recsys-in-practice\test_joke_df_nofactrating.csv', index_col=0)

In [3]:
train_joke_df["UID"] = train_joke_df["UID"].astype(str)
train_joke_df["JID"] = train_joke_df["JID"].astype(str)

In [4]:
train_df, valid_df = train_test_split(train_joke_df, test_size=0.1, random_state=42)

In [5]:
valid_dict = {i: i for i in valid_df.index}

In [6]:
valid = [1 if i in valid_dict else 0 for i in train_joke_df.index]
train_joke_df['is_valid'] = valid
train_joke_df

Unnamed: 0,UID,JID,Rating,is_valid
0,18029,6,-1.26,0
1,3298,64,-4.17,0
2,3366,58,0.92,0
3,12735,92,3.69,0
4,11365,38,-6.60,0
...,...,...,...,...
1448359,22604,26,2.82,0
1448360,22255,36,-1.94,0
1448361,21056,40,-9.56,1
1448362,12328,97,0.87,0


In [7]:
valid_df

Unnamed: 0,UID,JID,Rating
1113065,2296,32,5.53
501732,14967,94,2.04
348251,6395,22,-2.86
183559,17692,26,-7.28
196180,20977,40,0.73
...,...,...,...
806339,15769,36,2.82
1192740,22618,22,4.17
264310,15135,62,3.35
161392,12596,47,1.17


In [8]:
user_lookup = {v: i+1 for i, v in enumerate(train_joke_df["UID"].unique())}

In [9]:
movie_lookup = {v: i+1 for i, v in enumerate(train_joke_df["JID"].unique())}

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [11]:
def RMSE_loss(prediction, target):
    return torch.sqrt(nn.MSELoss()(prediction, target))

Comparing this to our baseline, we can see that there is an improvement!

## Sequential recommendations using a transformer

Using matrix factorization, we are treating each rating as being independent from the ratings around it; however, incorporating information about other movies that a user recently rated could provide an additional signal that could boost performance. For example, suppose that a user is watching a trilogy of films; if they have rated the first two instalments highly, it is likely that they may do the same for the finale!

One way that we can approach this is to use a transformer network, specifically the encoder portion, to encode additional context into the learned embeddings for each movie, and then using a fully connected neural network to make the rating predictions.

### Pre-processing the data

The first step is to process our data so that we have a time-sorted list of movies for each user. Let's start by grouping all the ratings by user:

In [12]:
grouped_ratings = train_joke_df.groupby('UID').agg(tuple).reset_index()

In [13]:
grouped_ratings

Unnamed: 0,UID,JID,Rating,is_valid
0,1,"(53, 30, 15, 7, 29, 38, 20, 32, 54, 59, 3, 82,...","(3.2, -7.43, -7.18, -9.85, 9.13, -8.4, -9.85, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
1,10,"(9, 39, 34, 57, 63, 23, 45, 43, 6, 31, 1, 18, ...","(3.01, 3.01, 6.5, 2.43, 6.8, 6.6, 6.8, 3.01, 5...","(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,100,"(69, 28, 54, 39, 8, 36, 46, 53, 32, 20, 66, 16...","(-4.17, -5.49, -4.61, -4.85, -4.27, -4.03, -1....","(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,1000,"(60, 62, 8, 32, 54, 63, 26, 38, 57, 30, 37, 12...","(5.29, 2.09, 0.97, 5.83, -1.7, 8.01, 1.26, 1.2...","(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,10000,"(14, 18, 68, 66, 53, 40, 49, 69, 12, 50, 23, 2...","(9.37, -4.95, 8.64, 9.17, 9.32, -4.71, -4.76, ...","(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
...,...,...,...,...
24978,9995,"(32, 5, 13, 48, 27, 77, 21, 63, 36, 38, 11, 65...","(-1.6, 6.46, -8.4, 3.74, -3.11, 6.26, 5.97, 6....","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
24979,9996,"(70, 39, 7, 51, 13, 89, 38, 25, 26, 17, 69, 31...","(6.99, 5.97, -5.19, -2.72, 0.0, 8.5, 6.55, -4....","(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ..."
24980,9997,"(20, 8, 38, 17, 55, 1, 22, 29, 53, 49, 85, 45,...","(-0.44, 2.96, -0.34, -1.31, -5.29, -6.31, 1.07...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
24981,9998,"(85, 67, 40, 19, 31, 94, 27, 52, 1, 56, 26, 47...","(-9.81, -9.71, -9.66, 0.53, -9.85, -7.18, -9.6...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."


Now that we have grouped by user, we can create an additional column so that we can see the number of events associated with each user

In [14]:
grouped_ratings['num_ratings'] = grouped_ratings['Rating'].apply(lambda row: len(row))

Let's take a look at the new dataframe

In [15]:
grouped_ratings

Unnamed: 0,UID,JID,Rating,is_valid,num_ratings
0,1,"(53, 30, 15, 7, 29, 38, 20, 32, 54, 59, 3, 82,...","(3.2, -7.43, -7.18, -9.85, 9.13, -8.4, -9.85, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",62
1,10,"(9, 39, 34, 57, 63, 23, 45, 43, 6, 31, 1, 18, ...","(3.01, 3.01, 6.5, 2.43, 6.8, 6.6, 6.8, 3.01, 5...","(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65
2,100,"(69, 28, 54, 39, 8, 36, 46, 53, 32, 20, 66, 16...","(-4.17, -5.49, -4.61, -4.85, -4.27, -4.03, -1....","(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",26
3,1000,"(60, 62, 8, 32, 54, 63, 26, 38, 57, 30, 37, 12...","(5.29, 2.09, 0.97, 5.83, -1.7, 8.01, 1.26, 1.2...","(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",52
4,10000,"(14, 18, 68, 66, 53, 40, 49, 69, 12, 50, 23, 2...","(9.37, -4.95, 8.64, 9.17, 9.32, -4.71, -4.76, ...","(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",48
...,...,...,...,...,...
24978,9995,"(32, 5, 13, 48, 27, 77, 21, 63, 36, 38, 11, 65...","(-1.6, 6.46, -8.4, 3.74, -3.11, 6.26, 5.97, 6....","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",39
24979,9996,"(70, 39, 7, 51, 13, 89, 38, 25, 26, 17, 69, 31...","(6.99, 5.97, -5.19, -2.72, 0.0, 8.5, 6.55, -4....","(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",58
24980,9997,"(20, 8, 38, 17, 55, 1, 22, 29, 53, 49, 85, 45,...","(-0.44, 2.96, -0.34, -1.31, -5.29, -6.31, 1.07...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",48
24981,9998,"(85, 67, 40, 19, 31, 94, 27, 52, 1, 56, 26, 47...","(-9.81, -9.71, -9.66, 0.53, -9.85, -7.18, -9.6...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",71


Now that we have grouped all the ratings for each user, let's divide these into smaller sequences. To make the most out of the data, we would like the model to have the opportunity to predict a rating for every movie in the training set. To do this, let's specify a sequence length s and use the previous s-1 ratings as our user history.

As the model expects each sequence to be a fixed length, we will fill empty spaces with a padding token, so that sequences can be batched and passed to the model. Let's create a function to do this.

We are going to arbitrarily choose a length of 10 here.

In [16]:
sequence_length = 10

In [17]:
def create_sequences(values, sequence_length):
    sequences = []
    for i, v in enumerate(values):
        seq = values[:i+1]
        if len(seq) > sequence_length:
            seq = seq[i-sequence_length+1:i+1]
        elif len(seq) < sequence_length:
            seq =(*(['[PAD]'] * (sequence_length - len(seq))), *seq)
       
        sequences.append(seq)
    return sequences
        

To visualize how this function works, let's apply it, with a sequence length of 3, to the first 10 movies rated by the first user. These movies are:

Applying our function, we have:

In [18]:
create_sequences(grouped_ratings.iloc[0]['JID'][:10], 3)

[('[PAD]', '[PAD]', '53'),
 ('[PAD]', '53', '30'),
 ('53', '30', '15'),
 ('30', '15', '7'),
 ('15', '7', '29'),
 ('7', '29', '38'),
 ('29', '38', '20'),
 ('38', '20', '32'),
 ('20', '32', '54'),
 ('32', '54', '59')]

In [19]:
grouped_ratings

Unnamed: 0,UID,JID,Rating,is_valid,num_ratings
0,1,"(53, 30, 15, 7, 29, 38, 20, 32, 54, 59, 3, 82,...","(3.2, -7.43, -7.18, -9.85, 9.13, -8.4, -9.85, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",62
1,10,"(9, 39, 34, 57, 63, 23, 45, 43, 6, 31, 1, 18, ...","(3.01, 3.01, 6.5, 2.43, 6.8, 6.6, 6.8, 3.01, 5...","(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",65
2,100,"(69, 28, 54, 39, 8, 36, 46, 53, 32, 20, 66, 16...","(-4.17, -5.49, -4.61, -4.85, -4.27, -4.03, -1....","(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",26
3,1000,"(60, 62, 8, 32, 54, 63, 26, 38, 57, 30, 37, 12...","(5.29, 2.09, 0.97, 5.83, -1.7, 8.01, 1.26, 1.2...","(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",52
4,10000,"(14, 18, 68, 66, 53, 40, 49, 69, 12, 50, 23, 2...","(9.37, -4.95, 8.64, 9.17, 9.32, -4.71, -4.76, ...","(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",48
...,...,...,...,...,...
24978,9995,"(32, 5, 13, 48, 27, 77, 21, 63, 36, 38, 11, 65...","(-1.6, 6.46, -8.4, 3.74, -3.11, 6.26, 5.97, 6....","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",39
24979,9996,"(70, 39, 7, 51, 13, 89, 38, 25, 26, 17, 69, 31...","(6.99, 5.97, -5.19, -2.72, 0.0, 8.5, 6.55, -4....","(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...",58
24980,9997,"(20, 8, 38, 17, 55, 1, 22, 29, 53, 49, 85, 45,...","(-0.44, 2.96, -0.34, -1.31, -5.29, -6.31, 1.07...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",48
24981,9998,"(85, 67, 40, 19, 31, 94, 27, 52, 1, 56, 26, 47...","(-9.81, -9.71, -9.66, 0.53, -9.85, -7.18, -9.6...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",71


As we can see, we have 10 sequences of length 3, where the final movie in the sequence is unchanged from the original list.

Now, let's apply this function to all of the features in our dataframe

In [20]:
grouped_cols = ['JID', 'Rating', 'is_valid'] 
for col in grouped_cols:
    grouped_ratings[col] = grouped_ratings[col].apply(lambda x: create_sequences(x, sequence_length))

In [21]:
grouped_ratings.head(2)

Unnamed: 0,UID,JID,Rating,is_valid,num_ratings
0,1,"[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...","[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...","[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...",62
1,10,"[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...","[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...","[([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [P...",65


Currently, we have one row that contains all the sequences for a certain user. However, during training, we would like to create batches made up of sequences from many different users. To do this, we will have to transform the data so that each sequence has its own row, while remaining associated with the user ID. We can use the pandas 'explode' function for each feature, and then aggregate these DataFrames together.

In [22]:
exploded_ratings = grouped_ratings[['UID', 'JID']].explode('JID', ignore_index=True)
dfs = [grouped_ratings[[col]].explode(col, ignore_index=True) for col in grouped_cols[1:]]
seq_df = pd.concat([exploded_ratings, *dfs], axis=1)

In [23]:
seq_df.head()

Unnamed: 0,UID,JID,Rating,is_valid
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
1,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
2,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA..."
3,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 53,...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 3.2...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 0, ..."
4,1,"([PAD], [PAD], [PAD], [PAD], [PAD], 53, 30, 15...","([PAD], [PAD], [PAD], [PAD], [PAD], 3.2, -7.43...","([PAD], [PAD], [PAD], [PAD], [PAD], 0, 0, 0, 0..."


Now, we can see that each sequence has its own row. However, for the is_valid column, we don't care about the whole sequence and only need the last value as this is the movie for which we will be trying to predict the rating. Let's create a function to extract this value and apply it to these columns.

In [24]:
def get_last_entry(sequence):
    return sequence[-1]

seq_df['is_valid'] = seq_df['is_valid'].apply(get_last_entry)

In [25]:
seq_df

Unnamed: 0,UID,JID,Rating,is_valid
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0
1,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0
2,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0
3,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 53,...","([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 3.2...",0
4,1,"([PAD], [PAD], [PAD], [PAD], [PAD], 53, 30, 15...","([PAD], [PAD], [PAD], [PAD], [PAD], 3.2, -7.43...",0
...,...,...,...,...
1448359,9999,"(99, 17, 95, 49, 36, 59, 90, 65, 16, 58)","(4.66, 2.09, 2.86, 4.32, 4.22, 5.1, 5.78, 6.6,...",0
1448360,9999,"(17, 95, 49, 36, 59, 90, 65, 16, 58, 86)","(2.09, 2.86, 4.32, 4.22, 5.1, 5.78, 6.6, -4.42...",0
1448361,9999,"(95, 49, 36, 59, 90, 65, 16, 58, 86, 88)","(2.86, 4.32, 4.22, 5.1, 5.78, 6.6, -4.42, 0.1,...",0
1448362,9999,"(49, 36, 59, 90, 65, 16, 58, 86, 88, 48)","(4.32, 4.22, 5.1, 5.78, 6.6, -4.42, 0.1, 0.49,...",0


Also, to make it easy to access the rating that we are trying to predict, let's separate this into its own column.

In [26]:
seq_df['target_rating'] = seq_df['Rating'].apply(get_last_entry)
seq_df['previous_ratings'] = seq_df['Rating'].apply(lambda seq: seq[:-1])
seq_df.drop(columns=['Rating'], inplace=True)

To prevent the model from including padding tokens when calculating attention scores, we can provide an attention mask to the transformer; the mask should be 'True' for a padding token and 'False' otherwise. Let's calculate this for each row, as well as creating a column to show the number of padding tokens present.

In [27]:
seq_df['pad_mask'] = seq_df['JID'].apply(lambda x: (np.array(x) == '[PAD]'))
seq_df['num_pads'] = seq_df['pad_mask'].apply(sum)
seq_df['pad_mask'] = seq_df['pad_mask'].apply(lambda x: x.tolist()) # in case we serialize later

Let's inspect the transformed data

In [28]:
seq_df

Unnamed: 0,UID,JID,is_valid,target_rating,previous_ratings,pad_mask,num_pads
0,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0,3.20,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Tru...",9
1,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0,-7.43,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Tru...",8
2,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...",0,-7.18,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], [PA...","[True, True, True, True, True, True, True, Fal...",7
3,1,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 53,...",0,-9.85,"([PAD], [PAD], [PAD], [PAD], [PAD], [PAD], 3.2...","[True, True, True, True, True, True, False, Fa...",6
4,1,"([PAD], [PAD], [PAD], [PAD], [PAD], 53, 30, 15...",0,9.13,"([PAD], [PAD], [PAD], [PAD], [PAD], 3.2, -7.43...","[True, True, True, True, True, False, False, F...",5
...,...,...,...,...,...,...,...
1448359,9999,"(99, 17, 95, 49, 36, 59, 90, 65, 16, 58)",0,0.10,"(4.66, 2.09, 2.86, 4.32, 4.22, 5.1, 5.78, 6.6,...","[False, False, False, False, False, False, Fal...",0
1448360,9999,"(17, 95, 49, 36, 59, 90, 65, 16, 58, 86)",0,0.49,"(2.09, 2.86, 4.32, 4.22, 5.1, 5.78, 6.6, -4.42...","[False, False, False, False, False, False, Fal...",0
1448361,9999,"(95, 49, 36, 59, 90, 65, 16, 58, 86, 88)",0,5.49,"(2.86, 4.32, 4.22, 5.1, 5.78, 6.6, -4.42, 0.1,...","[False, False, False, False, False, False, Fal...",0
1448362,9999,"(49, 36, 59, 90, 65, 16, 58, 86, 88, 48)",0,-3.01,"(4.32, 4.22, 5.1, 5.78, 6.6, -4.42, 0.1, 0.49,...","[False, False, False, False, False, False, Fal...",0


All looks as it should! Let's split this into training and validation sets and save this.

In [29]:
train_seq_df = seq_df[seq_df.is_valid == False]
valid_seq_df = seq_df[seq_df.is_valid == True]

### Training the model

As we saw previously, before we can feed this data into the model, we need to create lookup tables to encode our movies and users. However, this time, we need to include the padding token in our movie lookup.

In [30]:
user_lookup = {v: i+1 for i, v in enumerate(train_joke_df['UID'].unique())}

In [31]:
def create_feature_lookup(df, feature):
    lookup = {v: i+1 for i, v in enumerate(df[feature].unique())}
    lookup['[PAD]'] = 0
    return lookup

In [32]:
movie_lookup = create_feature_lookup(train_joke_df, 'JID')

Now, we are dealing with sequences of ratings, rather than individual ones, so we will need to create a new dataset to wrap our processed DataFrame:

In [33]:
class MovieSequenceDataset(Dataset):
    def __init__(self, df, movie_lookup, user_lookup):
        super().__init__()
        self.df = df
        self.movie_lookup = movie_lookup
        self.user_lookup = user_lookup

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        data = self.df.iloc[index]
        user_id = torch.tensor(self.user_lookup[str(data.UID)])
        movie_ids = torch.tensor([self.movie_lookup[j] for j in data.JID])

        previous_ratings = torch.tensor(
            [rating if rating != "[PAD]" else 0 for rating in data.previous_ratings]
        )

        attention_mask = torch.tensor(data.pad_mask).to(device)
        target_rating = data.target_rating
        encoded_features = {
            "user_id": user_id.to(device),
            "movie_ids": movie_ids.to(device),
            "ratings": previous_ratings.to(device),
        }

        return (encoded_features, attention_mask), torch.tensor(target_rating, dtype=torch.float32).to(device)


In [34]:
train_dataset = MovieSequenceDataset(train_seq_df, movie_lookup, user_lookup)
valid_dataset = MovieSequenceDataset(valid_seq_df, movie_lookup, user_lookup)

Now, let's define our transformer model! As a start, given that the matrix factorization model can achieve good performance using only the user and movie ids, let's only include this information for now.

In [35]:
class BstTransformer(nn.Module):
    def __init__(
        self,
        movies_num_unique,
        users_num_unique,
        sequence_length=10,
        embedding_size=120,
        num_transformer_layers=1,
        ratings_range=(-10, 10),
    ):
        super().__init__()
        self.sequence_length = sequence_length
        self.y_range = ratings_range
        self.movies_embeddings = nn.Embedding(movies_num_unique + 1, embedding_size, padding_idx=0)
        self.user_embeddings = nn.Embedding(users_num_unique + 1, embedding_size)
        self.position_embeddings = nn.Embedding(sequence_length, embedding_size)

        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=embedding_size,
                nhead=12,
                dropout=0.1,
                batch_first=True,
                activation="gelu",
            ),
            num_layers=num_transformer_layers,
        )

        self.linear = nn.Sequential(
            nn.Linear(embedding_size + (embedding_size * sequence_length), 1024),
            nn.BatchNorm1d(1024),
            nn.Mish(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Mish(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.Mish(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        features, mask = inputs

        encoded_user_id = self.user_embeddings(features["user_id"])

        user_features = encoded_user_id

        encoded_movies = self.movies_embeddings(features["movie_ids"])

        positions = torch.arange(0, self.sequence_length, 1, dtype=int, device=features["movie_ids"].device)
        positions = self.position_embeddings(positions)

        transformer_features = encoded_movies + positions

        transformer_output = self.encoder(transformer_features, src_key_padding_mask=mask)
        transformer_output = torch.flatten(transformer_output, start_dim=1)

        combined_output = torch.cat((transformer_output, user_features), dim=1)

        rating = self.linear(combined_output)
        rating = rating.squeeze()
        if self.y_range is None:
            return rating
        else:
            return rating * (self.y_range[1] - self.y_range[0]) + self.y_range[0]


We can see that, as a default, we feed our sequence of movie embeddings into a single transformer layer, before concatenating the output with the user features - here, just the user ID - and using this as the input to a fully connected network. Here, we are using only a simple positional encoding that is learned to represent the sequence in which the movies were rated; using a sine- and cosine-based approach provided no benefit during my experiments, but feel free to try it out if you are interested!

Once again, let's define a training function for this model; except for the model initialization, this is identical to the one we used to train the matrix factorization model.

In [36]:
for x, y in tqdm(DataLoader(train_dataset, batch_size=8)):
    #print(images[0].shape, texts[0], labels[0] , labels.shape)
    break
display(x)


  0%|          | 0/162941 [00:00<?, ?it/s]

[{'user_id': tensor([17560, 17560, 17560, 17560, 17560, 17560, 17560, 17560],
         device='cuda:0'),
  'movie_ids': tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0, 65],
          [ 0,  0,  0,  0,  0,  0,  0,  0, 65, 57],
          [ 0,  0,  0,  0,  0,  0,  0, 65, 57, 45],
          [ 0,  0,  0,  0,  0,  0, 65, 57, 45, 73],
          [ 0,  0,  0,  0,  0, 65, 57, 45, 73, 11],
          [ 0,  0,  0,  0, 65, 57, 45, 73, 11,  5],
          [ 0,  0,  0, 65, 57, 45, 73, 11,  5, 85],
          [ 0,  0, 65, 57, 45, 73, 11,  5, 85, 48]], device='cuda:0'),
  'ratings': tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            0.0000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
            3.2000],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  3.2000,
           -7.4300],
          [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  3.2000, -7.4300,
           -7.1800],
          [ 0.0000, 

In [42]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def save(model, name):
    os.mkdir(f"artifacts_wonderfund_v2/{name}")
    #torch.save(model, f"{name}/model.pkl")
    torch.save(model.state_dict(), f"artifacts_wonderfund_v2/{name}/checkpoint.pth")
    
def load(name):
    return torch.load(f"artifacts_wonderfund_v2/{name}/model.pkl")

def load2(name, model):
    model.load_state_dict(torch.load(f"artifacts_wonderfund_v2/{name}/checkpoint.pth"))
    
def train_model(epoch_start, model, train_loader, val_loader, loss, optimizer, num_epochs, scheduler, loss_train_history, loss_val_history):   
    bet_model_name = None
    best_loss = compute_accuracy(model, val_loader, loss)
    print('loss:', best_loss)
    for epoch in range(epoch_start, epoch_start + num_epochs):
        model.train()        
        t1 = time.time()
        loss_accum = 0
        for i_step, (x, y) in enumerate(tqdm(train_loader)):
            prediction = model(x)    
            loss_value = loss(prediction, y)
            optimizer.zero_grad()
            loss_value.backward()
            optimizer.step()            
            loss_accum += loss_value.cpu().detach().numpy()

        ave_loss = loss_accum / (i_step + 1)
        loss_val = compute_accuracy(model, val_loader, loss)
        
        loss_train_history.append(float(ave_loss))
        loss_val_history.append(loss_val)
        
        if scheduler != None:
            scheduler.step()
            

        if loss_val < best_loss:
            best_loss = loss_val
            bet_model_name = f'{datetime.datetime.now().strftime("%d.%m.%Y_%H.%M.%S.%f")}_epoch_{epoch}_loss_{round(best_loss, 4)}'
            save(model, bet_model_name)
            print(f"saved {bet_model_name}")

        print("Epoch: %i lr: %f; Train loss: %f, Val loss: %f, time: %i s" % (epoch, get_lr(optimizer), ave_loss, loss_val,
                                                                            round(time.time() - t1)))
    return bet_model_name
        
    
def compute_accuracy(model, loader, loss):
    """
    Computes accuracy on the dataset wrapped in a loader    
    Returns: accuracy as a float value between 0 and 1
    """
    model.eval()
    loss_accum = 0
    for i_step, (x, y) in enumerate(tqdm(loader)):
        prediction = model(x)
        loss_value = loss(prediction, y)
        loss_accum += loss_value.cpu().detach().numpy()

    ave_loss = loss_accum / (i_step + 1)         
    return float(ave_loss)

In [38]:
model = BstTransformer(len(movie_lookup), len(user_lookup), sequence_length, embedding_size=120).to(device)
loss_train_history, loss_val_history = [], []

In [39]:
#compute_accuracy(model, DataLoader(valid_dataset, batch_size=500), RMSE_loss)

In [40]:
#assert False

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)

bet_model_name = train_model(0,
    model, 
    DataLoader(train_dataset, batch_size=10000),
    DataLoader(valid_dataset, batch_size=10000),
    RMSE_loss, optimizer, 100, scheduler, loss_train_history, loss_val_history)
print('end!')
print(bet_model_name)

  0%|          | 0/15 [00:00<?, ?it/s]

loss: 4.986687850952149


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_09.25.55.412324_epoch_0_loss_4.9155
Epoch: 0 lr: 0.010000; Train loss: 4.903167, Val loss: 4.915476, time: 436 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_09.33.15.900581_epoch_1_loss_4.633
Epoch: 1 lr: 0.010000; Train loss: 4.641369, Val loss: 4.632962, time: 440 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_09.40.31.726302_epoch_2_loss_4.5568
Epoch: 2 lr: 0.010000; Train loss: 4.443023, Val loss: 4.556802, time: 436 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_09.47.49.286109_epoch_3_loss_4.4666
Epoch: 3 lr: 0.010000; Train loss: 4.364772, Val loss: 4.466634, time: 438 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 4 lr: 0.008000; Train loss: 4.309002, Val loss: 4.515197, time: 437 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_10.02.22.667561_epoch_5_loss_4.3735
Epoch: 5 lr: 0.008000; Train loss: 4.232088, Val loss: 4.373486, time: 436 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_10.09.39.490416_epoch_6_loss_4.3461
Epoch: 6 lr: 0.008000; Train loss: 4.162678, Val loss: 4.346130, time: 437 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_10.16.55.945353_epoch_7_loss_4.3257
Epoch: 7 lr: 0.008000; Train loss: 4.140000, Val loss: 4.325688, time: 436 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 8 lr: 0.008000; Train loss: 4.139509, Val loss: 4.368664, time: 434 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

saved 23.04.2023_10.31.27.915225_epoch_9_loss_4.3089
Epoch: 9 lr: 0.006400; Train loss: 4.137212, Val loss: 4.308871, time: 438 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 10 lr: 0.006400; Train loss: 4.123512, Val loss: 4.324999, time: 437 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 11 lr: 0.006400; Train loss: 4.090162, Val loss: 4.326645, time: 439 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 12 lr: 0.006400; Train loss: 4.068935, Val loss: 4.331707, time: 437 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 13 lr: 0.006400; Train loss: 4.053818, Val loss: 4.314532, time: 438 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 14 lr: 0.005120; Train loss: 4.035491, Val loss: 4.369647, time: 439 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 15 lr: 0.005120; Train loss: 4.010960, Val loss: 4.368348, time: 441 s


  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 16 lr: 0.005120; Train loss: 3.961759, Val loss: 4.310513, time: 441 s


  0%|          | 0/131 [00:00<?, ?it/s]

In [None]:
fig = plt.figure(figsize=(10, 8))    
plt.xlabel("#iteration")
plt.ylabel("loss")
plt.plot(loss_train_history, label='train loss')
plt.plot(loss_val_history, label='val loss')
fig.legend()
plt.show()

In [None]:
assert False

In [None]:


test_joke_df_nofactrating['Rating'] = np.zeros((len(test_joke_df_nofactrating)))
test_dataset = UserItemRatingDataset(test_joke_df_nofactrating, movie_lookup, user_lookup)


#best_model_name = '22.04.2023_16.39.07.862958_epoch_0_loss_10.5438'

best_model = MfDotBias(120, len(user_lookup), len(movie_lookup), ratings_range=[-10, 10]).to(device)
print(compute_accuracy(best_model, DataLoader(test_dataset, batch_size=5000), RMSE_loss))

load2(best_model_name, best_model)
print(compute_accuracy(best_model, DataLoader(test_dataset, batch_size=5000), RMSE_loss))

result = []
for x, y in tqdm(DataLoader(test_dataset, batch_size=5000)):
    predict = model(x)
    result.extend(predict.cpu().detach().numpy())
    
    
    
test_joke_df_nofactrating['Rating'] = result
test_joke_df_nofactrating['Rating'].to_frame().head(5)
test_joke_df_nofactrating['Rating'].to_frame().to_csv('nn_embedding.csv')






