# Introduction

This notebook joins the `prepare_data` and `pytorch-bst` notebooks into a single notebook so that they can be run together in Google Colab.

# Prepare data section

In [None]:
# Mount Google Drive at /content/drive; the data and weight paths used by the
# cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np
from math import sqrt
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Pick the GPU when CUDA is usable and fall back to the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


### Data loaders

In [None]:
class MovieDataset(data.Dataset):
    """Dataset of per-row movie/rating sequences read from a CSV file.

    Every row must provide the columns ``user_id``, ``sequence_movie_ids``,
    ``sequence_ratings``, ``sex``, ``age_group`` and ``occupation``. The two
    sequence columns hold Python literals (e.g. ``"[1, 2, 3]"`` or ``"1,2,3"``);
    the last element of each sequence is the prediction target and the
    preceding elements form the history.
    """

    def __init__(
        self, ratings_file, test=False
    ):
        """
        Args:
            ratings_file (string): Path to the comma-separated file to load.
            test (bool): Stored as ``self.test``; not read anywhere in this class.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
        )
        self.test = test

    def __len__(self):
        """Return the number of rows in the loaded CSV file."""
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        """Return one sample as a tuple.

        Returns:
            ``(user_id, movie_history, target_movie_id, movie_history_ratings,
            target_movie_rating, sex, age_group, occupation)`` where the two
            history entries are ``torch.LongTensor`` objects holding every
            sequence element except the last one.

        Raises:
            ValueError / SyntaxError: if a sequence cell is not a plain Python
                literal.
            IndexError: if a sequence is empty.
        """
        row = self.ratings_frame.iloc[idx]

        # literal_eval only accepts literals, so a crafted CSV cell cannot
        # execute arbitrary code the way eval() would.
        movie_ids = ast.literal_eval(row.sequence_movie_ids)
        ratings = ast.literal_eval(row.sequence_ratings)

        # The final element is what the model has to predict.
        target_movie_id = movie_ids[-1]
        target_movie_rating = ratings[-1]

        movie_history = torch.LongTensor(movie_ids[:-1])
        movie_history_ratings = torch.LongTensor(ratings[:-1])

        return (
            row.user_id,
            movie_history,
            target_movie_id,
            movie_history_ratings,
            target_movie_rating,
            row.sex,
            row.age_group,
            row.occupation,
        )

In [None]:
# Load the users, ratings and movies tables from the mounted Drive folder.
users = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/users.csv", sep=",")
ratings = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/ratings.csv", sep=",")
movies = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/movies.csv", sep=",")

In [None]:
# Preview the first five rows of the movies table.
movies.head(5)

Unnamed: 0,movie_id,title,genres,year,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,75,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,75,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,75,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,75,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,75,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One dataset per split, each backed by its own CSV file.
train_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/train.csv")
val_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/validation.csv")
test_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/test.csv")

# Only the training split is shuffled.
train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=256,
            shuffle=True
        )

# Evaluation loaders keep a fixed order: the validation routine accumulates
# per-batch means, so a changing batch composition (in particular of the
# shorter last batch) would make the reported metric vary between runs.
val_dataloader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=512,
            shuffle=False
        )

test_dataloader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=512,
            shuffle=False
        )

print("Finished Dataloaders")

Finished Dataloaders


# BST Model

In [None]:
# Names of the genre indicator columns selected from the movies table.
genres = [
    "Action", "Adventure", "Animation",
    "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy",
    "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi",
    "Thriller", "War", "Western",
]

In [None]:
class PositionalEmbedding(nn.Module):
    """
    Learned positional embedding: one trainable vector per sequence position.
    """

    def __init__(self, max_len, d_model):
        super().__init__()

        # Trainable (max_len, d_model) lookup table; row i belongs to position i.
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x):
        # Only the batch size of ``x`` is read; the complete table is handed
        # back once per sample, giving shape (batch, max_len, d_model).
        n_samples = x.shape[0]
        table = self.pe.weight[None, :, :]
        return table.repeat(n_samples, 1, 1)


class BST(nn.Module):
    """Rating regressor built around a transformer encoder layer.

    The embedded movie sequence (history followed by the target movie) is
    combined with a positional embedding, scaled by the ratings, passed through
    one ``nn.TransformerEncoderLayer``, flattened, concatenated with user
    embeddings and fed to an MLP that emits one value per sample.

    Construction reads the module-level ``users`` and ``movies`` DataFrames and
    the ``genres`` list to size the embedding tables.
    """

    # Positions seen by the transformer: the history plus the target movie.
    # The positional table has exactly this many rows, so every batch must
    # carry SEQUENCE_LENGTH - 1 history items.
    SEQUENCE_LENGTH = 8
    # Attention heads; must divide the movie embedding width.
    NUM_HEADS = 3

    def __init__(self):
        super().__init__()

        # Embedding layers

        # Users embedding using the user id and metadata like sex, age group, occupation and zip code
        self.embeddings_user_id = nn.Embedding(
            int(users.user_id.max())+1, int(math.sqrt(users.user_id.max()))+1
        )

        # Users metadata embeddings
        self.embeddings_user_sex = nn.Embedding(
            len(users.sex.unique()), int(math.sqrt(len(users.sex.unique())))
        )
        self.embeddings_age_group = nn.Embedding(
            len(users.age_group.unique()), int(math.sqrt(len(users.age_group.unique())))
        )
        self.embeddings_user_occupation = nn.Embedding(
            len(users.occupation.unique()), int(math.sqrt(len(users.occupation.unique())))
        )
        # NOTE(review): the width below is derived from the number of distinct
        # ``sex`` values, which looks like a copy-paste slip. It is left as is
        # because this layer is not used by encode_input/forward and changing
        # its shape would alter the saved state_dict.
        self.embeddings_user_zip_code = nn.Embedding(
            len(users.zip_code.unique()), int(math.sqrt(len(users.sex.unique())))
        )

        # Movies
        self.embeddings_movie_id = nn.Embedding(
            int(movies.movie_id.max())+1, int(math.sqrt(movies.movie_id.max()))+1
        )

        # Movies features embeddings using genres (not used by forward).
        genre_vectors = movies[genres].to_numpy()
        self.embeddings_movie_genre = nn.Embedding(
            genre_vectors.shape[0], genre_vectors.shape[1]
        )

        # Not used by forward.
        self.embeddings_movie_year = nn.Embedding(
            len(movies.year.unique()), int(math.sqrt(len(movies.year.unique())))
        )

        # Layer sizes are derived from the embedding tables instead of being
        # hard-coded, so they always agree with the tensors built in
        # encode_input/forward.
        d_model = self.embeddings_movie_id.embedding_dim
        user_feature_dim = (
            self.embeddings_user_id.embedding_dim
            + self.embeddings_user_sex.embedding_dim
            + self.embeddings_age_group.embedding_dim
            + self.embeddings_user_occupation.embedding_dim
        )

        self.positional_embedding = PositionalEmbedding(self.SEQUENCE_LENGTH, d_model)

        # Network
        # encode_input builds tensors laid out as (batch, sequence, feature):
        # the target is unsqueezed at dim 1 and concatenated to the history on
        # dim 1. batch_first=True makes the layer attend over the sequence
        # positions of each sample; with the default layout it would attend
        # across the samples of the batch instead.
        self.transfomerlayer = nn.TransformerEncoderLayer(
            d_model, self.NUM_HEADS, dropout=0.2, batch_first=True
        )
        self.linear = nn.Sequential(
            nn.Linear(
                self.SEQUENCE_LENGTH * d_model + user_feature_dim,
                1024,
            ),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        )
        # Kept as attributes for callers that read them; forward does not use them.
        self.criterion = torch.nn.MSELoss()

        self.opt = torch.optim.AdamW(self.parameters(), lr=0.0005)

    def encode_input(self, inputs):
        """Embed one batch.

        Args:
            inputs: iterable of eight tensors in the order ``user_id,
                movie_history, target_movie_id, movie_history_ratings,
                target_movie_rating, sex, age_group, occupation``.

        Returns:
            ``(transfomer_features, user_features, target_movie_rating,
            movie_history_ratings)``; the target rating is cast to float.
        """
        # Move the batch to wherever this module's weights live.
        module_device = self.embeddings_movie_id.weight.device
        inputs = [x.to(module_device) for x in inputs]
        user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation = inputs

        #MOVIES
        movie_history = self.embeddings_movie_id(movie_history)
        target_movie = self.embeddings_movie_id(target_movie_id)

        # Append the target movie as the last sequence position.
        target_movie = torch.unsqueeze(target_movie, 1)
        transfomer_features = torch.cat((movie_history, target_movie), dim=1)

        #USERS
        user_id = self.embeddings_user_id(user_id)

        sex = self.embeddings_user_sex(sex)
        age_group = self.embeddings_age_group(age_group)
        occupation = self.embeddings_user_occupation(occupation)
        user_features = torch.cat((user_id, sex, age_group, occupation), 1)

        return transfomer_features, user_features, target_movie_rating.float(), movie_history_ratings

    def forward(self, batch):
        """Run the model on one batch.

        Returns:
            ``(output, target_movie_rating)`` where ``output`` has one column
            per sample and ``target_movie_rating`` is the float target taken
            from the batch.
        """
        transfomer_features, user_features, target_movie_rating, movie_history_ratings = self.encode_input(batch)
        positional_embedding = self.positional_embedding(transfomer_features)

        transfomer_features = transfomer_features + positional_embedding

        # Per-position weights: each history rating divided by 5, and a fixed
        # weight of 5 / 5 for the target position whose rating is unknown.
        target_weight = torch.full(
            (movie_history_ratings.size(0), 1),
            5.0,
            device=movie_history_ratings.device,
        )
        rating_weights = torch.cat(
            (movie_history_ratings.float(), target_weight), dim=1
        ).unsqueeze(-1) / 5.0

        transfomer_features = transfomer_features * rating_weights

        transformer_output = self.transfomerlayer(transfomer_features)
        transformer_output = torch.flatten(transformer_output, start_dim=1)

        #Concat with other features
        features = torch.cat((transformer_output, user_features), dim=1)

        output = self.linear(features)
        return output, target_movie_rating


In [None]:
def validate (model, data_loader, criterion):
  """Evaluate ``model`` on ``data_loader`` with mean absolute error.

  Args:
      model: callable module returning ``(output, target)`` for a batch.
      data_loader: sized iterable of batches.
      criterion: accepted for signature compatibility; not used here, the
          metric is always ``torch.nn.L1Loss``.

  Returns:
      The sum over all batches of the per-batch mean absolute error. This is
      not divided by the number of batches; the progress bar shows that mean.

  The model is switched to eval mode and is left in eval mode on return.
  """
  mae_loss = torch.nn.L1Loss()
  model.eval()
  val_loss = 0.0

  # no_grad: evaluation needs no autograd graph, which saves memory and time.
  with torch.no_grad(), tqdm(data_loader, unit="batch", total=len(data_loader)) as batch_iterator:
    for i, batch_data in enumerate(batch_iterator, start=1):
      # Call the module itself rather than .forward so registered hooks run.
      output, target = model(batch_data)
      output = output.flatten()
      target = target.flatten()

      loss = mae_loss(output, target)
      val_loss += loss.item()

      batch_iterator.set_postfix(mean_loss=val_loss / i, current_loss=loss.item(), total_loss = val_loss)

  return val_loss

In [None]:
from tqdm.notebook import trange, tqdm

def training(model, data_loader, val_dataloader, num_epochs, criterion, optimizer, file_path=None, log_every=200):
  """Train ``model`` and validate it after every epoch.

  Args:
      model: module returning ``(output, target)`` for a batch.
      data_loader: sized iterable of training batches.
      val_dataloader: batches passed to ``validate`` after each epoch.
      num_epochs: number of passes over ``data_loader``.
      criterion: loss that is back-propagated.
      optimizer: optimizer stepped once per batch.
      file_path: when not None, the model's state_dict is saved to this path
          at the end of every epoch (overwriting the previous save).
      log_every: print the mean training MAE of the last ``log_every`` batches
          every ``log_every`` batches; 0 or None disables these prints.

  Returns:
      The same ``model`` object, trained in place.
  """
  mae_loss = torch.nn.L1Loss()

  for epoch in trange(num_epochs, desc="training", unit="epoch"):

    with tqdm(data_loader, desc="epoch {}".format(epoch + 1), unit="batch", total=len(data_loader)) as batch_iterator:
      model.train()
      total_loss = 0.0
      running_mae = 0.0
      for i, batch_data in enumerate(batch_iterator, start=1):
        optimizer.zero_grad()

        output, target = model(batch_data)
        output = output.flatten()
        target = target.flatten()

        loss = criterion(output, target)
        total_loss += loss.item()

        # The MAE is only a monitoring metric, so it is computed without
        # recording an autograd graph.
        with torch.no_grad():
          running_mae += mae_loss(output, target).item()

        loss.backward()
        optimizer.step()

        batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item(), total_loss=total_loss)

        if log_every and i % log_every == 0:
          print(f"Running Train Loss: {running_mae/log_every}")
          running_mae = 0.0

    print("Validation Set")
    validate(model, val_dataloader, criterion)

    if file_path is not None:
      torch.save(model.state_dict(), file_path)
  return model


In [None]:
# Instantiate the model on the selected device and fit it for two epochs
# using an MSE objective optimised with AdamW.
bst = BST().to(device)

optimizer = torch.optim.AdamW(bst.parameters(), lr=5e-4)
criterion = torch.nn.MSELoss()
mae_loss = torch.nn.L1Loss()

training(
    model=bst,
    data_loader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=2,
    criterion=criterion,
    optimizer=optimizer,
)

In [None]:
# Persist the trained parameters (state_dict only) to Drive.
torch.save(bst.state_dict(), "/content/drive/MyDrive/WSTM_latest/weights/bst_noemb.pth")

In [None]:
# Evaluate on the held-out test loader; the cell displays the value returned
# by validate (the per-batch MAE summed over all batches).
validate(bst, test_dataloader, criterion)