# Introduction

This notebook combines the prepare_data and pytorch-bst notebooks so that they can be run together in Google Colab.

# Prepare data section

In [None]:
# Colab only: attach Google Drive so the files under /content/drive are readable.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np
from math import sqrt
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence
import pickle

In [None]:
# Report CUDA availability, then pick the GPU when present and the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

True
Using device: cuda


### Data loaders

In [None]:
class MovieDataset(data.Dataset):
    """Per-row movie rating sequences read from a CSV file.

    Each row must provide the columns ``user_id``, ``sequence_movie_ids``,
    ``sequence_ratings``, ``sex``, ``age_group`` and ``occupation``. The two
    sequence columns hold Python list literals stored as text (for example
    ``"[12, 7, 33]"``). The last element of each sequence is the prediction
    target; the preceding elements are the history.
    """

    def __init__(
        self, ratings_file, test=False
    ):
        """
        Args:
            ratings_file (string): Path to the comma-separated file with one
                sequence per row.
            test (bool): Stored as ``self.test``; not read anywhere in this class.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
        )
        self.test = test

    def __len__(self):
        """Return the number of rows (sequences) in the file."""
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        """Return one sample as an 8-tuple.

        Returns:
            ``(user_id, movie_history, target_movie_id, movie_history_ratings,
            target_movie_rating, sex, age_group, occupation)`` where the two
            history entries are ``torch.LongTensor`` objects holding every
            sequence element except the last, and the two target entries are
            the last element of the respective sequence.

        Raises:
            ValueError / SyntaxError: if a sequence cell is not a plain
                Python literal.
        """
        row = self.ratings_frame.iloc[idx]

        # literal_eval accepts only Python literals, so a crafted CSV cell
        # cannot execute code the way it could with eval().
        movie_ids = ast.literal_eval(row.sequence_movie_ids)
        ratings = ast.literal_eval(row.sequence_ratings)

        # The final element is the target; everything before it is history.
        target_movie_id = movie_ids[-1]
        target_movie_rating = ratings[-1]
        movie_history = torch.LongTensor(movie_ids[:-1])
        movie_history_ratings = torch.LongTensor(ratings[:-1])

        return (
            row.user_id,
            movie_history,
            target_movie_id,
            movie_history_ratings,
            target_movie_rating,
            row.sex,
            row.age_group,
            row.occupation,
        )

In [None]:
# User, rating and movie tables, read as comma-separated files from Drive.
users = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/users.csv", sep=",")
ratings = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/ratings.csv", sep=",")
movies = pd.read_csv("/content/drive/MyDrive/WSTM_latest/data/movies.csv", sep=",")

In [None]:
# Row count of the users table; BST.encode_input adds it to movie ids when indexing ngcf_emb.
num_users = users.shape[0]

In [None]:
train_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/train.csv")
val_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/validation.csv")
test_dataset = MovieDataset("/content/drive/MyDrive/WSTM_latest/data/test.csv")

# Only the training data is shuffled. Evaluation loaders keep file order so
# that the batches (and therefore the summed per-batch metrics reported by
# validate) are the same on every run.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=256,
    shuffle=True,
)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=512,
    shuffle=False,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=512,
    shuffle=False,
)

print("Finished Dataloaders")

Finished Dataloaders


# BST Model

In [None]:
# Load the NGCF model embeddings.
# NOTE: unpickling can execute arbitrary code; only load this file from a trusted source.
with open("/content/drive/MyDrive/WSTM_latest/data/ngcf_emb_18.pkl", "rb") as ngcf_file:
    # The context manager closes the file handle once the object is loaded.
    ngcf_emb = pickle.load(ngcf_file)

In [None]:
# Names of the genre columns selected from the movies table (movies[genres]).
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

In [None]:
# Row layout: user IDs come first, followed by movie IDs.
ngcf_emb.shape

torch.Size([9993, 18])

In [None]:
class PositionalEmbedding(nn.Module):
    """
    Learned positional embedding: a trainable table with one ``d_model``-sized
    row per sequence position, returned unchanged for every sample of a batch.
    """

    def __init__(self, max_len, d_model):
        super().__init__()

        # Trainable lookup table of shape (max_len, d_model); nothing is precomputed.
        self.pe = nn.Embedding(max_len, d_model)

    def forward(self, x):
        # Only the batch size of x is read; the full table is copied once per sample,
        # giving a tensor of shape (batch, max_len, d_model).
        table = self.pe.weight[None, :, :]
        return table.repeat(x.size(0), 1, 1)


class BST(nn.Module):
    """Rating predictor: one transformer encoder layer over the user's movie
    sequence (history plus target movie), followed by an MLP that also sees
    the user features.

    The class reads module-level objects defined elsewhere in the notebook:
    ``users``, ``movies``, ``genres``, ``ngcf_emb``, ``num_users`` and
    ``device``.
    """

    def __init__(self):
        super().__init__()

        # Embedding layers
        # Users
        self.embeddings_user_id = nn.Embedding(
            int(users.user_id.max())+1, int(math.sqrt(users.user_id.max()))+1
        )
        # Users features embeddings
        self.embeddings_user_sex = nn.Embedding(
            len(users.sex.unique()), int(math.sqrt(len(users.sex.unique())))
        )
        self.embeddings_age_group = nn.Embedding(
            len(users.age_group.unique()), int(math.sqrt(len(users.age_group.unique())))
        )
        self.embeddings_user_occupation = nn.Embedding(
            len(users.occupation.unique()), int(math.sqrt(len(users.occupation.unique())))
        )
        # NOTE(review): the width below is derived from users.sex, which looks
        # like a copy-paste slip. It is left unchanged so that previously saved
        # state_dicts keep loading; this embedding is not used in
        # encode_input/forward.
        self.embeddings_user_zip_code = nn.Embedding(
            len(users.zip_code.unique()), int(math.sqrt(len(users.sex.unique())))
        )

        # Movies
        self.embeddings_movie_id = nn.Embedding(
            int(movies.movie_id.max())+1, int(math.sqrt(movies.movie_id.max()))+1
        )

        # Movies features embeddings (genre and year are registered but not
        # used in encode_input/forward).
        genre_vectors = movies[genres].to_numpy()
        self.embeddings_movie_genre = nn.Embedding(
            genre_vectors.shape[0], genre_vectors.shape[1]
        )

        self.embeddings_movie_year = nn.Embedding(
            len(movies.year.unique()), int(math.sqrt(len(movies.year.unique())))
        )

        # Project the 18-dimensional NGCF vectors to 36 dimensions.
        self.movie_ngcf_emb_extend = torch.nn.Linear(
            18, 36
        )
        self.user_ngcf_emb_extend = torch.nn.Linear(
            18, 36
        )

        # 8 positions; 99 must equal the movie-id embedding width plus the 36
        # NGCF features (63 + 36 for the data printed in this notebook).
        self.positional_embedding = PositionalEmbedding(8, 99)

        # Network
        # encode_input builds (batch, position, feature) tensors, so the layer
        # must be told that the batch dimension comes first. With the default
        # layout, attention would run across the samples of a batch instead of
        # across the positions of each sequence.
        self.transfomerlayer = nn.TransformerEncoderLayer(
            99, 9, dropout=0.2, batch_first=True
        )
        # 913 = 8 * 99 flattened transformer features + the concatenated user
        # features (121 wide for the data printed in this notebook).
        self.linear = nn.Sequential(
            nn.Linear(
                913,
                1024,
            ),
            nn.LeakyReLU(),
            nn.Linear(1024, 512),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            nn.LeakyReLU(),
            nn.Linear(256, 1),
        )
        # Kept as attributes for callers that use them; the training cell in
        # this notebook builds its own criterion and optimizer.
        self.criterion = torch.nn.MSELoss()

        self.opt = torch.optim.AdamW(self.parameters(), lr=0.0005)

    def encode_input(self, inputs):
        """Turn one batch into model features.

        Args:
            inputs: the 8 batched fields ``(user_id, movie_history,
                target_movie_id, movie_history_ratings, target_movie_rating,
                sex, age_group, occupation)``; every field is moved to
                ``device``.

        Returns:
            ``(transfomer_features, user_features, target_movie_rating,
            movie_history_ratings)`` where ``transfomer_features`` holds the
            history positions followed by the target movie along dim 1, and
            ``target_movie_rating`` is cast to float.
        """
        inputs = [x.to(device) for x in inputs]
        user_id, movie_history, target_movie_id, movie_history_ratings, target_movie_rating, sex, age_group, occupation = inputs

        # MOVIES
        # ngcf_emb stores user rows first and movie rows after them, so every
        # movie id (history AND target) is shifted by num_users.
        movie_ngcf_extend = self.movie_ngcf_emb_extend(ngcf_emb[movie_history + num_users])
        target_movie_ngcf_extend = self.movie_ngcf_emb_extend(ngcf_emb[target_movie_id + num_users])

        movie_history = torch.cat(
            (self.embeddings_movie_id(movie_history), movie_ngcf_extend), dim=2
        )
        target_movie = torch.cat(
            (self.embeddings_movie_id(target_movie_id), target_movie_ngcf_extend), dim=1
        )

        # Append the target movie as the last position of the sequence.
        target_movie = torch.unsqueeze(target_movie, 1)
        transfomer_features = torch.cat((movie_history, target_movie), dim=1)

        # USERS
        ngcf_user_extend = self.user_ngcf_emb_extend(ngcf_emb[user_id])
        user_features = torch.cat(
            (
                self.embeddings_user_id(user_id),
                self.embeddings_user_sex(sex),
                self.embeddings_age_group(age_group),
                self.embeddings_user_occupation(occupation),
                ngcf_user_extend,
            ),
            1,
        )

        return transfomer_features, user_features, target_movie_rating.float(), movie_history_ratings

    def forward(self, batch):
        """Predict the target movie rating for one batch.

        Returns:
            ``(output, target_movie_rating)``: the output of the final linear
            layer (one value per sample) and the float target ratings taken
            from the batch.
        """
        transfomer_features, user_features, target_movie_rating, movie_history_ratings = self.encode_input(batch)
        transfomer_features = transfomer_features + self.positional_embedding(transfomer_features)

        # Scale each position by rating / 5. The target position has no known
        # rating, so it gets a constant 5 and is therefore left unscaled.
        target_weight = torch.full(
            (movie_history_ratings.size(0), 1), 5.0, device=movie_history_ratings.device
        )
        rating_weights = torch.cat((movie_history_ratings, target_weight), 1)[:, :, None] / 5.0
        transfomer_features = transfomer_features * rating_weights

        transformer_output = self.transfomerlayer(transfomer_features)
        transformer_output = torch.flatten(transformer_output, start_dim=1)

        # Concat with other features
        features = torch.cat((transformer_output, user_features), dim=1)

        output = self.linear(features)
        return output, target_movie_rating


In [None]:
def validate (model, data_loader, criterion):
  """Evaluate ``model`` on ``data_loader`` with mean absolute error.

  Args:
      model: module whose call returns ``(output, target)`` for a batch.
      data_loader: iterable of batches; must support ``len``.
      criterion: accepted for signature compatibility; not used, the metric
          is always ``torch.nn.L1Loss``.

  Returns:
      The SUM of the per-batch MAE values (not their mean). The running mean
      is shown in the progress bar.
  """
  mae_loss = torch.nn.L1Loss()
  model.eval()
  val_loss = 0.0
  # Evaluation needs no gradients; no_grad avoids building the autograd graph.
  with torch.no_grad(), tqdm(data_loader, unit="batch", total=len(data_loader)) as batch_iterator:
    for i, batch_data in enumerate(batch_iterator, start=1):
      output, target = model(batch_data)
      loss = mae_loss(output.flatten(), target.flatten())
      val_loss += loss.item()

      batch_iterator.set_postfix(mean_loss=val_loss / i, current_loss=loss.item(), total_loss = val_loss)

  return val_loss

In [None]:
from tqdm.notebook import trange, tqdm

def training(model, data_loader, val_dataloader, num_epochs, criterion, optimizer, file_path=None, log_every=200):
  """Train ``model`` and run ``validate`` after every epoch.

  Args:
      model: module whose call returns ``(output, target)`` for a batch.
      data_loader: training batches; must support ``len``.
      val_dataloader: batches passed to ``validate`` after each epoch.
      num_epochs: number of passes over ``data_loader``.
      criterion: loss that is back-propagated.
      optimizer: optimizer stepped once per batch.
      file_path: if given, ``model.state_dict()`` is saved there after every
          epoch (each save overwrites the previous one).
      log_every: print the mean training MAE every this many batches; 0 or
          ``None`` disables the printout.

  Returns:
      The same ``model`` object, trained in place.
  """
  mae_loss = torch.nn.L1Loss()

  for epoch in trange(num_epochs, desc="training", unit="epoch"):
    model.train()
    total_loss = 0.0
    running_mae = 0.0

    with tqdm(data_loader, desc="epoch {}".format(epoch + 1), unit="batch", total=len(data_loader)) as batch_iterator:
      for i, batch_data in enumerate(batch_iterator, start=1):
        optimizer.zero_grad()

        output, target = model(batch_data)
        output = output.flatten()
        target = target.flatten()

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # MAE is tracked for reporting only, so it is computed on a detached output.
        running_mae += mae_loss(output.detach(), target).item()

        batch_iterator.set_postfix(mean_loss=total_loss / i, current_loss=loss.item(), total_loss=total_loss)

        if log_every and i % log_every == 0:
          print(f"Running Train Loss: {running_mae / log_every}")
          running_mae = 0.0

    print("Validation Set")
    validate(model, val_dataloader, criterion)

    if file_path is not None:
      torch.save(model.state_dict(), file_path)
  return model


In [None]:
bst = BST().to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(bst.parameters(), lr=0.0005)
mae_loss = torch.nn.L1Loss()

# Monitor on the validation split while training; the test split is kept for
# the final evaluation at the end of the notebook.
# training returns the same model object; binding the result keeps the full
# module repr out of the cell output.
bst = training(bst, train_dataloader, val_dataloader, 2, criterion, optimizer)

training:   0%|          | 0/2 [00:00<?, ?epoch/s]

epoch 1:   0%|          | 0/2958 [00:00<?, ?batch/s]

Running Train Loss: 0.8921073868870735
Running Train Loss: 0.8113760507106781
Running Train Loss: 0.7922285917401314
Running Train Loss: 0.7848416057229042
Running Train Loss: 0.7794637820124626
Running Train Loss: 0.771085716187954
Running Train Loss: 0.7632506522536278
Running Train Loss: 0.7609473821520806
Running Train Loss: 0.7643308424949646
Running Train Loss: 0.75472895860672
Running Train Loss: 0.7561417597532273
Running Train Loss: 0.7518568202853203
Running Train Loss: 0.7526811963319778
Running Train Loss: 0.7432983928918838
Validation Set


  0%|          | 0/180 [00:00<?, ?batch/s]

epoch 2:   0%|          | 0/2958 [00:00<?, ?batch/s]

Running Train Loss: 0.739039899110794
Running Train Loss: 0.7359848171472549
Running Train Loss: 0.7405188396573067
Running Train Loss: 0.7388720816373825
Running Train Loss: 0.7330356141924859
Running Train Loss: 0.7352435842156411
Running Train Loss: 0.7260927423834801
Running Train Loss: 0.724106507897377
Running Train Loss: 0.73085622549057
Running Train Loss: 0.7268070691823959
Running Train Loss: 0.7317532649636269
Running Train Loss: 0.7268500164151191
Running Train Loss: 0.7241646924614906
Running Train Loss: 0.7261874145269394
Validation Set


  0%|          | 0/180 [00:00<?, ?batch/s]

BST(
  (embeddings_user_id): Embedding(6041, 78)
  (embeddings_user_sex): Embedding(2, 1)
  (embeddings_age_group): Embedding(7, 2)
  (embeddings_user_occupation): Embedding(21, 4)
  (embeddings_user_zip_code): Embedding(3439, 1)
  (embeddings_movie_id): Embedding(3953, 63)
  (embeddings_movie_genre): Embedding(3883, 18)
  (embeddings_movie_year): Embedding(81, 9)
  (movie_ngcf_emb_extend): Linear(in_features=18, out_features=36, bias=True)
  (user_ngcf_emb_extend): Linear(in_features=18, out_features=36, bias=True)
  (positional_embedding): PositionalEmbedding(
    (pe): Embedding(8, 99)
  )
  (transfomerlayer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=99, out_features=99, bias=True)
    )
    (linear1): Linear(in_features=99, out_features=2048, bias=True)
    (dropout): Dropout(p=0.2, inplace=False)
    (linear2): Linear(in_features=2048, out_features=99, bias=True)
    (norm1): LayerNorm((99,), eps=1e

In [None]:
# Persist the trained parameters (state_dict only) to Drive.
torch.save(
    bst.state_dict(),
    "/content/drive/MyDrive/WSTM_latest/weights/ngcf_bst.pth",
)

In [None]:
# Final evaluation on the held-out test split. The trained model is bound to
# `bst`; no variable named `model` exists at notebook level.
validate(bst, test_dataloader, criterion)