<a href="https://colab.research.google.com/github/alrz199/recommender-system/blob/main/pytorch_recommender_deep_negative_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep recommender system for MovieLens data
### This dataset contains users and the ratings they gave to the movies they have watched



In [None]:
# Loading data... The data can be downloaded from the Internet.
# DATA_DIR is the single place to edit when the CSV files live somewhere else
# (the default below looks like a Google Drive folder mounted in Colab).
import pandas as pd
import numpy as np
DATA_DIR = '/content/drive/MyDrive/ML_100k'
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
rating = pd.read_csv(f'{DATA_DIR}/ratings.csv')

In [None]:
# Interpret the raw `timestamp` values as seconds since the Unix epoch, store
# them back as pandas datetimes, then preview the first two rows.
timestamps_as_datetime = pd.to_datetime(rating['timestamp'], unit='s')
rating['timestamp'] = timestamps_as_datetime
rating.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47


In [None]:
!pip --quiet install pytorch_lightning

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

In [None]:
# Preview the first three rows of the ratings table.
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04


In [None]:
# Size of the ratings table as (rows, columns).
rating.shape

(100836, 4)

In [None]:
# In the current table, user IDs start from 1; embedding layers index from 0.
# Subtracting the current minimum (rather than a literal 1) gives the same
# result on the first run and turns a re-run of this cell into a no-op instead
# of shifting the IDs a second time.
rating.userId = rating.userId - rating.userId.min()

In [None]:
# Make movie IDs 0-based. Subtracting the current minimum (rather than a
# literal 1) is equivalent on the first run and keeps the cell idempotent:
# running it again leaves the already shifted IDs unchanged.
rating.movieId = rating.movieId - rating.movieId.min()

### Normalize the ratings to the (0, 1] range

In [None]:
# Largest and smallest rating value, checked before rescaling.
rating.rating.max(),rating.rating.min()

(5.0, 0.5)

In [None]:
# Scale the ratings (observed range 0.5 to 5.0) by the top of the rating scale.
# The guard keeps the cell idempotent: once the maximum is <= 1 the column has
# already been scaled and must not be divided again.
MAX_RATING = 5
if rating.rating.max() > 1:
    rating.rating = rating.rating / MAX_RATING

In [None]:
# Leave-one-out split: rank every user's ratings from newest (rank 1) to oldest.
# The newest rating of each user forms the test set; everything else is used
# for training. `method='first'` breaks timestamp ties by row order, so exactly
# one row per user gets rank 1.
rating['rank_latest'] = (
    rating.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
)

is_latest = rating['rank_latest'] == 1
# Columns still needed downstream; the timestamp and the rank are dropped.
kept_columns = ['userId', 'movieId', 'rating']

train_ratings = rating.loc[~is_latest, kept_columns]
test_ratings = rating.loc[is_latest, kept_columns]

In [None]:
# Preview the ratings table, now including the per-user `rank_latest` column.
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
0,0,0,0.8,2000-07-30 18:45:03,86.0
1,0,2,0.8,2000-07-30 18:20:47,196.0
2,0,5,0.8,2000-07-30 18:37:04,141.0
3,0,46,1.0,2000-07-30 19:03:35,18.0
4,0,49,1.0,2000-07-30 18:48:51,66.0


In [None]:
# Number of distinct users, and the table's (rows, columns) shape.
rating.userId.nunique(),rating.shape

(610, (100836, 5))

In [None]:
# creating the dataset for our observations
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
class mvl_Dataset(Dataset):
    """(user, item, label) samples built from a ratings table.

    Args:
        ratings: DataFrame with `userId`, `movieId` and `rating` columns.
        istrain: 1 builds the training set (every observed rating plus sampled
            negatives); any other value uses the rows of `ratings` unchanged.
        num_negatives: number of negative (label 0) items drawn for every
            observed rating in training mode. Defaults to 4.

    Attributes:
        users, items: int64 tensors of user / movie indices.
        labels: float32 tensor of ratings (0 for sampled negatives). The dtype
            is the same in both modes so the loss sees one consistent type.
    """

    def __init__(self, ratings, istrain, num_negatives=4):
        self.flag = istrain
        self.num_negatives = num_negatives
        self.users, self.items, self.labels = self.get_dataset(ratings)

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    # we can call len(dataset) to return the size
    def __len__(self):
        return len(self.users)

    def get_dataset(self, ratings):
        """Return the `(users, items, labels)` tensors for `ratings`.

        In training mode each observed rating is followed by `num_negatives`
        movies the user has no rating for in `ratings`, labelled 0. Negatives
        are drawn with replacement from the movies present in `ratings`.
        """
        if self.flag == 1:
            users, items, labels = [], [], []
            all_movie_ids = set(ratings['movieId'].unique().tolist())
            # Each distinct (user, movie, rating) triple is one positive sample.
            positives = ratings[['userId', 'movieId', 'rating']].drop_duplicates()

            # Work user by user so the "movies not rated by this user" pool is
            # built once per user rather than once per sample.
            for user_id, user_rows in positives.groupby('userId'):
                rated = user_rows['movieId'].tolist()
                candidates = np.array(sorted(all_movie_ids - set(rated)))
                # A user who rated every movie has nothing to sample from.
                n_neg = self.num_negatives if len(candidates) else 0
                if n_neg:
                    # One row of negatives per positive sample of this user.
                    negatives = np.random.choice(candidates, size=(len(rated), n_neg))

                for row, (movie_id, score) in enumerate(zip(rated, user_rows['rating'].tolist())):
                    users.append(user_id)
                    items.append(movie_id)
                    labels.append(score)
                    if n_neg:
                        users.extend([user_id] * n_neg)
                        items.extend(negatives[row].tolist())
                        labels.extend([0.0] * n_neg)
        else:
            users = ratings['userId'].tolist()
            items = ratings['movieId'].tolist()
            labels = ratings['rating'].tolist()

        return (torch.tensor(users, dtype=torch.long),
                torch.tensor(items, dtype=torch.long),
                torch.tensor(labels, dtype=torch.float32))

In [None]:
# Training set (istrain=1): observed ratings plus sampled negatives, served in
# shuffled mini-batches of 64 by two worker processes.
train_dataset = mvl_Dataset(train_ratings, 1)
train_loader_options = dict(batch_size=64, shuffle=True, num_workers=2)
train_data_loader = DataLoader(train_dataset, **train_loader_options)

In [None]:
# Test set (istrain=0): the held-out ratings as they are, with no negatives
# added, served in their original order in batches of 64.
test_dataset = mvl_Dataset(test_ratings, 0)
test_loader_options = dict(batch_size=64, shuffle=False, num_workers=2)
test_data_loader = DataLoader(test_dataset, **test_loader_options)

In [None]:
# look at one random batch of data
# The labels are bound to `rating_batch`, not `rating`: `rating` is the ratings
# DataFrame that later cells still read, and unpacking into it would replace
# the table with a tensor.
dataiter = iter(train_data_loader)
data = next(dataiter)
user, item, rating_batch = data
print(user.shape, item.shape, rating_batch.shape)

  self.pid = os.fork()


torch.Size([64]) torch.Size([64]) torch.Size([64])


In [None]:
#import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Create our deep neural network. We will use two optimizers:
# the first layers are embeddings, and we want to keep their optimizer separate from
# the one used for the rest of the parameters in the network.
import pytorch_lightning as pl
learning_rate1=0.01  # learning rate of the SparseAdam optimizer (embedding tables)
learning_rate2=0.001  # learning rate of the AdamW optimizer (linear layers)
n_users=610  # number of rows in the user embedding table
n_factors=8  # embedding dimension of each user / item vector
n_items=193609  # rows in the item embedding table; presumably largest movieId + 1 -- TODO confirm
H1=4  # width of the hidden linear layer
D_out=1  # output size of the network (one predicted score per pair)
class nnn(pl.LightningModule):
    """Embedding + MLP rating predictor trained with two optimizers.

    A user embedding and an item embedding are concatenated and passed through
    `linear1 -> ReLU -> linear2` to produce one score per (user, item) pair.
    The sparse embedding tables are updated by SparseAdam and the dense linear
    layers by AdamW, so optimization is done manually in `training_step`.

    Sizes and learning rates are read from the module-level names `n_users`,
    `n_items`, `n_factors`, `H1`, `D_out`, `learning_rate1`, `learning_rate2`.
    """

    def __init__(self):
        super().__init__()
        # Two optimizers are stepped by hand, so Lightning must not do it.
        self.automatic_optimization = False
        # user and item embedding layers (sparse gradients)
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)
        # linear layers
        self.linear1 = torch.nn.Linear(n_factors * 2, H1)
        self.linear2 = torch.nn.Linear(H1, D_out)

    def forward(self, x, y):
        """Score user indices `x` against item indices `y`.

        Returns the raw output of `linear2`, one row per input pair with
        `D_out` columns.
        """
        users_embedding = self.user_factors(x)
        items_embedding = self.item_factors(y)
        # concatenate user and item embeddings to form input
        features = torch.cat([users_embedding, items_embedding], 1)
        h1_relu = F.relu(self.linear1(features))
        return self.linear2(h1_relu)

    def _predict_and_target(self, batch):
        """Return predictions and labels with matching shape and dtype.

        `forward` yields (batch, 1) while the labels are (batch,). Passing
        them to `mse_loss` unchanged broadcasts to a (batch, batch) matrix and
        compares every prediction with every label, so the trailing axis is
        dropped and the labels are cast to the prediction dtype.
        """
        x, y, z = batch
        out = self.forward(x, y).squeeze(-1)
        return out, z.to(out.dtype)

    def training_step(self, batch, batch_idx):
        """Run one manual optimization step on the RMSE of the batch."""
        out, target = self._predict_and_target(batch)
        # rmse loss
        loss = torch.sqrt(F.mse_loss(out, target))
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("train_loss", loss)
        op1, op2 = self.optimizers()
        op1.zero_grad()
        op2.zero_grad()
        self.manual_backward(loss)
        op1.step()
        op2.step()
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the batch MSE as `val_loss` (no square root, unlike training)."""
        out, target = self._predict_and_target(batch)
        loss = F.mse_loss(out, target)
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return `[SparseAdam for the embeddings, AdamW for the linear layers]`.

        Parameters are collected from the layers by name, so the split does
        not depend on the order in which the layers were registered.
        """
        embedding_params = (list(self.user_factors.parameters())
                            + list(self.item_factors.parameters()))
        dense_params = (list(self.linear1.parameters())
                        + list(self.linear2.parameters()))
        return [
            torch.optim.SparseAdam(embedding_params, lr=learning_rate1),
            torch.optim.AdamW(dense_params, lr=learning_rate2),
        ]


In [None]:
# Instantiate the network and print its layer summary.
model=nnn()
print(model)

nnn(
  (user_factors): Embedding(610, 8, sparse=True)
  (item_factors): Embedding(193609, 8, sparse=True)
  (linear1): Linear(in_features=16, out_features=4, bias=True)
  (linear2): Linear(in_features=4, out_features=1, bias=True)
)


In [None]:
# Train a fresh model for at most 20 epochs, stopping early when the logged
# `val_loss` stops improving.
# NOTE(review): the intent is to regenerate the negative samples every epoch,
# which is why dataloaders are reloaded each epoch; presumably this has no
# effect when fit() is handed an already-built DataLoader -- confirm.
from pytorch_lightning import  Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
early_stopping = EarlyStopping(monitor="val_loss")
model = nnn()
trainer = Trainer(
    max_epochs=20,
    reload_dataloaders_every_n_epochs=1,
    callbacks=[early_stopping],
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Fit the model on the training loader. The loader passed as `val_dataloaders`
# is the held-out test loader, so the `val_loss` that drives early stopping is
# computed on each user's latest rating.
trainer.fit(model, train_data_loader,val_dataloaders=test_data_loader)
print('Finished Training')

INFO:pytorch_lightning.callbacks.model_summary:
  | Name         | Type      | Params
-------------------------------------------
0 | user_factors | Embedding | 4.9 K 
1 | item_factors | Embedding | 1.5 M 
2 | linear1      | Linear    | 68    
3 | linear2      | Linear    | 5     
-------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.215     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()
  loss = F.mse_loss(out,z)


Training: |          | 0/? [00:00<?, ?it/s]

  loss = F.mse_loss(out,z)
  loss = F.mse_loss(out,z)
  self.pid = os.fork()


Validation: |          | 0/? [00:00<?, ?it/s]

  loss = F.mse_loss(out,z)


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Finished Training


In [None]:
# Start tensorboard.
# Load the TensorBoard notebook extension, then open the dashboard on the
# `lightning_logs/` directory (presumably the Trainer's default log location).
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

#### Now we evaluate the model with the hit ratio. For each user we build a list of 100 movies: the one held-out movie the user actually watched and 99 movies the user has not seen. The model scores all 100 movies, and the ten with the highest scores are the ones we would recommend to the user. If the movie the user actually watched is among those ten recommendations, we count it as a hit.

In [None]:
# User-item pairs for testing
from tqdm.notebook import tqdm

TOP_K = 10               # a hit means the held-out movie ranks inside the top K
NUM_EVAL_NEGATIVES = 99  # unseen movies ranked against each held-out movie

# Rebuild the full interaction table from the two splits, so this cell does not
# depend on the `rating` name still holding the ratings DataFrame.
interactions = pd.concat([train_ratings, test_ratings])[['userId', 'movieId']]
all_movieIds = interactions['movieId'].unique()
all_movie_id_set = set(all_movieIds.tolist())
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = interactions.groupby('userId')['movieId'].apply(list).to_dict()

model.eval()
hits = []
with torch.no_grad():  # inference only, no gradients needed
    for (u, i) in tqdm(test_user_item_set):
        not_interacted_items = sorted(all_movie_id_set - set(user_interacted_items[u]))
        # Draw distinct negatives; cap the count if the user has fewer unseen movies.
        n_negatives = min(NUM_EVAL_NEGATIVES, len(not_interacted_items))
        selected_not_interacted = np.random.choice(
            not_interacted_items, n_negatives, replace=False).tolist()
        # The held-out movie goes first so its score is always scores[0].
        test_items = [i] + selected_not_interacted

        scores = model(torch.tensor([u] * len(test_items)),
                       torch.tensor(test_items)).reshape(-1).numpy()

        # Rank = number of negatives scoring at least as high as the held-out
        # movie. Ties count against the positive, so a model that gives every
        # movie the same score gets no hits instead of a perfect hit ratio.
        rank = int((scores[1:] >= scores[0]).sum())
        hits.append(1 if rank < TOP_K else 0)

print("The Hit Ratio @ {} is {:.2f}".format(TOP_K, np.average(hits)))

  0%|          | 0/610 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 1.00
