<a href="https://colab.research.google.com/github/alrz199/recommender-system/blob/main/graphSage_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Deep recommender system for IMDB data
### our approach in this notebook will be graph-based neural networks

In [None]:
# Load the pre-processed ratings table into a DataFrame.
# NOTE(review): the path points into a mounted Google Drive folder; no visible
# cell mounts Drive, so presumably that is done manually beforehand — confirm.
import pandas as pd
import numpy as np
rating = pd.read_csv('/content/drive/MyDrive/ML_pyg_100k/rating_for_pyg2.csv')

In [None]:
!pip --quiet install pytorch_lightning
!pip --quiet install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cpu.html

In [None]:
rating.head(3)
# Compared with the publicly available ratings file, this table carries three
# boolean masks (train_mask, test_mask, val_mask) that define the data split.
# Users and movies are both nodes of one graph, so the movie node ids (Mid) are
# shifted by 610 so that they do not collide with the user node ids (Uid).

Unnamed: 0,userId,movieId,rating,timestamp,Uid,Mid,rank_latest,train_mask,test_mask,val_mask
0,1,1,0.8,2000-07-30 18:45:03,0,610,86.0,True,False,False
1,1,3,0.8,2000-07-30 18:20:47,0,612,196.0,True,False,False
2,1,6,0.8,2000-07-30 18:37:04,0,615,141.0,True,False,False


In [None]:
# (rows, columns) of the full ratings table
rating.shape

(100836, 10)

In [None]:
# Divide the data into train / validation / test using the boolean mask columns
train_ratings, val_ratings, test_ratings = (
    rating.loc[rating[mask_column]]
    for mask_column in ('train_mask', 'val_mask', 'test_mask')
)

In [None]:
# Make sure the validation and test sets contain no items or users that are absent
# from the training set. The original code only filtered on items ('Mid'); users
# ('Uid') are now filtered as well so the code matches the stated intent.
seen_movies = train_ratings['Mid']
seen_users = train_ratings['Uid']
val_ratings=val_ratings[val_ratings['Mid'].isin(seen_movies) & val_ratings['Uid'].isin(seen_users)]
test_ratings=test_ratings[test_ratings['Mid'].isin(seen_movies) & test_ratings['Uid'].isin(seen_users)]

In [None]:
# (rows, columns) of each split after the filtering step
train_ratings.shape,val_ratings.shape,test_ratings.shape

((99616, 10), (588, 10), (595, 10))

### Next we need features for the graph nodes. We build two pivot tables (movie × user and user × movie) and apply PCA to each of them to obtain low-dimensional features for the movies and the users.

In [None]:
# Movie x user rating matrix used to derive movie features.
# It is built from the TRAINING ratings only: using the full `rating` table would leak
# the held-out validation/test ratings into the node features.
table = pd.pivot_table(train_ratings, values='rating', index=['Mid'],
                    columns=['Uid'], aggfunc='sum')
# Re-add movies/users that have no training rating (as all-NaN rows/columns) so that
# row k still corresponds to the k-th node id once the table is stacked into the feature matrix.
table = table.reindex(index=np.sort(rating['Mid'].unique()),
                      columns=np.sort(rating['Uid'].unique()))

In [None]:
# Missing ratings become 0 and rows are ordered by movie id; then preview the result
table = table.fillna(0).sort_index()
table.head(3)

Uid,0,1,2,3,4,5,6,7,8,9,...,600,601,602,603,604,605,606,607,608,609
Mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
610,0.8,0.0,0.0,0.0,0.8,0.0,0.9,0.0,0.0,0.0,...,0.8,0.0,0.8,0.6,0.8,0.5,0.8,0.5,0.6,1.0
611,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.8,0.0,0.0,...,0.0,0.8,0.0,1.0,0.7,0.0,0.0,0.4,0.0,0.0
612,0.8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0


In [None]:
from sklearn.decomposition import PCA
# Reduce every movie row to 8 features. random_state is fixed because PCA may pick a
# randomized SVD solver for a matrix of this size, which would otherwise give
# slightly different features on every run.
pca = PCA(n_components=8, random_state=0)
movie_features=pca.fit_transform(table.values)

In [None]:
# User x movie rating matrix used to derive user features.
# Built from the TRAINING ratings only so that held-out validation/test ratings
# do not leak into the node features.
table = pd.pivot_table(train_ratings, values='rating', index=['Uid'],
                    columns=['Mid'], aggfunc='sum')
# Keep one row per user and one column per movie of the full table, even when they
# have no training rating, so that row k still corresponds to user node id k.
table = table.reindex(index=np.sort(rating['Uid'].unique()),
                      columns=np.sort(rating['Mid'].unique()))
table=table.fillna(0)
table=table.sort_index()
# Fresh, seeded PCA so the user features are reproducible across runs
pca = PCA(n_components=8, random_state=0)
user_features=pca.fit_transform(table.values)

In [None]:
# Rescale every feature column of both matrices to the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
user_features, movie_features = (
    MinMaxScaler().fit_transform(feature_matrix)
    for feature_matrix in (user_features, movie_features)
)

In [None]:
import torch
# Create the graph's edges, which are the (user, movie) pairs of the training set
userId=torch.tensor(train_ratings['Uid'].values).view(1,train_ratings.shape[0])
movieId=torch.tensor(train_ratings['Mid'].values).view(1,train_ratings.shape[0])
edge_index=torch.cat((userId,movieId),dim=0)
# Message passing flows from edge_index[0] (source) to edge_index[1] (target). With
# user -> movie edges only, user nodes would never aggregate anything from the movies
# they rated, so the reversed (movie -> user) edges are appended as well.
edge_index=torch.cat((edge_index,edge_index.flip(0)),dim=1)
# Node feature matrix: user rows first, then movie rows, matching the node id layout
X_t=torch.cat((torch.tensor(user_features),torch.tensor(movie_features)),dim=0)

In [None]:
# number of rating rows in the training split
train_ratings.shape[0]

99616

In [None]:
# Cast to the dtypes used by the model: integer indices for edges, float32 for features
edge_index_t = edge_index.to(torch.long)
X_t = X_t.to(torch.float32)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

In [None]:
# How to use torch_geometric negative sampling on a bipartite graph.
from torch_geometric.utils import negative_sampling
# Toy graph: 3 users (node ids 0-2) and 5 movies whose global node ids are 3-7.
# Demo-specific names are used so the real `edge_index` built above is not overwritten.
demo_num_users, demo_num_movies = 3, 5
demo_edge_index = torch.as_tensor([[0, 0, 1, 2],
                                   [3, 4, 6, 7]])
# When num_nodes is a (num_src, num_dst) pair the graph is treated as bipartite and the
# destination ids must lie in [0, num_dst), so the global movie ids are shifted back first.
demo_local_edges = torch.stack((demo_edge_index[0], demo_edge_index[1] - demo_num_users))
ng=negative_sampling(demo_local_edges,num_neg_samples=2*demo_local_edges.shape[1],num_nodes=(demo_num_users,demo_num_movies))
ng

tensor([[0, 0, 0, 1, 1, 1, 2, 2],
        [0, 1, 2, 1, 2, 3, 0, 1]])

In [None]:
#create a dataset for our data
class mvl_Dataset(Dataset):
    """Map-style dataset of (user id, item node id, rating) samples.

    Args:
        ratings: DataFrame with the columns 'Uid', 'Mid' and 'rating'.
        istrain: when equal to 1, negative (user, item) pairs with rating 0 are
            appended to the real ratings; otherwise only the real ratings are used.
        num_negatives: number of negative pairs sampled per real rating
            (training mode only).
        item_offset: value separating item node ids ('Mid') from zero-based
            item indices; it is removed before negative sampling and added back after.
    """

    def __init__(self, ratings, istrain, num_negatives=4, item_offset=610):
        self.flag= istrain
        self.num_negatives = num_negatives
        self.item_offset = item_offset
        self.users, self.items, self.labels = self.get_dataset(ratings)

    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__ (self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    # we can call len(dataset) to return the size
    def __len__(self):
        return len(self.users)

    def get_dataset(self, ratings):
        """Return (users, items, labels) as long / long / float32 tensors.

        Every rating row becomes one sample. In training mode `num_negatives`
        (user, item) pairs that do not occur in `ratings` are added per real sample,
        labelled with rating 0.
        """
        # Build tensors in both modes: the previous non-training branch returned plain
        # Python lists, on which the final `.type(...)` conversion raised AttributeError.
        users = torch.tensor(ratings['Uid'].values, dtype=torch.long)
        items = torch.tensor(ratings['Mid'].values, dtype=torch.long)
        labels = torch.tensor(ratings['rating'].values, dtype=torch.float32)

        if self.flag==1:
            # negative_sampling works on zero-based ids on each side of the bipartite graph
            local_items = items - self.item_offset
            edge_index = torch.stack((users, local_items), dim=0)
            num_users = users.max()+1
            num_items = local_items.max()+1
            print(num_users,num_items)
            # NOTE: negatives are only guaranteed to be absent from `ratings`; pairs that
            # exist in other splits can still be drawn as negatives.
            ng = negative_sampling(edge_index,
                                   num_neg_samples=self.num_negatives*edge_index.shape[1],
                                   num_nodes=(int(num_users), int(num_items)))
            users = torch.cat((users, ng[0]))
            # shift the sampled item indices back to item node ids
            items = torch.cat((items, ng[1] + self.item_offset))
            labels = torch.cat((labels, torch.zeros(ng.shape[1], dtype=torch.float32)))
        return users.type(torch.long), items.type(torch.long), labels.type(torch.float32)

In [None]:
# Create the train data loader.
# The graph neural network computes embeddings for all nodes on every step, so large
# batches are used: each batch holds as many samples as there are training ratings.
# The dataset also contains the sampled negative pairs, so one epoch is split into
# several such batches rather than a single one.
train_dataset = mvl_Dataset(train_ratings,1)
train_data_loader = DataLoader(dataset=train_dataset,
                          batch_size=train_ratings.shape[0],
                          shuffle=True)

tensor(610) tensor(9724)


In [None]:
# Look at one random batch of data.
# The labels are unpacked into `rating_batch` (not `rating`) so the ratings DataFrame,
# which the evaluation cell still needs, is not overwritten by a tensor.
dataiter = iter(train_data_loader)
data = next(dataiter)
user, item, rating_batch = data
print(user.shape, item.shape,rating_batch.shape)

torch.Size([99616]) torch.Size([99616]) torch.Size([99616])


In [None]:
# user ids of the batch drawn above
user

tensor([ 89,  23,  70,  ..., 300, 327, 428])

In [None]:
# item node ids of the batch drawn above
item

tensor([ 916, 2732, 3440,  ..., 6814, 4197, 8304])

In [None]:
# Validation data: real ratings only (flag 0), served in order in batches of 64
val_dataset = mvl_Dataset(val_ratings, 0)
val_data_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [None]:
# importing libraries
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GraphSAGE

In [None]:
# Create our deep neural network.
# The first layers produce node embeddings with the graph neural network;
# the user and item embeddings are then concatenated to predict the rating.
import pytorch_lightning as pl
learning_rate=0.01  # learning rate passed to the optimizer in configure_optimizers
H1=8  # width of the prediction head: lin maps 2*H1 -> H1
D_out=1  # size of the model output: one score per (user, item) pair
class Sage_rm(pl.LightningModule):
    """GraphSAGE encoder plus a small linear head that scores (user, item) pairs.

    Args:
        node_features: float tensor with one feature row per graph node. Defaults to
            the notebook-level `X_t`.
        edge_index: long tensor of shape (2, num_edges) describing the graph. Defaults
            to the notebook-level `edge_index_t`.
    """

    def __init__(self, node_features=None, edge_index=None):
        super().__init__()
        if node_features is None:
            node_features = X_t
        if edge_index is None:
            edge_index = edge_index_t
        # Register the graph as buffers so it follows the module to whatever device the
        # trainer selects (plain globals would stay on the CPU). persistent=False keeps
        # them out of the checkpoint state dict.
        self.register_buffer("node_features", node_features, persistent=False)
        self.register_buffer("graph_edge_index", edge_index, persistent=False)
        self.sage=GraphSAGE(in_channels=node_features.shape[1],hidden_channels=H1,out_channels=H1,num_layers=2)
        self.lin = torch.nn.Linear(2*H1, H1)
        self.lin2 = torch.nn.Linear(H1, D_out)
        self.sig=torch.nn.Sigmoid()

    def forward(self, x,y):
        """Score each (user, item) pair.

        Args:
            x: long tensor of user node ids.
            y: long tensor of item node ids (same length as `x`).

        Returns:
            Tensor of shape (len(x), D_out) with sigmoid-activated scores.
        """
        # Embeddings are recomputed for every node of the graph on each call
        embed=self.sage(self.node_features,self.graph_edge_index)
        pair_embed = torch.cat([embed[x], embed[y]], 1)
        # NOTE(review): there is no non-linearity between lin and lin2, so together they
        # act as a single affine map — confirm whether an activation was intended.
        return self.sig(self.lin2(self.lin(pair_embed)))

    def training_step(self, batch, batch_idx):
        """Compute and log the RMSE between predicted and target ratings for one batch."""
        x, y,z = batch
        out=self.forward(x,y)
        #rmse loss
        loss = torch.sqrt(F.mse_loss(out.view(-1),z.view(-1)))
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the MSE for one validation batch.

        Note that this is the plain MSE, whereas `train_loss` is its square root (RMSE),
        so the two logged curves are on different scales.
        """
        x, y ,z = batch
        out=self.forward(x,y)
        loss = F.mse_loss(out.view(-1),z.view(-1))
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters using the global `learning_rate`."""
        return torch.optim.AdamW(self.parameters(), lr=learning_rate)


In [None]:
# Instantiate the model and print its sub-modules
model=Sage_rm()
print(model)

Sage_rm(
  (sage): GraphSAGE(8, 8, num_layers=2)
  (lin): Linear(in_features=16, out_features=8, bias=True)
  (lin2): Linear(in_features=8, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Set up the trainer; the checkpoint callback keeps the model with the lowest val_loss.
# NOTE(review): reload_dataloaders_every_n_epochs=1 was meant to provide new negative
# samples every epoch, but the negatives are drawn once when mvl_Dataset is constructed
# and the same DataLoader object is handed to trainer.fit — confirm whether the
# negatives are actually resampled between epochs.
from pytorch_lightning import  Trainer
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
AVAIL_GPUS = min(1, torch.cuda.device_count())
checkpoint = pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1)
# log_every_n_steps=1: an epoch has only a handful of training batches, fewer than the
# default logging interval of 50 steps, so train_loss would otherwise never be logged.
trainer = Trainer(reload_dataloaders_every_n_epochs=1,
    max_epochs=20,callbacks=[checkpoint],log_every_n_steps=1)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# Train the model on train_data_loader, validating on val_data_loader
trainer.fit(model, train_data_loader,val_dataloaders=val_data_loader)
print('Finished Training')

INFO:pytorch_lightning.callbacks.model_summary:
  | Name | Type      | Params
-----------------------------------
0 | sage | GraphSAGE | 272   
1 | lin  | Linear    | 136   
2 | lin2 | Linear    | 9     
3 | sig  | Sigmoid   | 0     
-----------------------------------
417       Trainable params
0         Non-trainable params
417       Total params
0.002     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


Finished Training


#### Now we evaluate the model with the hit ratio. For each test interaction we build a list containing the one movie the user actually watched plus 99 movies the user has not seen. The model scores these 100 movies, and the ten with the highest scores are the ones we would recommend to the user. If the movie the user actually watched is among those ten recommendations, we count it as a hit.

In [None]:
# User-item pairs for testing
from tqdm.notebook import tqdm
all_movieIds=rating['Mid'].unique()
# Built once: the set of all movies does not change between test pairs
all_movie_set = set(all_movieIds)
test_user_item_set = set(zip(test_ratings['Uid'], test_ratings['Mid']))

# Dict of all items that are interacted with by each user
user_interacted_items = rating.groupby('Uid')['Mid'].apply(list).to_dict()

# Seeded generator so the sampled negatives (and the reported hit ratio) are reproducible
eval_rng = np.random.default_rng(42)

hits = []
model.eval()
with torch.no_grad():  # inference only, no gradients needed
    for (u,i) in tqdm(test_user_item_set):
        interacted_items = user_interacted_items[u]
        not_interacted_items = sorted(all_movie_set - set(interacted_items))
        # Sample WITHOUT replacement so the 99 negatives are distinct movies
        num_sampled = min(99, len(not_interacted_items))
        selected_not_interacted = list(eval_rng.choice(not_interacted_items, num_sampled, replace=False))
        test_items = selected_not_interacted + [i]

        predicted_labels = np.squeeze(model(torch.tensor([u]*len(test_items)),
                                            torch.tensor(test_items)).cpu().numpy())

        # Positions of the ten highest scores, mapped back to movie ids
        top10_positions = np.argsort(predicted_labels)[::-1][0:10].tolist()
        top10_items = [test_items[position] for position in top10_positions]

        hits.append(1 if i in top10_items else 0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/595 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.61
