In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install pytorch_lightning 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pytorch_lightning as pl
from tqdm.notebook import tqdm

Collecting pytorch_lightning
[?25l  Downloading https://files.pythonhosted.org/packages/81/d0/84a2f072cd407f93a1e50dff059656bce305f084e63a45cbbceb2fdb67b4/pytorch_lightning-1.1.0-py3-none-any.whl (665kB)
[K     |████████████████████████████████| 675kB 12.8MB/s 
Collecting PyYAML>=5.1
[?25l  Downloading https://files.pythonhosted.org/packages/64/c2/b80047c7ac2478f9501676c988a5411ed5572f35d1beff9cae07d321512c/PyYAML-5.3.1.tar.gz (269kB)
[K     |████████████████████████████████| 276kB 30.2MB/s 
[?25hCollecting fsspec>=0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/ec/80/72ac0982cc833945fada4b76c52f0f65435ba4d53bc9317d1c70b5f7e7d5/fsspec-0.8.5-py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 11.7MB/s 
Collecting future>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 31.9MB/s 
Buil

In [5]:
print(torch.cuda.get_device_name(0))

Tesla V100-SXM2-16GB


In [6]:
import pandas as pd
import numpy as np


class Preprocess():
    def __init__(self):
        self.ratings = pd.read_csv('/content/drive/MyDrive/USML_Data/NCF/ratings.csv')
        self.movies = pd.read_csv('/content/drive/MyDrive/USML_Data/movies.csv')
        self.users = None
        self.items = None
        self.movie_ids = list()
        
    def subset_data(self,user_percent):
        # select random users % wise
        rand_users = np.random.choice(self.ratings['userId'].unique(), 
                                size=int(len(self.ratings['userId'].unique())*(user_percent/100)), 
                                replace=False)                
        print(rand_users.shape)
        new_ratings = self.ratings.loc[self.ratings['userId'].isin(rand_users)]
        self.train,self.test = self.train_test_split()
        self.users = self.ratings['userId'].max() + 1
        self.items = self.ratings['movieId'].max() + 1
        self.movie_ids = self.ratings['movieId'].unique()
        
        return self.train,self.test,self.users,self.items,self.movie_ids,new_ratings,self.movies
    
    def train_test_split(self):
        # randomly splitting dta wont work, as user preference canges over time.
        # hence we need to split the data cronologically

        train = self.ratings[(self.ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False) != 1)]
        test = self.ratings[(self.ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False) == 1)]
        
        return train.drop(['timestamp'],axis = 1),test.drop(['timestamp'], axis =1)

    def generate_lists(self):

        user_list, item_list, label_list = list(), list(), list()
          # a user to item dictionary: {user:[list of items that user has interacted with]}
        user_item_set = set(zip(self.ratings['userId'], self.ratings['movieId']))
        num_negatives = 4
        for user, item in tqdm(user_item_set):
            user_list.append(user)
            item_list.append(item)
            # 1 signifies that the user has given an explicit feedback
            label_list.append(1)
            # adding implicit feedbacks since out model will be trained using an implicit feedback.
            # this changes our problem from trying to predict movie rating to wether the user will interact with the movie
            # the problem with this is that there aren't any implicit feedbacks.
            # hence we manually introduce implicit feedback, 0 implies implicit feedback
            # in out case we are considering a 3:1 implicit feedback ratio.
            # we can incrase the number of implicit:explicit feedback ratio, but we wont due to memory constraints.
            for i in range(num_negatives):
                random_negative_item = np.random.choice(self.movie_ids)
                while (user, random_negative_item) in user_item_set:
                    random_negative_item = np.random.choice(self.movie_ids)
                user_list.append(user)
                item_list.append(random_negative_item)
                label_list.append(0)

        return torch.tensor(user_list), torch.tensor(item_list), torch.tensor(label_list)

In [15]:
import pickle
class Checkpoint():
    
    def save_params(self,test,train,movie_ids,ratings,items,users,model_name = ""):
        with open(r"/content/drive/MyDrive/USML_Data/NCF/model_data" + model_name, 'wb') as fp:
            pickle.dump([test,train,movie_ids,ratings,items,users], fp)
        return

    def load_params(self,model_name):
        with open(r"/content/drive/MyDrive/USML_Data/NCF/model_data" + model_name, 'rb') as fp:
            test,train,movie_ids,ratings,items,users = pickle.load(fp)
        return test,train,movie_ids,ratings,items,users
    
    def saveModel(self,model,model_name):
        torch.save(model.state_dict(), r"/content/drive/MyDrive/USML_Data/NCF/model_data" + model_name + ".pt")
        return model
    
    def loadModel(self,model_name, model = None):
        state_dict = torch.load(r"/content/drive/MyDrive/USML_Data/NCF/model_data" + model_name + ".pt")
        model.load_state_dict(state_dict)
        return model

In [8]:
class DataWrapper(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, movie_ids):
        self.users, self.items, self.labels = self.get_dataset(ratings, movie_ids)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, movie_ids):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(movie_ids)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(movie_ids)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [16]:

class Net(pl.LightningModule):

    def __init__(self,users,items,movie_ids,ratings, pr = None):
        super().__init__()
        self.user_embedding = nn.Embedding(users, 8)
        self.item_embedding = nn.Embedding(items, 8)
        self.hidden1 = nn.Linear(16,64)
        self.hidden2 = nn.Linear(64,32)
        self.output = nn.Linear(32,1)
        self.ratings = ratings
        self.movie_ids = movie_ids
        self.pr = pr

    def forward(self, user_list, item_list):
        user_out = self.user_embedding(user_list)
        item_out = self.item_embedding(item_list)
        combined = torch.cat([user_out,item_out],dim= -1)
        combined = nn.ReLU()(self.hidden1(combined))
        combined = nn.ReLU()( self.hidden2(combined))
        pred = nn.Sigmoid()(self.output(combined))
        
        return pred
    
    def training_step(self, batch, batch_idx):
        user_list, item_list, label_list = batch
        y_hat = self(user_list,item_list)
        # BCE: Also called Softmax Loss. It is a Softmax activation plus a Cross-Entropy loss
        loss = nn.BCELoss()(y_hat, label_list.view(-1, 1).float())
        return loss
    
    def configure_optimizers(self):
        # adam optimizer with lr = 0.001
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    
    def train_dataloader(self):
        return DataLoader(DataWrapper(self.ratings, self.movie_ids),
                          batch_size=512, num_workers=4)




In [43]:
def compute_hits(ratings,test,movie_ids,net):
    '''
    For each user, randomly select 99 items that the user has not interacted with
    Combine these 99 items with the test item (the actual item that the user interacted with). We now have 100 items.
    Run the model on these 100 items, and rank them according to their predicted probabilities
    Select the top 10 items from the list of 100 items. If the test item is present within the top 10 items, then we say that this is a hit.
    Repeat the process for all users. The Hit Ratio is then the average hits.
    '''
    
    user_item_set = set(zip(test['userId'], test['movieId']))
    # Dict of all items that are interacted with by each user
    user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

    hits = []
    rmse = 0
    # Repeat the process for all users.
    for (user,item) in tqdm(user_item_set):
        if user in user_interacted_items:
          interacted_items = user_interacted_items[user]
          # not interacted with = 1 - interacted_with
          not_interacted_items = set(movie_ids) - set(interacted_items)
          #For each user, randomly select 99 items that the user has not interacted with
          selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
          # Combine these 99 items with the test item (the actual item that the user interacted with). We now have 100 items.
          test_items = selected_not_interacted + [item]
          # Run the model on these 100 items
          predicted_labels = np.squeeze(net(torch.tensor([user]*100), 
                                              torch.tensor(test_items)).detach().numpy())
          # rank them according to their predicted probabilities; Select the top 10 items from the list of 100 items.
          labels = np.argsort(predicted_labels)[::-1][0:10].tolist()
          top10_items = [test_items[i] for i in labels]
          
          #If the test item is present within the top 10 items, then we say that this is a hit.
          if item in top10_items:
              hits.append(1)
          else:
              hits.append(0)
              
    # The Hit Ratio is then the average hits.
    print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))
    
    return


In [11]:

def reccomend_user_movies(u,ratings,movie_ids,net,movies):
    print("Showing recommendations for user: {}".format(u))
    print("====" * 9)
    print("Movies with high ratings from user")
    print("----" * 8)

    user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(movie_ids) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 100))    
    predicted_labels = np.squeeze(net(torch.tensor([u]*100), torch.tensor(selected_not_interacted)).detach().numpy())
    labels = np.argsort(predicted_labels)[::-1][0:10].tolist()
    top10_items = [selected_not_interacted[i] for i in labels]
    user_top_ratings = ratings[ratings["movieId"].isin(user_interacted_items[u])].sort_values('rating', ascending = False).head().movieId
    movie_df_rows = movies[movies["movieId"].isin(user_top_ratings)]
    for row in movie_df_rows.itertuples():
        print(row.title, ":", row.genres)

    print("----" * 8)
    print("Top 10 movie recommendations")
    print("----" * 8)
    recommended_movies = movies[movies["movieId"].isin(top10_items)]
    for row in recommended_movies.itertuples():
        print(row.title, ":", row.genres)

    return

In [39]:

if __name__ == "__main__":
    
    pr = Preprocess()
    cp = Checkpoint()
    train,test,users,items,movie_ids,ratings,movies = pr.subset_data(50)
    net = Net(users,items,movie_ids,ratings,pr)
    print(net)
    trainer = pl.Trainer(max_epochs=2, gpus = -1, reload_dataloaders_every_epoch=True,
                     progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

    trainer.fit(net)
    model_name = "NCF_50"
    cp.save_params(test,train,movie_ids,ratings,items,users,model_name)
    cp.saveModel(net,model_name)
    compute_hits(ratings,test,movie_ids,net)
    reccomend_user_movies(1234,ratings,movie_ids,net,movies)

(81270,)


GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.3 M 
1 | item_embedding | Embedding | 1.7 M 
2 | hidden1        | Linear    | 1.1 K 
3 | hidden2        | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params


Net(
  (user_embedding): Embedding(162542, 8)
  (item_embedding): Embedding(209172, 8)
  (hidden1): Linear(in_features=16, out_features=64, bias=True)
  (hidden2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




HBox(children=(FloatProgress(value=0.0, max=162541.0), HTML(value='')))

KeyError: ignored

In [11]:
cp.save_params(test,train,movie_ids,ratings,items,users,model_name)
cp.saveModel(net,model_name)

Net(
  (user_embedding): Embedding(162542, 8)
  (item_embedding): Embedding(209172, 8)
  (hidden1): Linear(in_features=16, out_features=64, bias=True)
  (hidden2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)

In [44]:
cp = Checkpoint()
model_name = "NCF_70"
test,train,movie_ids,ratings,items,users = cp.load_params(model_name)
net = Net(users,items,movie_ids,ratings)
net = cp.loadModel(model_name,net)
print(net)
print(test.shape)
print(ratings.head())
print(ratings.shape)

Net(
  (user_embedding): Embedding(162542, 8)
  (item_embedding): Embedding(209172, 8)
  (hidden1): Linear(in_features=16, out_features=64, bias=True)
  (hidden2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
)
(162541, 3)
   userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510
(12537860, 4)


In [31]:
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

[2, 5, 6, 7, 11]


In [37]:
ratings.groupby('userId')['movieId'].apply(list)

userId
2         [1, 62, 110, 150, 151, 236, 260, 261, 266, 318...
5         [1, 19, 32, 36, 39, 47, 50, 88, 95, 104, 113, ...
6         [161, 260, 318, 527, 593, 608, 858, 902, 912, ...
7         [10, 17, 28, 58, 150, 153, 165, 185, 232, 265,...
11        [277, 372, 527, 593, 1203, 1207, 2329, 2581, 2...
                                ...                        
162525    [24, 423, 858, 968, 1127, 1193, 1499, 2023, 23...
162529    [1, 2, 3, 5, 6, 7, 11, 14, 18, 21, 25, 26, 29,...
162530    [1, 12, 36, 47, 110, 235, 260, 296, 318, 364, ...
162531    [31, 47, 50, 132, 147, 150, 161, 173, 204, 227...
162532    [260, 296, 541, 1021, 1036, 1136, 1196, 1197, ...
Name: movieId, Length: 48762, dtype: object

In [45]:
compute_hits(ratings,test,movie_ids,net)

HBox(children=(FloatProgress(value=0.0, max=162541.0), HTML(value='')))


The Hit Ratio @ 10 is 0.94
