In [None]:
# !rm -rf /content/Datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
import os
import zipfile
import requests

# Fetch the MovieLens 20M archive once and unpack it under ./Datasets.
# Download and extraction are guarded separately so an interrupted run can resume.
if not os.path.exists("./Datasets/MoviLens.zip"):

  # exist_ok: the folder may already exist even though the zip does not
  # (plain os.mkdir would raise FileExistsError in that case).
  os.makedirs("./Datasets", exist_ok=True)

  # Stream to a temporary name so a failed/partial download never leaves a file
  # that satisfies the exists() check above.
  with requests.get("http://files.grouplens.org/datasets/movielens/ml-20m.zip", stream=True) as resp:
    # Fail loudly on 4xx/5xx instead of saving an error page as the zip.
    resp.raise_for_status()
    with open("./Datasets/MoviLens.zip.part", "wb") as f:
      for chunk in resp.iter_content(chunk_size=1 << 20):
        f.write(chunk)

  os.replace("./Datasets/MoviLens.zip.part", "./Datasets/MoviLens.zip")

# The archive unpacks into an `ml-20m/` folder (the path the loading cells read from).
if not os.path.isdir("./Datasets/ml-20m"):
  with zipfile.ZipFile("./Datasets/MoviLens.zip", "r") as zip_ref:
    zip_ref.extractall("./Datasets")

In [None]:
# Fix the global NumPy seed so the user sample (and every later np.random draw) is repeatable.
np.random.seed(123)

# Read from the same relative folder the download cell writes to ("./Datasets"),
# rather than an absolute path that only exists on one hosted runtime.
# NOTE(review): `timestamp` is displayed as raw integers in the previews further down,
# so parse_dates appears to have no visible effect here — confirm before relying on datetimes.
ratings = pd.read_csv('./Datasets/ml-20m/ratings.csv', parse_dates=['timestamp'])

# Keep a random 10% of the users (sampled without replacement) to shrink the dataset.
unique_userIds = ratings['userId'].unique()
rand_userIds = np.random.choice(unique_userIds,
                                size=int(len(unique_userIds)*0.1),
                                replace=False)

# .copy(): the next cell adds a column to this frame, which would otherwise be
# an assignment on a slice of the full ratings table.
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

In [None]:
# Leave-one-out split: rank each user's ratings from newest (1) to oldest.
# method='first' breaks timestamp ties by row order, so ranks within a user are distinct.
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

# The most recent rating of every user is held out for testing; everything else trains.
is_latest = ratings['rank_latest'] == 1

# Keep only the columns needed downstream. Explicit copies make both frames
# independent of `ratings`, so later in-place edits (e.g. overwriting `rating`)
# do not act on a slice.
kept_columns = ['userId', 'movieId', 'rating']
train_ratings = ratings.loc[~is_latest, kept_columns].copy()
test_ratings = ratings.loc[is_latest, kept_columns].copy()

In [None]:
# Treat every observed (user, movie) pair in the training split as a positive example:
# the star value is overwritten with 1. test_ratings is not modified by this line.
train_ratings.loc[:, 'rating'] = 1

In [None]:
# Candidate pool for negatives: every movie id present in the sampled ratings
all_movieIds = ratings['movieId'].unique()

# Every (user, movie) pair observed in the training split
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# Four sampled negatives accompany each positive
num_negatives = 4

# Parallel lists: users[k], items[k], labels[k] describe one training example
users, items, labels = [], [], []

for u, i in user_item_set:
    # the observed interaction itself -> label 1
    users.append(u)
    items.append(i)
    labels.append(1)

    accepted = 0
    while accepted < num_negatives:
        # draw a movie uniformly at random ...
        negative_item = np.random.choice(all_movieIds)
        # ... and redraw if this user has already interacted with it
        if (u, negative_item) in user_item_set:
            continue
        users.append(u)
        items.append(negative_item)
        labels.append(0)  # unobserved pair -> label 0
        accepted += 1

In [None]:
import torch
from torch.utils.data import Dataset

class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training

    Every observed (userId, movieId) pair becomes a positive example (label 1)
    and is followed by ``num_negatives`` randomly drawn movies that the same
    user has not interacted with (label 0). Sampling uses the global NumPy
    random state, so seed ``np.random`` beforehand for reproducible data.

    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings; only the
            'userId' and 'movieId' columns are read
        all_movieIds (list): List (or array) containing all movieIds that may be
            drawn as negatives
        num_negatives (int): Number of negative samples per positive (default 4)

    Raises:
        ValueError: if ``num_negatives`` is negative, or if some user has already
            interacted with every id in ``all_movieIds`` (no negative exists).
    """

    def __init__(self, ratings, all_movieIds, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds, num_negatives)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    @staticmethod
    def _check_negatives_exist(user_item_set, all_movieIds):
        """Raise ValueError if some user has no un-interacted candidate movie left."""
        candidate_ids = set(all_movieIds)
        seen_per_user = {}
        for u, i in user_item_set:
            if i in candidate_ids:
                seen_per_user[u] = seen_per_user.get(u, 0) + 1
        exhausted = [u for u, seen in seen_per_user.items() if seen >= len(candidate_ids)]
        if exhausted:
            raise ValueError(
                "cannot sample negatives: user(s) {} interacted with every candidate movie".format(exhausted[:5])
            )

    def get_dataset(self, ratings, all_movieIds, num_negatives=4):
        """Build the (users, items, labels) tensors with sampled negatives.

        Returns:
            tuple of three 1-D tensors of equal length:
            ``len(positives) * (1 + num_negatives)``.
        """
        if num_negatives < 0:
            raise ValueError("num_negatives must be non-negative")

        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        if num_negatives > 0:
            # Without this check the rejection loop below would never terminate
            # for a user who has interacted with every candidate.
            self._check_negatives_exist(user_item_set, all_movieIds)

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                # rejection sampling: redraw until the pair is unobserved
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [None]:
# Install PyTorch Lightning into the runtime. The version is not pinned; the captured log below resolved 1.6.1.
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-1.6.1-py3-none-any.whl (582 kB)
[?25l[K     |▋                               | 10 kB 25.1 MB/s eta 0:00:01[K     |█▏                              | 20 kB 31.2 MB/s eta 0:00:01[K     |█▊                              | 30 kB 36.9 MB/s eta 0:00:01[K     |██▎                             | 40 kB 39.8 MB/s eta 0:00:01[K     |██▉                             | 51 kB 37.9 MB/s eta 0:00:01[K     |███▍                            | 61 kB 41.9 MB/s eta 0:00:01[K     |████                            | 71 kB 31.7 MB/s eta 0:00:01[K     |████▌                           | 81 kB 33.3 MB/s eta 0:00:01[K     |█████                           | 92 kB 35.8 MB/s eta 0:00:01[K     |█████▋                          | 102 kB 36.1 MB/s eta 0:00:01[K     |██████▏                         | 112 kB 36.1 MB/s eta 0:00:01[K     |██████▊                         | 122 kB 36.1 MB/s eta 0:00:01[K     |███████▎                        | 13

In [None]:
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)

        A user embedding and an item embedding are concatenated and passed
        through two ReLU dense layers and a sigmoid output, giving one
        interaction probability per (user, item) pair.

        Args:
            num_users (int): Number of rows in the user embedding table
                (ids are used directly as indices)
            num_items (int): Number of rows in the item embedding table
                (ids are used directly as indices)
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
            embedding_dim (int): Size of each user/item embedding (default 8)
            batch_size (int): Training batch size (default 512)
            num_workers (int): DataLoader worker processes (default 4)
    """

    def __init__(self, num_users, num_items, ratings, all_movieIds,
                 embedding_dim=8, batch_size=512, num_workers=4):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        # fc1 consumes the concatenation of both embeddings
        self.fc1 = nn.Linear(in_features=2 * embedding_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        self.batch_size = batch_size
        self.num_workers = num_workers

    def forward(self, user_input, item_input):
        """Return the sigmoid interaction score for each (user, item) index pair;
        the result has a trailing dimension of size 1."""

        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layers (functional ops: no module is rebuilt per call)
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        # Output layer
        pred = torch.sigmoid(self.output(vector))

        return pred

    def training_step(self, batch, batch_idx):
        """Binary cross-entropy between predicted scores and the 0/1 labels of one batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # labels are reshaped to (-1, 1) to match the model output and cast to float for BCE
        loss = nn.functional.binary_cross_entropy(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the optimizer's default settings."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build a new training dataset (and therefore new negative samples) on every call."""
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=self.batch_size, num_workers=self.num_workers)

In [None]:
# Raw ids are used directly as embedding indices, so each table needs max(id) + 1 rows.
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1
all_movieIds = ratings['movieId'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

# Trainer arguments that are not deprecated:
#  - accelerator/devices instead of `gpus`, falling back to CPU when CUDA is absent
#  - reload_dataloaders_every_n_epochs takes an int (1 = rebuild the loader each epoch)
#  - TQDMProgressBar(refresh_rate=...) replaces `progress_bar_refresh_rate`
#  - enable_checkpointing replaces `checkpoint_callback`
trainer = pl.Trainer(max_epochs = 4,
                     accelerator = "gpu" if torch.cuda.is_available() else "cpu",
                     devices = 1,
                     reload_dataloaders_every_n_epochs = 1,
                     callbacks = [pl.callbacks.TQDMProgressBar(refresh_rate = 50)],
                     logger = False,
                     enable_checkpointing = False)

trainer.fit(model)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 1.1 M 
1 | item_embedding | Embedding | 1.0 M 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.642     Total estimated model params size (MB)
  cpuset_checked))


Training: 0it [00:00, ?it/s]

In [None]:
# Hit Ratio @ 10: for every held-out (user, movie) pair, rank the held-out movie
# against sampled movies the user never interacted with and count a hit when it
# lands in the top 10.

# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

# Loop invariants: build the full id set once and find where the model's weights live.
all_movieIds_set = set(all_movieIds)
device = next(model.parameters()).device
num_sampled_negatives = 99

model.eval()
hits = []
with torch.no_grad():  # inference only, no autograd graph needed
    for (u, i) in test_user_item_set:
        interacted_items = user_interacted_items[u]
        not_interacted_items = all_movieIds_set - set(interacted_items)
        candidates = list(not_interacted_items)

        # replace=False: the negatives must be distinct movies; the default
        # (sampling with replacement) can put the same movie in the list several times.
        sample_size = min(num_sampled_negatives, len(candidates))
        selected_not_interacted = list(np.random.choice(candidates, sample_size, replace=False))
        test_items = selected_not_interacted + [i]

        # One score per candidate; squeeze(-1) drops the trailing size-1 output dimension.
        predicted_labels = model(torch.tensor([u]*len(test_items), device=device),
                                 torch.tensor(test_items, device=device)).squeeze(-1).cpu().numpy()

        # Highest-scoring candidates first
        top10_items = [test_items[pos] for pos in np.argsort(predicted_labels)[::-1][0:10].tolist()]

        hits.append(1 if i in top10_items else 0)

print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))
# top10_items (like u and i) still holds the values from the last loop iteration
print(top10_items)

The Hit Ratio @ 10 is 0.78
[48774, 6, 1876, 2186, 1635, 4903, 8917, 4823, 3019, 1086]


In [None]:
# Plain-text preview of the first five rows of the sampled ratings frame.
print(ratings.head())

     userId  movieId  rating  timestamp  rank_latest
236       3        1     4.0  944919407         81.0
237       3       24     3.0  945176048         10.0
238       3       32     4.0  944918047        140.0
239       3       50     5.0  944918018        143.0
240       3      160     3.0  945176048         11.0


In [None]:
# First ten rows of the same frame, shown as the cell's result.
ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,rank_latest
236,3,1,4.0,944919407,81.0
237,3,24,3.0,945176048,10.0
238,3,32,4.0,944918047,140.0
239,3,50,5.0,944918018,143.0
240,3,160,3.0,945176048,11.0
241,3,173,2.0,945176099,1.0
242,3,175,5.0,944919133,89.0
243,3,196,3.0,945175939,25.0
244,3,223,5.0,944918444,116.0
245,3,260,5.0,944917742,152.0


In [None]:
# Movie metadata (movieId, title, genres), read from the same relative folder the
# download cell writes to rather than an absolute path tied to one hosted runtime.
# NOTE: despite its name, `new_ratings` holds movie metadata, not ratings.
new_ratings = pd.read_csv('./Datasets/ml-20m/movies.csv')
new_ratings.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
# Print the title of every movie rated by user `u`
# (presumably the user left over from the last evaluation-loop iteration — verify run order).
for j in user_interacted_items[u]:
  title_matches = new_ratings.loc[new_ratings['movieId'] == j, 'title']
  print(title_matches)

0    Toy Story (1995)
Name: title, dtype: object
1    Jumanji (1995)
Name: title, dtype: object
16    Sense and Sensibility (1995)
Name: title, dtype: object
31    Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Name: title, dtype: object
33    Babe (1995)
Name: title, dtype: object
35    Dead Man Walking (1995)
Name: title, dtype: object
38    Clueless (1995)
Name: title, dtype: object
43    Mortal Kombat (1995)
Name: title, dtype: object
46    Seven (a.k.a. Se7en) (1995)
Name: title, dtype: object
49    Usual Suspects, The (1995)
Name: title, dtype: object
102    Happy Gilmore (1996)
Name: title, dtype: object
108    Braveheart (1995)
Name: title, dtype: object
148    Apollo 13 (1995)
Name: title, dtype: object
206    Waterworld (1995)
Name: title, dtype: object
214    Billy Madison (1995)
Name: title, dtype: object
228    Dumb & Dumber (Dumb and Dumber) (1994)
Name: title, dtype: object
293    Pulp Fiction (1994)
Name: title, dtype: object
338    Muriel's Wedding (1994)
Name: title, dtype

In [None]:
# Title of movie `i` (presumably the held-out test movie from the last evaluation-loop iteration — verify run order)
print(new_ratings.loc[new_ratings['movieId'] == i, 'title'])

11400    Children of Men (2006)
Name: title, dtype: object


In [None]:
# Titles of the ten movie ids stored in top10_items, in list order
for k in top10_items:
  recommended_title = new_ratings.loc[new_ratings['movieId'] == k, 'title']
  print(recommended_title)

583    Terminator 2: Judgment Day (1991)
Name: title, dtype: object
11400    Children of Men (2006)
Name: title, dtype: object
4054    Hannibal (2001)
Name: title, dtype: object
12583    Forgetting Sarah Marshall (2008)
Name: title, dtype: object
4232    Magnificent Seven, The (1960)
Name: title, dtype: object
1962    Flight of the Navigator (1986)
Name: title, dtype: object
10435    Capote (2005)
Name: title, dtype: object
1229    Better Off Dead... (1985)
Name: title, dtype: object
4633    Rat Race (2001)
Name: title, dtype: object
5014    Super Troopers (2001)
Name: title, dtype: object
