In [2]:
#!pip install torch==1.12.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.12.0
  Downloading torch-1.12.0-cp37-cp37m-manylinux1_x86_64.whl (776.3 MB)
[K     |████████████████████████████████| 776.3 MB 18 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu113 requires torch==1.12.1, but you have torch 1.12.0 which is incompatible.
torchtext 0.13.1 requires torch==1.12.1, but you have torch 1.12.0 which is incompatible.
torchaudio 0.12.1+cu113 requires torch==1.12.1, but you have torch 1.12.0 which is incompatible.[0m
Successfully installed torch-1.12.0


In [5]:
import matplotlib.pyplot as plt
import torch
import torchvision
import pandas as pd
import numpy as np
from torch import nn
from torchvision import transforms
#!pip install torchinfo
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ratings = pd.read_csv('ratings.csv',parse_dates=['timestamp'])
rand_userIds = np.random.choice(ratings['userId'].unique(),size=int(len(ratings['userId'].unique())*0.15),replace=False)
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]
print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

There are 13076 rows of data from 91 users


In [7]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
59858,387,4571,2.5,1095041258
16749,105,91529,4.0,1446571703
97734,606,2212,4.0,1171326613
92434,597,2076,4.0,941639968
29578,202,1641,4.0,974923773


In [8]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
train_ratings = ratings[ratings['rank_latest'] > 2]
test_ratings = ratings[ratings['rank_latest'] == 1]
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
train_ratings.head()

Unnamed: 0,userId,movieId,rating
1119,10,296,1.0
1120,10,356,3.5
1121,10,588,4.0
1122,10,597,3.5
1123,10,912,4.0


In [10]:
train_ratings.loc[:, 'rating'] = 1
train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
54111,356,60293,1
29768,202,3704,1
92543,597,2707,1
22419,152,89864,1
13960,89,129354,1


In [11]:
from tqdm.notebook import tqdm
all_movieIds = ratings['movieId'].unique()
users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))
num_negatives = 3
for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1)
    for _ in range(num_negatives):
        negative_item = np.random.choice(all_movieIds) 
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0)

  0%|          | 0/12894 [00:00<?, ?it/s]

In [12]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 3
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [13]:
class NCF(nn.Module):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))
        vector = nn.ReLU()(self.fc3(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred

In [14]:
train_dataloader=DataLoader(MovieLensTrainDataset(ratings, all_movieIds),batch_size=512, num_workers=4)
test_dataloader=DataLoader(MovieLensTrainDataset(ratings, all_movieIds),batch_size=512, num_workers=4)

  cpuset_checked))


In [15]:
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1
all_movieIds = ratings['movieId'].unique()
model = NCF(num_users, num_items, train_ratings, all_movieIds)

In [16]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = torch.nn.BCELoss()
losses = []
for epoch in range(50):
  for batch_idx, (user_input, item_input,labels) in enumerate(train_dataloader):
    predicted_labels = model(user_input, item_input)
    loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
    losses.append(loss.item())
    model.zero_grad()
    loss.backward()
    optimizer.step()

In [17]:
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100),torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/91 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.64


In [19]:
import utils
utils.save_model(model=model,
                 target_dir="models",
                 model_name="ncf.pth")

[INFO] Saving model to: models/ncf.pth


In [20]:
from pathlib import Path
loaded_model = Path("models/ncf.pth")

In [23]:
test_ratings.head()

Unnamed: 0,userId,movieId,rating
1125,10,1088,3.0
1274,11,368,3.0
1323,12,39,4.0
4953,32,224,4.0
6476,44,1639,5.0


In [26]:
test_4=test_ratings[test_ratings['userId']==10]

In [27]:
test_4.head()

Unnamed: 0,userId,movieId,rating
1125,10,1088,3.0


In [55]:
# User-item pairs for testing
test_user_item_set = set(zip(test_4['userId'], test_4['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*50),torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    print(top10_items)
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/1 [00:00<?, ?it/s]

[1569, 1088, 1252, 51709, 2524, 5378, 1537, 2672, 2138, 688]
The Hit Ratio @ 10 is 1.00


In [39]:
u=test_4['userId']
u=u.values[0]
item=test_4['movieId']
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()
interacted_items = user_interacted_items[u]
not_interacted_items = set(all_movieIds) - set(interacted_items)
selected_not_interacted = list(np.random.choice(list(not_interacted_items), 1000))
test_items = selected_not_interacted
predicted_labels = np.squeeze(model(torch.tensor([u]*100),torch.tensor(test_items)).detach().numpy())
top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
print(top10_items)        

[261, 7438, 3593, 89904, 52287, 8810, 159, 76251, 117851, 2717]


In [36]:
u=test_4['userId']
print(u.values[0])

10


In [58]:
def getrecommendations(userId):
  u=userId
  user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()
  interacted_items = user_interacted_items[u]
  not_interacted_items = set(all_movieIds) - set(interacted_items)
  selected_not_interacted = list(np.random.choice(list(not_interacted_items), 1000))
  test_items = selected_not_interacted
  predicted_labels = np.squeeze(model(torch.tensor([u]*1000),torch.tensor(test_items)).detach().numpy())
  top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
  return top10_items

In [59]:
recommendations=getrecommendations(10)

[1198, 1198, 47465, 3408, 2724, 1391, 919, 5333, 2502, 948]


In [61]:
movies=pd.read_csv('movies.csv')