In [None]:
!pip install torch
!pip install tensorflow


In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Load the ratings table; later cells read its `userId`, `movieId` and
# `rating` columns.
# NOTE(review): the path is absolute (filesystem root) — confirm it matches
# the environment this notebook runs in.
df = pd.read_csv("/ratings.csv")

In [None]:
# Print a summary of the columns, their dtypes and non-null counts.
df.info()

In [None]:
# Number of distinct values in the `userId` column.
df.userId.nunique()

In [None]:
# Number of distinct values in the `movieId` column.
df.movieId.nunique()

In [None]:
# How many rows carry each distinct rating value.
df.rating.value_counts()

In [None]:
# (rows, columns) of the ratings table.
df.shape

In [None]:
#creating dataset and defining methods
class MovieDataset:
    """Map-style dataset of (user, movie, rating) triples.

    Args:
        users: indexable sequence of integer-encoded user ids.
        movies: indexable sequence of integer-encoded movie ids.
        ratings: indexable sequence of numeric ratings, aligned with
            ``users`` and ``movies``.

    Indexing returns a dict with the keys ``"users"``, ``"movies"`` and
    ``"ratings"``, each holding a scalar tensor.
    """

    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        """Return the number of samples (one per entry in ``users``)."""
        return len(self.users)

    def __getitem__(self, item):
        """Return the sample at position ``item`` as a dict of tensors."""
        return {
            # Ids are used as embedding indices, so they must be integer tensors.
            "users": torch.tensor(self.users[item], dtype=torch.long),
            "movies": torch.tensor(self.movies[item], dtype=torch.long),
            # Ratings are a regression target and can be fractional (e.g. 3.5);
            # an integer dtype would silently truncate them.
            "ratings": torch.tensor(self.ratings[item], dtype=torch.float32),
        }

In [None]:
#defining class for converting features in embeddings
class RecSysModel(nn.Module):
    """Rating regressor built on learned user and movie embeddings.

    Every user id and movie id is looked up in its own 32-dimensional
    embedding table; the two vectors are concatenated and fed through a
    single linear layer that emits one score per (user, movie) pair.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
    """

    def __init__(self, n_users, n_movies):
        super().__init__()
        embed_dim = 32
        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.movie_embed = nn.Embedding(n_movies, embed_dim)

        # The linear head consumes the concatenated user + movie vectors.
        self.out = nn.Linear(2 * embed_dim, 1)

    def forward(self, users, movies, ratings=None):
        """Score a batch of (user, movie) pairs.

        Args:
            users: 1-D integer tensor of user indices.
            movies: 1-D integer tensor of movie indices, same length.
            ratings: accepted for call compatibility; not used.

        Returns:
            Tensor of shape ``(batch, 1)`` with the predicted scores.
        """
        joined = torch.cat(
            (self.user_embed(users), self.movie_embed(movies)),
            dim=1,
        )
        return self.out(joined)


In [None]:
# Re-encode the raw user/movie ids as integer labels (one encoder per column,
# kept around so the number of classes can size the embedding tables).
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df["userId"] = lbl_user.fit_transform(df["userId"].values)
df["movieId"] = lbl_movie.fit_transform(df["movieId"].values)

# Hold out 10% of the rows for validation, stratified on the rating value.
df_train, df_valid = model_selection.train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df["rating"].values,
)

# Wrap each split's columns in a MovieDataset.
train_dataset, valid_dataset = (
    MovieDataset(
        users=split.userId.values,
        movies=split.movieId.values,
        ratings=split.rating.values,
    )
    for split in (df_train, df_valid)
)

In [None]:
# Build shuffling loaders that yield mini-batches of 4 samples, using two
# worker processes each.
train_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)
validation_loader = DataLoader(
    valid_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
)

# Pull one training batch and print it as a sanity check.
dataiter = iter(train_loader)
dataloader_data = next(dataiter)
print(dataloader_data)

In [None]:
# Define the model: embedding tables are sized by the number of classes each
# label encoder learned, and the model is moved to the selected device.
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_),
).to(device)

# Adam with weight decay; the StepLR scheduler multiplies the learning rate
# by 0.7 every 3 scheduler steps.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0009, weight_decay=1e-5)
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

# Mean squared error between predicted and true ratings.
loss_func = nn.MSELoss()

In [None]:
# Train the model, recording the mean per-sample loss roughly every
# `plot_steps` training samples.
epochs = 1
plot_steps, print_steps = 5000, 5000
step_cnt = 0              # total number of samples seen so far
total_loss = 0            # summed per-sample loss since the last report
samples_since_report = 0  # samples contributing to `total_loss`
all_losses_list = []

model.train()
for epoch_i in range(epochs):
  for i, train_data in enumerate(train_loader):
    # Inputs and targets must live on the same device as the model.
    users = train_data['users'].to(device)
    movies = train_data['movies'].to(device)
    # Shape targets as (batch, 1) to match the model output; using -1 for the
    # batch dimension keeps a smaller final batch from raising.
    rating = train_data['ratings'].to(device=device, dtype=torch.float32).view(-1, 1)

    output = model(users, movies)
    loss = loss_func(output, rating)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    batch_len = len(users)
    # `loss` is the batch mean; weight it by the batch size so the running
    # total can be averaged per sample.
    total_loss += loss.item() * batch_len
    step_cnt += batch_len
    samples_since_report += batch_len

    # `>=` rather than an exact modulo test, so a report is never skipped when
    # the sample count steps over a multiple of `plot_steps`.
    if samples_since_report >= plot_steps:
      avg_loss = total_loss / samples_since_report
      print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
      all_losses_list.append(avg_loss)
      total_loss = 0
      samples_since_report = 0

  # Advance the learning-rate schedule once per epoch.
  sch.step()

In [None]:
# Plot the loss values recorded during training.
plt.figure()
plt.plot(all_losses_list)
plt.xlabel("Logging interval")
plt.ylabel("Average training loss")
plt.title("Training loss")
# Call show(); a bare `plt.show` only echoes the function object.
plt.show()

In [None]:
# Evaluate the model on the validation set with RMSE.
from sklearn.metrics import mean_squared_error

model_output_list = []
target_rating_list = []
model.eval()
with torch.no_grad():
  for i, batched_data in enumerate(validation_loader):
    # Inputs must live on the same device as the model.
    model_output = model(batched_data['users'].to(device),
                         batched_data['movies'].to(device))
    target_rating = batched_data["ratings"]

    # Collect per-sample values: averaging inside each batch first would
    # cancel individual errors and understate the RMSE.
    model_output_list.extend(model_output.reshape(-1).tolist())
    target_rating_list.extend(target_rating.reshape(-1).tolist())

    # Show only the first batch as a sanity check instead of printing every one.
    if i == 0:
      print(f"model output: {model_output}")
      print(f"target rating: {target_rating}")

# Take the square root explicitly; the `squared=False` keyword is not accepted
# by recent scikit-learn releases.
rms = np.sqrt(mean_squared_error(target_rating_list, model_output_list))
print(f"rms: {rms}")

In [None]:
# Collect, for every user in the validation set, the list of
# (predicted rating, true rating) pairs used for precision/recall.
from collections import defaultdict
user_est_true = defaultdict(list)

# Make sure the model is in inference mode even if this cell is run on its own.
model.eval()
with torch.no_grad():
  for batched_data in validation_loader:
    users = batched_data['users']
    movies = batched_data['movies']
    ratings = batched_data['ratings']

    # Inputs must live on the same device as the model.
    model_output = model(users.to(device), movies.to(device))

    # The model output has one column; flatten it to one prediction per sample.
    predictions = model_output.reshape(-1).tolist()
    for user_id, pred_rating, true_rating in zip(users.tolist(), predictions, ratings.tolist()):
      user_est_true[user_id].append((pred_rating, true_rating))



In [None]:
# Per-user precision@k and recall@k. An item counts as relevant when its true
# rating reaches `threshold`, and as recommended when it is among the user's
# top-k predictions and its predicted rating reaches `threshold`.
precesions = {}
recalls = {}

k = 10
threshold = 3.5
for uid, user_ratings in user_est_true.items():
  # Rank this user's items by predicted rating, best first (in place).
  user_ratings.sort(key=lambda pair: pair[0], reverse=True)

  # Relevant items across all of the user's validation ratings.
  n_rel = sum(true_r >= threshold for _, true_r in user_ratings)

  # Recommended items within the top k.
  n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])

  # Items within the top k that are both relevant and recommended.
  n_rel_and_rec_k = sum(
      (true_r >= threshold) and (est >= threshold)
      for est, true_r in user_ratings[:k]
  )

  print(f"uid {uid}, n_rel {n_rel}, n_rec_k {n_rec_k}, n_rel_and_rec_k {n_rel_and_rec_k}")

  # Both ratios fall back to 0 when their denominator is zero.
  if n_rec_k != 0:
    precesions[uid] = n_rel_and_rec_k / n_rec_k
  else:
    precesions[uid] = 0

  if n_rel != 0:
    recalls[uid] = n_rel_and_rec_k / n_rel
  else:
    recalls[uid] = 0


In [None]:
# Report the unweighted mean of the per-user precision and recall values.
print(f"precsion @ {k}: {sum(precesions.values())/len(precesions)}")
print(f"recall @ {k}: {sum(recalls.values())/len(recalls)}")