In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

In [3]:
df = pd.read_csv('./ml-latest-small/ratings_df.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  100836 non-null  int64  
 1   userId      100836 non-null  int64  
 2   movieId     100836 non-null  int64  
 3   rating      100836 non-null  float64
 4   timestamp   100836 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 3.8 MB


In [6]:
df.userId.nunique()

610

In [7]:
df.movieId.nunique()

9724

In [8]:
df.shape

(100836, 5)

In [9]:
class MovieDataset:
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings

    def __len__(self):
        return len(self.users)

    def __getitem__(self, item):
        users = self.users[item]
        movies = self.movies[item]
        ratings = self.ratings[item]
        return {
            "users": torch.tensor(users, dtype=torch.long),
            "movies": torch.tensor(movies, dtype=torch.long),
            "ratings": torch.tensor(ratings, dtype=torch.long),
        }

#### Model

In [10]:
class RecModel(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, 32)
        self.movie_embed = nn.Embedding(n_movies, 32)
        # user and movie should be concatenated
        self.out = nn.Linear(64,1)

    def forward(self, users, movies, ratings=None):
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        output = torch.cat([user_embeds, movie_embeds], dim=1)
        output = self.out(output)
        
        return output

In [11]:
# Encoding user and movie id to start from index 0
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df.userId = lbl_user.fit_transform(df.userId.values)
df.movieId = lbl_movie.fit_transform(df.movieId.values)

df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.1, random_state=42, stratify=df.rating.values
)

train_dataset = MovieDataset(
    users = df_train.userId.values,
    movies = df_train.movieId.values,
    ratings = df_train.rating.values
)

valid_dataset = MovieDataset(
    users = df_valid.userId.values,
    movies = df_valid.movieId.values,
    ratings = df_valid.rating.values
)

In [12]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=4,
                          shuffle=True,
                          num_workers=2)

validation_loader = DataLoader(dataset=valid_dataset,
                               batch_size=4,
                               shuffle=True,
                               num_workers=2)

dataiter = iter(train_loader)
dataloader_data=next(dataiter)
print(dataloader_data)

  "ratings": torch.tensor(ratings, dtype=torch.long),
  "ratings": torch.tensor(ratings, dtype=torch.long),
  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm
  "ratings": torch.tensor(ratings, dtype=torch.long),


{'users': tensor([ 65, 473,   5,  73]), 'movies': tensor([3885, 1913,  762, 1819]), 'ratings': tensor([5, 3, 4, 2])}


In [13]:
model = RecModel(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_),
).to(device)


optimizer = torch.optim.Adam(model.parameters())
sch = torch.optim.lr_scheduler.StepLR(optimizer,step_size=3, gamma=0.7)
loss_func = nn.MSELoss()


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
user_embed = nn.Embedding(len(lbl_user.classes_), 32)
movie_embed = nn.Embedding(len(lbl_movie.classes_), 32)

out = nn.Linear(64,1)

In [15]:
user_embeds = user_embed(dataloader_data['users'])
movie_embeds = movie_embed(dataloader_data['movies'])
print(f"user_embeds {user_embeds.size()}")
print(f"user_embeds {movie_embeds.size()}")

user_embeds torch.Size([4, 32])
user_embeds torch.Size([4, 32])


In [16]:
output = torch.cat([user_embeds, movie_embeds], dim=1)
print(f"output: {output.size()}")

output: torch.Size([4, 64])


In [20]:
with torch.no_grad():
    precisions = dict()
    recalls = dict()

    k=100
    threshold = 3.5
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in ratings[:k])

        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k]
        )

    precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

NameError: name 'user_est_true' is not defined

In [None]:
print(f" precision @ {k} : {sum(prec for prec in precisions.values()) / len(precision)}")
print(f" recall @ {k} : {sum(rec for rec in recalls.values()) / len(recalls)}")
