In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")



In [2]:
# rating_df = pd.read_csv('../data/interim/rating_df.csv')
rating_df = pd.read_csv('/kaggle/input/movielens/rating_df.csv')
rating_df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [3]:
train_df, val_df = train_test_split(rating_df, test_size=0.2, 
                                    stratify=rating_df['user_id'], random_state=42)

In [4]:
print(train_df.shape, val_df.shape)

(80000, 3) (20000, 3)


## DataLoader

In [5]:
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_id, item_id, rating = self.df.iloc[idx]
        sample = {"user": user_id - 1, "item": item_id - 1, "rating": rating}
        return sample
    
train_loader = DataLoader(MovieDataset(train_df), batch_size=16, shuffle=True)
val_loader = DataLoader(MovieDataset(val_df), batch_size=16, shuffle=False,)

## Matrix Factorization Model

Reference: https://towardsdatascience.com/recommendation-system-matrix-factorization-d61978660b4b

In [6]:
import torch
import torch.nn as nn
from torch import optim

class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=20):
        super(MatrixFactorization, self).__init__()
        self.user_embeddings = nn.Embedding(num_users, embedding_dim, sparse=True)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim, sparse=True)

    def forward(self, user_input, item_input):
        user_emb = self.user_embeddings(user_input)
        item_emb = self.item_embeddings(item_input)
        return torch.sum(user_emb * item_emb, dim=1)
    
num_user = 943
num_item = 1682

model = MatrixFactorization(num_user, num_item)

In [7]:
from tqdm import tqdm

def train(model, device, num_epochs, train_loader, valid_loader, loss_fn, optimizer):
    model.to(device)
    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_loss = 0
        with tqdm(
            total=len(train_df), desc=f"Epoch {epoch}/{num_epochs}", unit="items"
        ) as pbar:
            for batch in train_loader:
                user = batch['user']
                item = batch['item']
                rating = batch['rating']
                
                user, item, rating = user.to(device), item.to(device), rating.to(device)
                optimizer.zero_grad()

                outputs = model(user, item)
                loss = loss_fn(outputs, rating.float())
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                pbar.update(user.shape[0])
                pbar.set_postfix(**{"loss (batch)": loss.item()})
                
        model.eval()
        
        with tqdm(total=len(val_df), desc=f"Validation", unit="items") as pbar:
            with torch.no_grad():
                for batch in val_loader:
                    user = batch['user']
                    item = batch['item']
                    rating = batch['rating']
                    user, item, rating = user.to(device), item.to(device), rating.to(device)

                    outputs = model(user, item)
                    loss = loss_fn(outputs, rating)
            
                    pbar.update(user.shape[0])
                    epoch_loss += loss.item()
                    pbar.set_postfix(**{"loss (batch)": loss.item()})

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
num_epoch = 30
train(model, device, num_epoch, train_loader, val_loader, loss_fn, optimizer)

Epoch 1/30: 100%|██████████| 80000/80000 [00:15<00:00, 5321.33items/s, loss (batch)=30.6]
Validation: 100%|██████████| 20000/20000 [00:02<00:00, 6891.18items/s, loss (batch)=18]  
Epoch 2/30: 100%|██████████| 80000/80000 [00:14<00:00, 5525.66items/s, loss (batch)=22.1]
Validation: 100%|██████████| 20000/20000 [00:03<00:00, 6511.16items/s, loss (batch)=15.9]
Epoch 3/30: 100%|██████████| 80000/80000 [00:14<00:00, 5466.38items/s, loss (batch)=14.2]
Validation: 100%|██████████| 20000/20000 [00:02<00:00, 6881.42items/s, loss (batch)=14.2]
Epoch 4/30: 100%|██████████| 80000/80000 [00:14<00:00, 5405.47items/s, loss (batch)=16]  
Validation: 100%|██████████| 20000/20000 [00:02<00:00, 6820.36items/s, loss (batch)=12.1]
Epoch 5/30: 100%|██████████| 80000/80000 [00:14<00:00, 5444.51items/s, loss (batch)=4.07]
Validation: 100%|██████████| 20000/20000 [00:02<00:00, 6769.60items/s, loss (batch)=10.1]
Epoch 6/30: 100%|██████████| 80000/80000 [00:14<00:00, 5424.25items/s, loss (batch)=6.49]
Validation

In [9]:
torch.save(model.state_dict(), "matrix_factorixation.pt")

## Prediction

In [10]:
# model = MatrixFactorization(num_user, num_item)
# model.load_state_dict(torch.load('/kaggle/working/matrix_factorixation.pt'))

In [11]:
item_df = pd.read_csv('/kaggle/input/itemsmovie/item_df.csv')
# item_df = pd.read_csv('../data/interim/item_df.csv')
item_df.head()

Unnamed: 0,movie_id,movie_title,release_date,unknown,action,adventure,animation,childrens,comedy,crime,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [12]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   user_id   100000 non-null  int64
 1   movie_id  100000 non-null  int64
 2   rating    100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [13]:
from sklearn.preprocessing import MinMaxScaler

def predict_rating(rec_df, rating_df):
    model.eval()
    dataloader = DataLoader(MovieDataset(rec_df), batch_size=16, shuffle=False,)
    pbar = tqdm(dataloader, total=len(dataloader))
    preds = []
    for data in pbar:
        user = data['user']
        item = data['item']
        rating = data['rating']
        user, item, rating = user.to(device), item.to(device), rating.to(device)

        with torch.no_grad():
            preds.append(model(user, item))

    preds = torch.cat(preds).cpu().detach().numpy()
    movie_ids = rec_df['movie_id'].unique()
    
    # Calculate average ratings for movies in rec_df
    avg_ratings = rating_df[rating_df['movie_id'].isin(movie_ids)].groupby('movie_id')['rating'].mean()
    scaler = MinMaxScaler(feature_range=(1, 5)) # Scale average ratings to range (1, 5)
    avg_ratings = scaler.fit_transform(avg_ratings.values.reshape(-1, 1))
    avg_ratings = avg_ratings.flatten()

    rec_df['rating'] = rec_df['movie_id'].map(dict(zip(movie_ids, avg_ratings))).fillna(0)
    return rec_df, preds


def recommend_for_user(user_id, rating_df, item_df, top_n=10):
    rec_df = rating_df.query("user_id != @user_id")
    rec_df['user_id'] = user_id
    rec_df = rec_df.drop_duplicates(subset=['user_id', 'movie_id'])

    rec_df, preds = predict_rating(rec_df, rating_df)

    d = dict(zip(item_df.movie_id, item_df.movie_title))
    rec_df['title'] = rec_df['movie_id'].map(d)
    rec_df['rating'] = preds
    rec_df['rating'] = preds.clip(1, 5)
    rec_df = rec_df.sort_values('rating', ascending=False)

    #  Print top_n recommended movies with their ratings
    print('\nRECOMMENDATIONS')
    print(rec_df[['title','rating']].head(top_n))

    user_df = rating_df.query("user_id == @user_id")
    user_df['title'] = user_df['movie_id'].map(d)
    user_df = user_df.sort_values('rating', ascending=False)

    # Print top_n movies already watched by the user with their ratings
    print('\nWHAT ALREADY WATCHED')
    print(user_df[['title','rating']].head(top_n))

In [14]:
import random

user_id = random.choice(rating_df.user_id.values)
print(user_id)

recommend_for_user(user_id, rating_df, item_df)

398


100%|██████████| 106/106 [00:00<00:00, 846.96it/s]


RECOMMENDATIONS
                                                   title  rating
24592                                   Show, The (1995)     5.0
7288                                Nobody's Fool (1994)     5.0
5291                                   Out to Sea (1997)     5.0
1282                            Gay Divorcee, The (1934)     5.0
7341                                Mrs. Dalloway (1997)     5.0
4320   Nosferatu (Nosferatu, eine Symphonie des Graue...     5.0
424               American Werewolf in London, An (1981)     5.0
1498                               Paths of Glory (1957)     5.0
426                         Home for the Holidays (1995)     5.0
451                            Three Colors: Blue (1993)     5.0

WHAT ALREADY WATCHED
                                           title  rating
99373                           Toy Story (1995)       5
23123                        My Fair Lady (1964)       5
72668                 Singin' in the Rain (1952)       5
63569             




## Evaluation 

In [15]:
from sklearn.metrics import mean_squared_error, precision_score, recall_score
import numpy as np


def predict_rating_for_user(model, user_id, watched_movies, target_movie_id, device):
    model.eval()
    user_input = torch.tensor([user_id - 1], dtype=torch.long).to(device)
    watched_movie_input = torch.tensor([movie_id - 1 for movie_id in watched_movies], dtype=torch.long).to(device)
    target_movie_input = torch.tensor([target_movie_id - 1], dtype=torch.long).to(device)

    # Extract embeddings for the user and movies
    user_embedding = model.user_embeddings(user_input)
    watched_movie_embeddings = model.item_embeddings(watched_movie_input)
    target_movie_embedding = model.item_embeddings(target_movie_input)

    # Compute the dot product of user and target movie embeddings
    prediction = torch.sum(user_embedding * target_movie_embedding)

    return prediction.item()


true_positives = 0
false_positives = 0
false_negatives = 0

unique_user_ids = rating_df['user_id'].unique()
mse_scores = []

for user_id in tqdm(unique_user_ids, desc="Users", unit="user"):
    user_df = rating_df[rating_df['user_id'] == user_id].copy()

    for index, row in user_df.iterrows():
        movie_id = row['movie_id']
        user_df_excluded = user_df[user_df['movie_id'] != movie_id]

        pred_rating = round(predict_rating_for_user(model, user_id, np.array(user_df_excluded['movie_id']), movie_id, device))
        true_rating = row['rating']

        mse = mean_squared_error([true_rating], [pred_rating])
        mse_scores.append(mse)

        threshold = 4.5  
        if round(pred_rating) >= threshold and true_rating >= threshold:
            true_positives += 1
        elif round(pred_rating) >= threshold and true_rating < threshold:
            false_positives += 1
        elif round(pred_rating) < threshold and true_rating >= threshold:
            false_negatives += 1

average_mse = np.mean(mse_scores)
print(f"Average MSE: {average_mse}")

precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"Precision: {precision}")
print(f"Recall: {recall}")

Users: 100%|██████████| 943/943 [01:54<00:00,  8.22user/s]

Average MSE: 1.35172
Precision: 0.5308557205390095
Recall: 0.29173152209801423



