Recommender Net with Label Encoder (MovieLens)

In [None]:
!pip install gdown pandas scikit-learn torch tqdm --quiet

import gdown
# Google Drive download links for the four CSV files, keyed by local file stem.
file_urls = dict(
    movies="https://drive.google.com/uc?id=1z1UUE-TRQhaiydYF-NaoLLJO9Qb0JgXW",
    ratings="https://drive.google.com/uc?id=1v-DFz7NKN_nqJC_KSKILHsOmG2-dLHtv",
    links="https://drive.google.com/uc?id=1nCG7xhkTcU15jKZnqA6twR4IHn_cn6qe",
    tags="https://drive.google.com/uc?id=12RifUdEgvsdIKY-ofJhYHi4XdUoDYOnk",
)
# Each entry is saved as "<key>.csv" in the current working directory.
for name in file_urls:
    url = file_urls[name]
    gdown.download(url, f"{name}.csv", quiet=False)

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Load the four CSV files downloaded above.
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")
tags = pd.read_csv("tags.csv")

# Collapse all tags of a movie into one '|'-separated string. The unique tags
# are sorted so the joined string is identical from run to run (plain set
# iteration order for strings varies between interpreter sessions).
tags['tag'] = tags['tag'].fillna('').astype(str)
tags_agg = (
    tags.groupby("movieId")['tag']
    .apply(lambda x: '|'.join(sorted(set(x))))
    .reset_index()
)
movie_features = movies.merge(links, on='movieId', how='left').merge(tags_agg, on='movieId', how='left')
# One row per rating, enriched with the movie-level columns.
data = ratings.merge(movie_features, on='movieId', how='left')

# Map raw user/movie ids to contiguous integer indices for the embedding tables.
user_enc = LabelEncoder()
item_enc = LabelEncoder()
data['user_enc'] = user_enc.fit_transform(data['userId'])
data['item_enc'] = item_enc.fit_transform(data['movieId'])

# Multi-hot encode genres. A missing genre string becomes an empty list;
# ''.split('|') would give [''] and create a spurious '' genre column.
data['genres'] = data['genres'].fillna('').apply(lambda x: x.split('|') if x else [])
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['genres'])

X = {
    'user': data['user_enc'].values,
    'item': data['item_enc'].values,
    'genre': genre_encoded
}
y = data['rating'].values.astype(np.float32)

# Random 90/10 split over rating rows; positions index into X, y and data alike.
train_idx, val_idx = train_test_split(np.arange(len(y)), test_size=0.1, random_state=42)
X_train = {k: v[train_idx] for k, v in X.items()}
X_val = {k: v[val_idx] for k, v in X.items()}
y_train, y_val = y[train_idx], y[val_idx]

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm

class MovieLensDataset(Dataset):
    """Tensor-backed dataset yielding (user, item, genre, rating) samples.

    Args:
        X: Mapping with array-like entries under 'user', 'item' and 'genre'.
        y: Array-like of target ratings, aligned with the entries of ``X``.
    """

    def __init__(self, X, y):
        # Indices are stored as int64 for embedding lookups; features and
        # targets as float32.
        self.user = torch.tensor(X['user'], dtype=torch.long)
        self.item = torch.tensor(X['item'], dtype=torch.long)
        self.genre = torch.tensor(X['genre'], dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (length of the user tensor)."""
        n_samples = len(self.user)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(user, item, genre, rating)`` tuple at position ``idx``."""
        sample = (
            self.user[idx],
            self.item[idx],
            self.genre[idx],
            self.y[idx],
        )
        return sample

class RecommenderNet(nn.Module):
    """Rating regressor over user/item embeddings plus a genre feature vector.

    The user embedding, item embedding and genre vector are concatenated and
    passed through one hidden ReLU layer to a single scalar output.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        genre_dim: Width of the genre feature vector given to ``forward``.
        emb_dim: Size of each user/item embedding (default 64).
        hidden_dim: Width of the hidden layer (default 128).
    """

    def __init__(self, n_users, n_items, genre_dim, emb_dim=64, hidden_dim=128):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self.fc1 = nn.Linear(emb_dim + emb_dim + genre_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, user, item, genre):
        """Return one predicted rating per row of the batch.

        Args:
            user: 1-D long tensor of user indices.
            item: 1-D long tensor of item indices, same length as ``user``.
            genre: 2-D float tensor with ``genre_dim`` columns, one row per sample.

        Returns:
            1-D tensor with one prediction per input row.
        """
        u = self.user_emb(user)
        i = self.item_emb(item)
        x = torch.cat([u, i, genre], dim=1)
        x = torch.relu(self.fc1(x))
        # Squeeze only the trailing feature axis: a bare squeeze() would also
        # drop the batch axis for a batch of one and return a 0-d tensor that
        # no longer matches the target's shape in the loss.
        return self.fc2(x).squeeze(-1)

# Embedding-table sizes and genre width are read off the encoded data.
genre_dim = genre_encoded.shape[1]
n_items = data['item_enc'].nunique()
n_users = data['user_enc'].nunique()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RecommenderNet(n_users, n_items, genre_dim)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
train_loader = DataLoader(
    MovieLensDataset(X_train, y_train),
    batch_size=512,
    shuffle=True,
)

# Five epochs of mini-batch training against the mean-squared-error loss.
model.train()
for epoch in range(5):
    for batch in tqdm(train_loader):
        user, item, genre, target = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        preds = model(user, item, genre)
        loss = loss_fn(preds, target)
        loss.backward()
        optimizer.step()

# Predict ratings for the whole validation split in a single forward pass.
model.eval()
with torch.no_grad():
    val_inputs = [
        torch.tensor(X_val['user'], dtype=torch.long),
        torch.tensor(X_val['item'], dtype=torch.long),
        torch.tensor(X_val['genre'], dtype=torch.float32),
    ]
    user, item, genre = [t.to(device) for t in val_inputs]
    preds_val = model(user, item, genre).cpu().numpy()

from collections import defaultdict
from sklearn.metrics import mean_squared_error, mean_absolute_error

val_data = data.iloc[val_idx]

# Items a user rated >= 4 in the validation split count as relevant for that user.
relevant_items_by_user = (
    val_data[val_data["rating"] >= 4]
    .groupby("user_enc")["item_enc"]
    .apply(set)
    .to_dict()
)

all_items = np.unique(data['item_enc'])
top_k_preds = []
ground_truth = []
K = 10

# Genre vector of every item, indexed by its encoded id. Each candidate must be
# scored with its OWN genres; reusing the genre row of the user's first
# validation item for all candidates feeds the model wrong features.
item_genres = np.zeros(
    (int(all_items.max()) + 1, genre_encoded.shape[1]), dtype=np.float32
)
item_genres[data['item_enc'].values] = genre_encoded

model.eval()
with torch.no_grad():
    # The candidate items and their genres are the same for every user, so
    # build these tensors once instead of once per user.
    item_tensor = torch.tensor(all_items, dtype=torch.long).to(device)
    genre_tensor = torch.tensor(item_genres[all_items], dtype=torch.float32).to(device)

    # NOTE(review): items the user already rated in the training split are not
    # excluded from the candidates and can occupy top-K slots — confirm whether
    # that is the intended evaluation protocol.
    for user in np.unique(X_val['user']):
        user_tensor = torch.full(
            (len(all_items),), int(user), dtype=torch.long, device=device
        )

        scores = model(user_tensor, item_tensor, genre_tensor).cpu().numpy()
        # argsort is ascending: take the last K indices and reverse for best-first.
        top_items = all_items[np.argsort(scores)[-K:][::-1]]

        top_k_preds.append(list(top_items))
        ground_truth.append(list(relevant_items_by_user.get(user, [])))

#Metric functions
def precision_at_k(y_true, y_pred, k):
    """Return the mean over users of (relevant items in the top-k) / k.

    Args:
        y_true: One collection of relevant items per user.
        y_pred: One ranked recommendation list per user, aligned with ``y_true``.
        k: Cutoff applied to each recommendation list.
    """
    per_user = []
    for recommended, relevant in zip(y_pred, y_true):
        hits = set(recommended[:k]) & set(relevant)
        per_user.append(len(hits) / k)
    return np.mean(per_user)

def recall_at_k(y_true, y_pred, k):
    """Return the mean over users of (relevant items in the top-k) / (relevant items).

    Users with no relevant items contribute 0.

    Args:
        y_true: One collection of relevant items per user.
        y_pred: One ranked recommendation list per user, aligned with ``y_true``.
        k: Cutoff applied to each recommendation list.
    """
    per_user = []
    for recommended, relevant in zip(y_pred, y_true):
        if not relevant:
            per_user.append(0)
            continue
        found = set(recommended[:k]) & set(relevant)
        per_user.append(len(found) / len(relevant))
    return np.mean(per_user)

def ndcg_at_k(y_true, y_pred, k):
    """Return the mean binary-relevance NDCG@k over users.

    The ideal DCG places ``min(len(relevant), k)`` hits at the top ranks. It
    must come from the ground truth: sorting the predicted hit vector ignores
    relevant items missing from the recommendations and inflates the score.

    Args:
        y_true: One collection of relevant items per user.
        y_pred: One ranked recommendation list per user, aligned with ``y_true``.
        k: Cutoff applied to each recommendation list.

    Returns:
        Mean NDCG@k; a user with no relevant items contributes 0.
    """
    def dcg(rel):
        # Gain at 0-based rank i is discounted by log2(i + 2).
        return sum(r / np.log2(i + 2) for i, r in enumerate(rel))

    scores = []
    for true, pred in zip(y_true, y_pred):
        relevant = set(true)  # O(1) membership tests
        rel = [1 if p in relevant else 0 for p in pred[:k]]
        ideal_dcg = dcg([1] * min(len(relevant), k))
        scores.append(dcg(rel) / ideal_dcg if ideal_dcg > 0 else 0)
    return np.mean(scores)

def mean_average_precision_at_k(y_true, y_pred, k):
    """Return the mean over users of average precision at cutoff ``k``.

    Per user, precision is accumulated at every rank holding a relevant item
    and divided by ``min(len(relevant), k)``; users with no relevant items
    contribute 0.

    Args:
        y_true: One collection of relevant items per user.
        y_pred: One ranked recommendation list per user, aligned with ``y_true``.
        k: Cutoff applied to each recommendation list.
    """
    ap_values = []
    for relevant, ranked in zip(y_true, y_pred):
        hit_count = 0
        precision_total = 0
        for rank, candidate in enumerate(ranked[:k], start=1):
            if candidate not in relevant:
                continue
            hit_count += 1
            precision_total += hit_count / rank
        if relevant:
            ap_values.append(precision_total / min(len(relevant), k))
        else:
            ap_values.append(0)
    return np.mean(ap_values)

# Compute the rating-regression errors and the top-K ranking metrics.
rmse = np.sqrt(mean_squared_error(y_val, preds_val))
mae = mean_absolute_error(y_val, preds_val)
precision = precision_at_k(ground_truth, top_k_preds, K)
recall = recall_at_k(ground_truth, top_k_preds, K)
ndcg = ndcg_at_k(ground_truth, top_k_preds, K)
mapk = mean_average_precision_at_k(ground_truth, top_k_preds, K)

import pandas as pd
# Column labels are derived from K so they always state the cutoff actually used.
metrics_table_corrected = pd.DataFrame({
    "Model": ["Recommender Net"],
    "RMSE": [rmse],
    "MAE": [mae],
    f"Precision@{K}": [precision],
    f"Recall@{K}": [recall],
    f"NDCG@{K}": [ndcg],
    f"MAP@{K}": [mapk]
})

# Bare expression so the notebook renders the table as the cell output.
metrics_table_corrected


RMSE 0.814063

MAE 0.616479

Precision@10 0.006991

Recall@10 0.011049

NDCG@10 0.034319

MAP@10 0.004726