Recommender Net with Label Encoder Movielens

In [None]:
import pandas as pd
import numpy as np
!pip install rank_bm25
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')  # tokenizer data for word_tokenize; recent NLTK releases use 'punkt_tab' in place of the older 'punkt'

# Load datasets (the three MovieLens CSV files are read from the working directory)
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

# Merge tags into movies: one space-separated tag string per movie.
# Rows without a tag are dropped and the remaining tags are cast to str,
# because pandas reads an empty cell as float NaN and ' '.join() raises
# TypeError on non-string items.
tagged = (
    tags.dropna(subset=['tag'])
    .assign(tag=lambda df: df['tag'].astype(str))
    .groupby('movieId')['tag']
    .apply(' '.join)
    .reset_index()
)
movies = pd.merge(movies, tagged, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres and tags into one metadata string.
# A missing genres value is treated as empty so that the later .lower()
# call always receives a string.
movies['metadata'] = (
    movies['genres'].fillna('').str.replace('|', ' ', regex=False)
    + ' '
    + movies['tag']
)

# Tokenize metadata for BM25 (one token list per movie, in row order)
tokenized_corpus = movies['metadata'].apply(lambda x: word_tokenize(x.lower())).tolist()
bm25 = BM25Okapi(tokenized_corpus)

# Merge ratings into movies (average rating per movie); unrated movies get 0
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
avg_ratings.columns = ['movieId', 'avg_rating']
movies = pd.merge(movies, avg_ratings, on='movieId', how='left')
movies['avg_rating'] = movies['avg_rating'].fillna(0)

# movieId -> row label lookup, built from the final frame. avg_ratings has
# one row per movieId, so the left merge above keeps the row order that
# tokenized_corpus / bm25 were built from.
movie_indices = pd.Series(movies.index, index=movies['movieId'])

# Recommendation function based on BM25 and rating boost
def content_based_recommendations(movie_id, top_n=10, rating_weight=0.2):
    """Return row positions of the movies ranked closest to ``movie_id``.

    The metadata tokens of the query movie are used as a BM25 query against
    the whole corpus. Each movie's BM25 score is then increased by
    ``rating_weight * avg_rating / 5`` before ranking.

    Args:
        movie_id: ``movieId`` of the query movie; must be a label of the
            module-level ``movie_indices`` Series.
        top_n: Maximum number of recommendations to return.
        rating_weight: Weight applied to ``avg_rating / 5`` when boosting
            the BM25 score.

    Returns:
        A list of integer row positions into ``movies``, ordered by
        descending boosted score. The top-ranked entry is skipped.

    Raises:
        KeyError: If ``movie_id`` is not present in ``movie_indices``.
    """
    idx = movie_indices[movie_id]
    bm25_scores = np.asarray(bm25.get_scores(tokenized_corpus[idx]))

    # Enhance BM25 score with average rating. One vectorised expression
    # replaces a per-row ``movies.iloc[i]`` lookup over the whole corpus.
    rating_boost = rating_weight * movies['avg_rating'].to_numpy() / 5
    enhanced_scores = bm25_scores + rating_boost

    # A stable sort on the negated scores gives descending order while
    # keeping equal scores in their original row order.
    ranking = np.argsort(-enhanced_scores, kind='stable')

    # NOTE(review): rank 0 is dropped on the assumption that it is the query
    # movie itself. A movie with identical metadata and a higher average
    # rating can outrank the query, in which case the query movie is
    # returned as a recommendation. Left unchanged because the evaluation
    # code in this file counts hits on the query movie.
    return ranking[1:top_n + 1].tolist()

# Evaluation with NDCG
def evaluate_ndcg(sample_size=100, top_n=10):
    """Return the mean NDCG@``top_n`` of the BM25 recommender.

    One rating of at least 4.0 is drawn per user (fixed seed). The movie of
    that rating is used both as the query and as the single relevant item,
    so a hit means the query movie appears in its own recommendation list.

    Args:
        sample_size: Maximum number of sampled ratings to evaluate. ``None``
            evaluates one rating for every user.
        top_n: Length of the recommendation list and NDCG cut-off.

    Returns:
        The mean NDCG over the evaluated samples, or 0 when no sample could
        be evaluated.
    """
    liked = ratings[ratings['rating'] >= 4.0]
    sample = liked.groupby('userId').sample(n=1, random_state=42)
    # Honour sample_size: cap the per-user sample with a seeded draw.
    if sample_size is not None and len(sample) > sample_size:
        sample = sample.sample(n=sample_size, random_state=42)

    n_movies = len(movies)
    ndcg_scores = []

    # Iterate the column directly: iterrows() would upcast movieId to float
    # before it is used as an index label.
    for movie_id in sample['movieId']:
        relevant_idx = movie_indices.get(movie_id)
        if pd.isna(relevant_idx):
            continue  # rated movie is missing from the movies table

        recommendations = content_based_recommendations(movie_id, top_n=top_n)

        y_true = np.zeros((1, n_movies))
        y_score = np.zeros((1, n_movies))

        y_true[0, int(relevant_idx)] = 1
        # Earlier ranks get larger scores; every other movie stays at 0.
        for rank, idx in enumerate(recommendations):
            y_score[0, idx] = top_n - rank

        # k=top_n restricts the metric to the recommended items. Without it
        # a missed item still earns a tie-averaged gain from the zero-score
        # tail.
        ndcg_scores.append(ndcg_score(y_true, y_score, k=top_n))

    return np.mean(ndcg_scores) if ndcg_scores else 0

# Example usage: run the NDCG evaluation once and report the mean score
ndcg_result = evaluate_ndcg(top_n=10, sample_size=100)
print(f"Average NDCG@10 Score using BM25: {ndcg_result:.4f}")

from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

def additional_metrics(sample_size=100, top_k=10):
    """Compute error and ranking metrics for the BM25 recommender.

    One rating of at least 4.0 is drawn per user (fixed seed). The movie of
    that rating is the query and the single relevant item. For the error
    metrics, the average rating of each recommended movie is compared
    against the sampled user's rating.

    Args:
        sample_size: Maximum number of sampled ratings to evaluate. ``None``
            evaluates one rating for every user.
        top_k: Length of the recommendation list and cut-off for the
            ranking metrics.

    Returns:
        A dict with the keys ``'RMSE'``, ``'MAE'``, ``'MAPE'``,
        ``'NDCG@{top_k}'``, ``'Precision@{top_k}'`` and ``'Recall@{top_k}'``.
        Every value is 0 when no sample could be evaluated.
    """
    liked = ratings[ratings['rating'] >= 4.0]
    sample = liked.groupby('userId').sample(n=1, random_state=42)
    # Honour sample_size: cap the per-user sample with a seeded draw.
    if sample_size is not None and len(sample) > sample_size:
        sample = sample.sample(n=sample_size, random_state=42)

    n_movies = len(movies)
    avg_rating = movies['avg_rating'].to_numpy()

    y_true_all = []
    y_pred_all = []

    ndcg_scores = []
    precision_scores = []
    recall_scores = []

    # zip over the columns keeps movieId as an integer; iterrows() would
    # upcast it to float before the index lookup.
    for user_movie_id, user_rating in zip(sample['movieId'], sample['rating']):
        relevant_idx = movie_indices.get(user_movie_id)

        if pd.isna(relevant_idx):
            continue  # rated movie is missing from the movies table
        relevant_idx = int(relevant_idx)

        rec_indices = content_based_recommendations(user_movie_id, top_n=top_k)

        # "Prediction" is the recommended movie's average rating; the ground
        # truth is the sampled user's high rating (>= 4.0).
        y_pred_all.extend(avg_rating[idx] for idx in rec_indices)
        y_true_all.extend([user_rating] * len(rec_indices))

        # Binary relevance vector
        y_true_bin = np.zeros(n_movies)
        y_true_bin[relevant_idx] = 1

        # Earlier ranks get larger scores; every other movie stays at 0.
        y_score_bin = np.zeros(n_movies)
        for rank, idx in enumerate(rec_indices):
            y_score_bin[idx] = top_k - rank

        # k=top_k restricts the metric to the recommended items.
        ndcg_scores.append(ndcg_score([y_true_bin], [y_score_bin], k=top_k))

        # Precision & Recall @K
        hits = 1 if relevant_idx in rec_indices else 0
        precision_scores.append(hits / top_k)
        recall_scores.append(hits / 1)  # Only one relevant item in this simplified case

    # Error Metrics
    if y_true_all:
        true_arr = np.array(y_true_all)
        pred_arr = np.array(y_pred_all)
        rmse = sqrt(mean_squared_error(true_arr, pred_arr))
        mae = mean_absolute_error(true_arr, pred_arr)
        mape = np.mean(np.abs((true_arr - pred_arr) / true_arr)) * 100
    else:
        rmse = mae = mape = 0

    # Guard the ranking means the same way as the error metrics, so an
    # empty sample yields 0 instead of NaN plus a RuntimeWarning.
    return {
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        f'NDCG@{top_k}': np.mean(ndcg_scores) if ndcg_scores else 0,
        f'Precision@{top_k}': np.mean(precision_scores) if precision_scores else 0,
        f'Recall@{top_k}': np.mean(recall_scores) if recall_scores else 0,
    }

# Compute the full metric set, then print one "name: value" line per metric
metrics_result = additional_metrics(top_k=10, sample_size=100)
print("\nEvaluation Metrics:")
for metric, value in metrics_result.items():
    print(f"{metric}: {value:.4f}")


RMSE: 1.0541

MAE: 0.8178

MAPE: 18.1852

NDCG@10: 0.1771

Precision@10: 0.0171

Recall@10: 0.1708