Content based Recommender Net TF-IDF Movielens

In [None]:
# Content-Based Recommender System using TF-IDF and Ratings (MovieLens)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

# Load datasets
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

# Merge tags into movies
tagged = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
movies = pd.merge(movies, tagged, on='movieId', how='left')
movies['tag'] = movies['tag'].fillna('')

# Combine genres and tags into one metadata string
movies['metadata'] = movies['genres'].str.replace('|', ' ', regex=False) + ' ' + movies['tag']

# TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['metadata'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
movie_indices = pd.Series(movies.index, index=movies['movieId'])

# Merge ratings into movies (average rating per movie)
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
avg_ratings.columns = ['movieId', 'avg_rating']
movies = pd.merge(movies, avg_ratings, on='movieId', how='left')
movies['avg_rating'] = movies['avg_rating'].fillna(0)

# Recommendation function based on TF-IDF similarity and rating boost
def content_based_recommendations(movie_id, top_n=10, rating_weight=0.2):
    idx = movie_indices[movie_id]
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Enhance similarity with average ratings
    enhanced_scores = []
    for i, score in sim_scores:
        rating_boost = rating_weight * movies.iloc[i]['avg_rating'] / 5  # normalize rating to 0-1 scale
        enhanced_scores.append((i, score + rating_boost))

    enhanced_scores = sorted(enhanced_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices_rec = [i[0] for i in enhanced_scores]
    return movie_indices_rec
from sklearn.metrics import ndcg_score
from collections import defaultdict

# Evaluation with NDCG, Precision, Recall, MAP
def evaluate_all_metrics(sample_size=100, top_n=10):
    sample = ratings[ratings['rating'] >= 4.0].groupby('userId').sample(n=1, random_state=42)

    ndcg_scores = []
    precisions = []
    recalls = []
    average_precisions = []

    for _, row in sample.iterrows():
        movie_id = row['movieId']
        relevant_idx = movie_indices.get(movie_id)
        if pd.isna(relevant_idx):
            continue

        recommendations = content_based_recommendations(movie_id, top_n=top_n)

        # NDCG calculation
        y_true = np.zeros((1, len(movies)))
        y_score = np.zeros((1, len(movies)))
        y_true[0, int(relevant_idx)] = 1
        for rank, idx in enumerate(recommendations):
            y_score[0, idx] = top_n - rank

        ndcg = ndcg_score(y_true, y_score)
        ndcg_scores.append(ndcg)

        # Precision, Recall, MAP calculation
        relevant_set = {int(relevant_idx)}
        recommended_set = set(recommendations)

        hits = relevant_set & recommended_set
        precision = len(hits) / top_n
        recall = len(hits) / len(relevant_set)  # always 1 relevant item here
        precisions.append(precision)
        recalls.append(recall)

        # MAP
        ap = 0
        for i, rec in enumerate(recommendations):
            if rec in relevant_set:
                ap = 1 / (i + 1)
                break
        average_precisions.append(ap)

    return {
        "Precision@10": np.mean(precisions),
        "Recall@10": np.mean(recalls),
        "NDCG@10": np.mean(ndcg_scores),
        "MAP@10": np.mean(average_precisions)
    }

# Example usage
results = evaluate_all_metrics(sample_size=100, top_n=10)
metrics_table = pd.DataFrame([{
    "Model": "TF-IDF Content-Based",
    "Precision@10": results["Precision@10"],
    "Recall@10": results["Recall@10"],
    "NDCG@10": results["NDCG@10"],
    "MAP@10": results["MAP@10"]
}])

print(metrics_table)

Precision@10  0.015435   

Recall@10     0.154351

NDCG@10       0.164802

MAP@10.       0.072563
