# Movie Recommender Systems

This notebook implements various movie recommendation approaches:
1. Popularity-based
2. Content-based Filtering
3. Collaborative Filtering
4. Matrix Factorization
5. Hybrid Approach

In [341]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import torch
import warnings
import asyncio
from functools import lru_cache
warnings.filterwarnings('ignore')

# Setup device and optimizations for Apple Silicon
device = torch.device('mps')
torch.backends.mps.enable_fallback_kernels = True
print(f"Using Apple Metal device: {device}")

# Advanced caching mechanism
class RecommendationCache:
    def __init__(self, max_size=1000):
        self.cache = {}
        self.max_size = max_size
    
    @lru_cache(maxsize=1000)
    def get_movie_features(self, movie_idx):
        return self.latent_matrix_gpu[movie_idx]
    
    def get(self, key):
        return self.cache.get(key)
    
    def set(self, key, value):
        if len(self.cache) >= self.max_size:
            self.cache.pop(next(iter(self.cache)))
        self.cache[key] = value

cache = RecommendationCache()

Using Apple Metal device: mps


## 1. Data Loading and Preprocessing

In [342]:
# Read the datasets
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
tags_df = pd.read_csv('ml-latest-small/tags.csv')


# Merge tags for each movie
tags_grouped = tags_df.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
movies_with_tags = pd.merge(movies_df, tags_grouped, on='movieId', how='left')
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

## 2. Popularity-based Recommender

In [343]:
def popularity_recommender(n_recommendations=10):
    # Calculate mean rating and number of ratings for each movie using training data
    movie_stats = train_ratings.groupby('movieId').agg({
        'rating': ['count', 'mean']
    }).reset_index()
    
    # Rename columns
    movie_stats.columns = ['movieId', 'rating_count', 'rating_mean']
    
    # Filter movies with minimum number of ratings (e.g., 100)
    popular_movies = movie_stats[movie_stats['rating_count'] >= 100]
    
    # Sort by rating mean and count
    popular_movies = popular_movies.sort_values(['rating_mean', 'rating_count'], ascending=[False, False])
    
    # Get movie titles
    recommendations = pd.merge(popular_movies, movies_df, on='movieId')
    
    return recommendations[['movieId', 'title', 'rating_mean', 'rating_count', 'genres']].head(n_recommendations) 

## 3. Content-based Filtering

In [344]:
# Create TF-IDF vectorizer and convert to dense GPU tensor
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_with_tags['tag'])

# Convert sparse matrix to dense GPU tensor directly
tfidf_matrix_gpu = torch.tensor(tfidf_matrix.toarray(), device=device, dtype=torch.float32)  # Use full precision for SVD

# Perform SVD on GPU using torch operations
U, S, V = torch.svd(tfidf_matrix_gpu)  # Already dense
latent_matrix_gpu = (U[:, :100] @ torch.diag(S[:100])).to(device=device)  # Keep in full precision for stability

# Cache for frequently accessed movie features
@lru_cache(maxsize=1000)
def get_cached_movie_features(movie_idx):
    return latent_matrix_gpu[movie_idx]

def content_based_recommender(movie_title, n_recommendations=10):
    # Get movie index and features from cache
    movie_idx = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    query_vector = get_cached_movie_features(movie_idx)
    
    # Batch compute similarities using optimized operations
    similarities = torch.nn.functional.cosine_similarity(
        query_vector.unsqueeze(0).unsqueeze(0),
        latent_matrix_gpu.unsqueeze(0)
    ).squeeze()
    
    # Get top recommendations using MPS-optimized topk
    _, similar_indices = similarities.topk(n_recommendations + 1)
    similar_indices = similar_indices[1:].cpu().numpy()
    
    # Cache the results
    cache.set(f"content_{movie_title}", similar_indices)
    
    recommendations = movies_with_tags.iloc[similar_indices][['movieId', 'title', 'genres']]                                                                      
    recommendations['title'] = recommendations['title'].str.ljust(50)                                                                                   
    return recommendations  

## 4. Collaborative Filtering

In [345]:
# Create user-movie matrix from training data
user_movie_matrix = train_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Perform SVD for collaborative filtering
svd_collab = TruncatedSVD(n_components=100)
latent_matrix_2 = svd_collab.fit_transform(user_movie_matrix)

# Convert matrices to GPU tensors once
latent_matrix_2_gpu = torch.tensor(latent_matrix_2, device=device, dtype=torch.float32)
components_gpu = torch.tensor(svd_collab.components_, device=device, dtype=torch.float32)

def collaborative_recommender(user_id, n_recommendations=10):
    # Get user's latent features
    user_idx = user_movie_matrix.index.get_loc(user_id)
    user_features = latent_matrix_2_gpu[user_idx]
    user_ratings = torch.tensor(user_movie_matrix.loc[user_id].values, device=device, dtype=torch.float32)
    
    # Calculate predicted ratings on GPU
    predicted_ratings = torch.matmul(user_features, components_gpu)
    
    # Create mask for unrated movies
    unrated_mask = (user_ratings == 0)
    
    # Set rated movies to negative infinity to exclude them
    predictions = predicted_ratings.clone()
    predictions[~unrated_mask] = float('-inf')
    
    # Get top recommendations
    _, indices = torch.topk(predictions, n_recommendations)
    top_movie_ids = user_movie_matrix.columns[indices.cpu().numpy()]
    
    # Get recommended movies
    recommendations = movies_df[movies_df['movieId'].isin(top_movie_ids)]
    return recommendations[['movieId', 'title', 'genres']]  

## 5. Matrix Factorization using Surprise

In [346]:
# Train SVD model on training data
reader = Reader(rating_scale=(0.5, 5))
trainset = Dataset.load_from_df(train_ratings[['userId', 'movieId', 'rating']], reader).build_full_trainset()
svd_model = SVD(n_factors=100, random_state=17)
svd_model.fit(trainset)

# Convert SVD matrices to GPU tensors once
svd_pu = torch.tensor(svd_model.pu, device=device, dtype=torch.float32)
svd_qi = torch.tensor(svd_model.qi, device=device, dtype=torch.float32)
svd_bu = torch.tensor(svd_model.bu, device=device, dtype=torch.float32)
svd_bi = torch.tensor(svd_model.bi, device=device, dtype=torch.float32)
svd_mu = torch.tensor([svd_model.trainset.global_mean], device=device, dtype=torch.float32)

def matrix_factorization_recommender(user_id, n_recommendations=10):
    # Get all movies that are in the trainset
    all_movies = set(svd_model.trainset._raw2inner_id_items.keys())
    
    # Get user's rated movies from training data
    rated_movies = train_ratings[train_ratings['userId'] == user_id]['movieId'].unique()
    
    # Get unrated movies that are in the trainset
    unrated_movies = np.array(list(all_movies - set(rated_movies)))
    
    # Get user index in SVD model
    user_inner_id = svd_model.trainset.to_inner_uid(user_id)
    
    # Get movie indices in SVD model (all are guaranteed to be in trainset now)
    movie_inner_ids = unrated_movies  # Already in inner ids from trainset
    
    # Convert to GPU tensors
    user_factors = svd_pu[user_inner_id]
    movie_factors = svd_qi[movie_inner_ids]
    user_bias = svd_bu[user_inner_id]
    movie_biases = svd_bi[movie_inner_ids]
    
    # Calculate predictions on GPU
    predictions = torch.matmul(user_factors, movie_factors.T) + user_bias + movie_biases + svd_mu
    
    # Get top recommendations
    _, indices = torch.topk(predictions, n_recommendations)
    recommended_movie_ids = [unrated_movies[idx] for idx in indices.cpu().numpy()]
    
    # Get recommended movies
    recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
    return recommended_movies[['movieId', 'title', 'genres']]

## 6. Hybrid Recommender

In [347]:
async def hybrid_recommender(user_id, movie_title, n_recommendations=10):
    # Get movie index for content-based
    movie_idx = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    query_vector = get_cached_movie_features(movie_idx)
    
    # Get user features for collaborative
    user_idx = user_movie_matrix.index.get_loc(user_id)
    user_features = latent_matrix_2_gpu[user_idx]
    user_ratings = torch.tensor(user_movie_matrix.loc[user_id].values, device=device, dtype=torch.float32)
    
    # Get all valid movie indices
    all_movies = set(svd_model.trainset._raw2inner_id_items.keys())
    rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    unrated_movies = np.array(list(all_movies - set(rated_movies)))
    
    # Map movie IDs to indices
    movie_id_to_idx = {mid: idx for idx, mid in enumerate(movies_df['movieId'])}
    movie_indices = np.array([movie_id_to_idx[mid] for mid in unrated_movies])
    
    # Get all movie features first
    all_movie_features = latent_matrix_gpu
    
    # Compute content-based similarities for all movies
    all_content_similarities = torch.nn.functional.cosine_similarity(
        query_vector.unsqueeze(0).unsqueeze(0),
        all_movie_features.unsqueeze(0)
    ).squeeze()
    
    # Compute collaborative filtering scores for all movies
    all_collab_predictions = torch.matmul(user_features, components_gpu)
    
    # Compute matrix factorization scores for all movies
    user_inner_id = svd_model.trainset.to_inner_uid(user_id)
    user_factors = svd_pu[user_inner_id]
    all_movie_factors = svd_qi
    all_movie_biases = svd_bi
    all_mf_predictions = torch.matmul(user_factors, all_movie_factors.T) + svd_bu[user_inner_id] + all_movie_biases + svd_mu
    
    # Extract scores for unrated movies only
    content_similarities = all_content_similarities[movie_indices]
    collab_predictions = all_collab_predictions[movie_indices]
    mf_predictions = all_mf_predictions[movie_indices]
    
    # Verify tensor sizes match
    print(f"Sizes - Content: {content_similarities.size()}, Collab: {collab_predictions.size()}, MF: {mf_predictions.size()}")
    
    # Normalize scores to [0,1] range for each set of predictions
    content_scores = (content_similarities - content_similarities.min()) / (content_similarities.max() - content_similarities.min())
    collab_scores = (collab_predictions - collab_predictions.min()) / (collab_predictions.max() - collab_predictions.min())
    mf_scores = (mf_predictions - mf_predictions.min()) / (mf_predictions.max() - mf_predictions.min())
    
    # Combine scores with weights
    combined_scores = content_scores * 0.3 + collab_scores * 0.4 + mf_scores * 0.3
    
    # Get top recommendations
    _, top_indices = combined_scores.topk(n_recommendations)
    top_indices = top_indices.cpu().numpy()
    
    # Get recommended movies
    recommended_movie_ids = unrated_movies[top_indices]
    recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)].copy()
    recommended_movies['scores'] = combined_scores[top_indices].cpu().numpy()
    recommended_movies['content_score'] = content_scores[top_indices].cpu().numpy()
    recommended_movies['collab_score'] = collab_scores[top_indices].cpu().numpy()
    recommended_movies['mf_score'] = mf_scores[top_indices].cpu().numpy()
    
    # Cache the results
    cache.set(f"hybrid_{user_id}_{movie_title}", recommended_movie_ids)
    
    return recommended_movies.sort_values('scores', ascending=False)

## 7. Example Usage

In [349]:
# Set pandas display options for wide tables
from tabulate import tabulate
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

def display_recommendations(df, show_index=False):
    print(tabulate(df, headers='keys', tablefmt='pipe', showindex=show_index))

# Example usage with recommendations
print("Popular Movies:")
display_recommendations(popularity_recommender())

print("\nContent-based Recommendations for 'Toy Story (1995)':")
display_recommendations(content_based_recommender('Toy Story (1995)'))

print("\nCollaborative Filtering Recommendations for user 1:")
display_recommendations(collaborative_recommender(1))

print("\nMatrix Factorization Recommendations for user 1:")
display_recommendations(matrix_factorization_recommender(1))

print("\nHybrid Recommendations for user 1 and 'Toy Story (1995)':")
result = await hybrid_recommender(1, 'Toy Story (1995)')
display_recommendations(result)

Popular Movies:
|   movieId | title                                                                          |   rating_mean |   rating_count | genres                                  |
|----------:|:-------------------------------------------------------------------------------|--------------:|---------------:|:----------------------------------------|
|       318 | Shawshank Redemption, The (1994)                                               |       4.42935 |            276 | Crime|Drama                             |
|      2959 | Fight Club (1999)                                                              |       4.32749 |            171 | Action|Crime|Drama|Thriller             |
|       858 | Godfather, The (1972)                                                          |       4.27673 |            159 | Crime|Drama                             |
|     58559 | Dark Knight, The (2008)                                                        |       4.25    |            132 | Action