# Movie Recommender Systems

This notebook implements various movie recommendation approaches:
1. Popularity-based
2. Content-based Filtering
3. Collaborative Filtering
4. Matrix Factorization
5. Hybrid Approach

In [114]:
# Import required libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader, SVD


import warnings
warnings.filterwarnings('ignore')

import asyncio
from functools import lru_cache
from tqdm import tqdm
import gc
from scipy import sparse
import h5py
import joblib
import psutil
from pathlib import Path
import torch


In [115]:
# Setup device and optimizations for Apple Silicon
device = torch.device('mps')
torch.backends.mps.enable_fallback_kernels = True
print(f"Using Apple Metal device: {device}")

# Advanced caching mechanism
class RecommendationCache:
    def __init__(self, max_size=1000):
        self.cache = {}
        self.max_size = max_size
    
    @lru_cache(maxsize=1000)
    def get_movie_features(self, movie_idx):
        return self.latent_matrix_gpu[movie_idx]
    
    def get(self, key):
        return self.cache.get(key)
    
    def set(self, key, value):
        if len(self.cache) >= self.max_size:
            self.cache.pop(next(iter(self.cache)))
        self.cache[key] = value

cache = RecommendationCache()

Using Apple Metal device: mps


## 1. Data Loading and Preprocessing

In [116]:
# Calculate chunk size based on available memory
def get_optimal_chunk_size():
    available_mem = psutil.virtual_memory().available
    # Use 20% of available memory for chunk size
    return int((available_mem * 0.2) / (8 * 1024))

# Read datasets in chunks with disk caching
def read_chunked_csv(filename, chunksize=None):
    cache_file = Path(f".cache_{Path(filename).stem}.joblib")
    if cache_file.exists():
        return joblib.load(cache_file)
    
    chunksize = chunksize or get_optimal_chunk_size()
    chunks = []
    total_rows = sum(1 for _ in open(filename)) - 1  # Subtract header
    
    with tqdm(total=total_rows, desc=f"Loading {filename}") as pbar:
        for chunk in pd.read_csv(filename, chunksize=chunksize):
            chunks.append(chunk)
            pbar.update(len(chunk))
    
    result = pd.concat(chunks)
    joblib.dump(result, cache_file)
    return result

print("Loading movies...")
movies_df = pd.read_csv('ml-latest-small/movies.csv')

print("Loading ratings...")
ratings_df = read_chunked_csv('ml-latest-small/ratings.csv')

print("Loading tags...")
tags_df = read_chunked_csv('ml-latest-small/tags.csv')

# Process tags in chunks
print("Processing tags...")
tags_grouped = tags_df.groupby('movieId')['tag'].apply(
    lambda x: ' '.join(x[:1000] if len(x) > 1000 else x)
).reset_index()

# Merge and clean up
movies_with_tags = pd.merge(movies_df, tags_grouped, on='movieId', how='left')
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

# Clean up memory
del tags_df, tags_grouped
gc.collect()

Loading movies...
Loading ratings...
Loading tags...
Processing tags...


23

## 2. Popularity-based Recommender

In [117]:
def popularity_recommender(n_recommendations=10):
    # Calculate mean rating and number of ratings for each movie
    movie_stats = ratings_df.groupby('movieId').agg({
        'rating': ['count', 'mean']
    }).reset_index()
    
    # Rename columns
    movie_stats.columns = ['movieId', 'rating_count', 'rating_mean']
    
    # Filter movies with minimum number of ratings (e.g., 100)
    popular_movies = movie_stats[movie_stats['rating_count'] >= 100]
    
    # Sort by rating mean and count
    popular_movies = popular_movies.sort_values(['rating_mean', 'rating_count'], ascending=[False, False])
    
    # Get movie titles
    recommendations = pd.merge(popular_movies, movies_df, on='movieId')
    
    return recommendations[['movieId', 'title', 'rating_mean', 'rating_count', 'genres']].head(n_recommendations) 

## 3. Content-based Filtering

In [118]:
def process_tfidf_in_batches(texts, batch_size=1000):
    cache_file = Path(".cache_tfidf.h5")
    if cache_file.exists():
        with h5py.File(cache_file, 'r') as f:
            return torch.tensor(f['latent_matrix'][()], device=device)
    
    print("Creating TF-IDF vectors...")
    tfidf = TfidfVectorizer(stop_words='english')
    
    # First pass to fit vocabulary
    print("Fitting TF-IDF vocabulary...")
    tfidf.fit(texts)
    
    # Process in batches
    n_batches = (len(texts) + batch_size - 1) // batch_size
    latent_matrices = []
    
    for i in tqdm(range(n_batches), desc="Processing TF-IDF batches"):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(texts))
        batch_texts = texts[start_idx:end_idx]
        
        # Transform batch and keep sparse
        batch_tfidf = tfidf.transform(batch_texts)
        
        # Compute SVD for batch with explicit float32 dtype for MPS compatibility
        batch_U, batch_S, batch_V = torch.svd(
            torch.tensor(batch_tfidf.toarray(), device=device, dtype=torch.float32)
        )
        batch_latent = (batch_U[:, :100] @ torch.diag(batch_S[:100]))
        latent_matrices.append(batch_latent.cpu().numpy())
    
    # Combine results
    latent_matrix = np.vstack(latent_matrices)
    
    # Cache results
    with h5py.File(cache_file, 'w') as f:
        f.create_dataset('latent_matrix', data=latent_matrix)
    
    return torch.tensor(latent_matrix, device=device)

print("Processing TF-IDF in batches...")
latent_matrix_gpu = process_tfidf_in_batches(movies_with_tags['tag'])

# Cache for frequently accessed movie features
@lru_cache(maxsize=1000)
def get_cached_movie_features(movie_idx):
    return latent_matrix_gpu[movie_idx]

def content_based_recommender(movie_title, n_recommendations=10):
    # Get movie index and features from cache
    movie_idx = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    query_vector = get_cached_movie_features(movie_idx)
    
    # Batch compute similarities using optimized operations
    similarities = torch.nn.functional.cosine_similarity(
        query_vector.unsqueeze(0).unsqueeze(0),
        latent_matrix_gpu.unsqueeze(0)
    ).squeeze()
    
    # Get top recommendations using MPS-optimized topk
    _, similar_indices = similarities.topk(n_recommendations + 1)
    similar_indices = similar_indices[1:].cpu().numpy()
    
    # Cache the results
    cache.set(f"content_{movie_title}", similar_indices)
    
    recommendations = movies_with_tags.iloc[similar_indices][['movieId', 'title', 'genres']]                                                                      
    recommendations['title'] = recommendations['title'].str.ljust(50)                                                                                   
    return recommendations  

Processing TF-IDF in batches...


## 4. Collaborative Filtering

In [119]:
def create_sparse_matrix(ratings_df, batch_size=100000):
    print("Creating sparse user-movie matrix...")
    cache_file = Path(".cache_collab.npz")
    if cache_file.exists():
        return sparse.load_npz(cache_file)
    
    rows, cols, data = [], [], []
    user_map = {}
    movie_map = {}
    
    # Process in batches
    total_rows = len(ratings_df)
    with tqdm(total=total_rows, desc="Building sparse matrix") as pbar:
        for start in range(0, total_rows, batch_size):
            batch = ratings_df.iloc[start:start + batch_size]
            
            for _, row in batch.iterrows():
                if row['userId'] not in user_map:
                    user_map[row['userId']] = len(user_map)
                if row['movieId'] not in movie_map:
                    movie_map[row['movieId']] = len(movie_map)
                
                rows.append(user_map[row['userId']])
                cols.append(movie_map[row['movieId']])
                data.append(row['rating'])
                
            pbar.update(len(batch))
    
    matrix = sparse.csr_matrix((data, (rows, cols)), 
                              shape=(len(user_map), len(movie_map)))
    
    # Cache the results
    sparse.save_npz(cache_file, matrix)
    joblib.dump((user_map, movie_map), ".cache_collab_maps.joblib")
    
    return matrix, user_map, movie_map

print("Creating user-movie matrix...")
user_movie_matrix, user_map, movie_map = create_sparse_matrix(ratings_df)

print("Performing collaborative filtering SVD...")
with tqdm(total=1, desc="SVD Progress") as pbar:
    svd_collab = TruncatedSVD(n_components=100)
    latent_matrix_2 = svd_collab.fit_transform(user_movie_matrix)
    pbar.update(1)

# Convert matrices to GPU tensors once
latent_matrix_2_gpu = torch.tensor(latent_matrix_2, device=device, dtype=torch.float32)
components_gpu = torch.tensor(svd_collab.components_, device=device, dtype=torch.float32)

@lru_cache(maxsize=1000)
def get_user_predictions(user_id, batch_size=10000):
    # Get user's latent features
    user_idx = user_map[user_id]
    user_features = latent_matrix_2_gpu[user_idx]
    
    # Get user's rated movies for masking
    rated_movies = set(ratings_df[ratings_df['userId'] == user_id]['movieId'])
    
    # Process predictions in batches
    all_predictions = []
    n_movies = len(movie_ids)
    
    for start_idx in range(0, n_movies, batch_size):
        end_idx = min(start_idx + batch_size, n_movies)
        batch_components = components_gpu[:, start_idx:end_idx]
        
        # Calculate batch predictions
        batch_predictions = torch.matmul(user_features, batch_components)
        
        # Mask rated movies in this batch
        batch_movie_ids = movie_ids[start_idx:end_idx]
        mask = torch.tensor([mid not in rated_movies for mid in batch_movie_ids], 
                          device=device, dtype=torch.bool)
        
        batch_predictions = torch.where(mask, batch_predictions, 
                                       torch.tensor(float('-inf'), device=device))
        all_predictions.append(batch_predictions)
    
    return torch.cat(all_predictions)

def collaborative_recommender(user_id, n_recommendations=10):
    # Get cached predictions
    predictions = get_user_predictions(user_id)
    
    # Get top recommendations
    _, indices = torch.topk(predictions, n_recommendations)
    top_movie_ids = [movie_ids[i] for i in indices.cpu().numpy()]
    
    # Get recommended movies
    recommendations = movies_df[movies_df['movieId'].isin(top_movie_ids)]
    return recommendations[['movieId', 'title', 'genres']]

Creating user-movie matrix...
Creating sparse user-movie matrix...


ValueError: too many values to unpack (expected 3)

## 5. Matrix Factorization using Surprise

In [87]:
print("Training SVD model...")
reader = Reader(rating_scale=(0.5, 5))
trainset = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader).build_full_trainset()
svd_model = SVD(n_factors=100, random_state=17)
with tqdm(total=1, desc="SVD Training") as pbar:
    svd_model.fit(trainset)
    pbar.update(1)

# Convert SVD matrices to GPU tensors once
svd_pu = torch.tensor(svd_model.pu, device=device, dtype=torch.float32)
svd_qi = torch.tensor(svd_model.qi, device=device, dtype=torch.float32)
svd_bu = torch.tensor(svd_model.bu, device=device, dtype=torch.float32)
svd_bi = torch.tensor(svd_model.bi, device=device, dtype=torch.float32)
svd_mu = torch.tensor([svd_model.trainset.global_mean], device=device, dtype=torch.float32)

def matrix_factorization_recommender(user_id, n_recommendations=10, batch_size=10000):
    # Get all movies that are in the trainset
    all_movies = set(svd_model.trainset._raw2inner_id_items.keys())
    
    # Get user's rated movies
    rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    
    # Get unrated movies that are in the trainset
    unrated_movies = np.array(list(all_movies - set(rated_movies)))
    
    # Get user index in SVD model
    user_inner_id = svd_model.trainset.to_inner_uid(user_id)
    
    # Get user factors and bias
    user_factors = svd_pu[user_inner_id]
    user_bias = svd_bu[user_inner_id]
    
    # Process movies in batches
    all_predictions = []
    for i in range(0, len(unrated_movies), batch_size):
        batch_movies = unrated_movies[i:i + batch_size]
        
        # Get batch factors and biases
        movie_factors = svd_qi[batch_movies]
        movie_biases = svd_bi[batch_movies]
        
        # Calculate predictions for batch
        batch_predictions = torch.matmul(user_factors, movie_factors.T) + user_bias + movie_biases + svd_mu
        all_predictions.append(batch_predictions)
    
    # Combine all predictions
    predictions = torch.cat(all_predictions)
    
    # Get top recommendations
    _, indices = torch.topk(predictions, n_recommendations)
    recommended_movie_ids = [unrated_movies[idx] for idx in indices.cpu().numpy()]
    
    # Get recommended movies
    recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]
    return recommended_movies[['movieId', 'title', 'genres']]

Training SVD model...


SVD Training: 100%|██████████| 1/1 [00:00<00:00,  3.22it/s]


## 6. Hybrid Recommender

In [88]:
async def compute_content_scores(movie_idx, all_movie_features):
    query_vector = get_cached_movie_features(movie_idx)
    return torch.nn.functional.cosine_similarity(
        query_vector.unsqueeze(0).unsqueeze(0),
        all_movie_features.unsqueeze(0)
    ).squeeze()

async def compute_collab_scores(user_features, components_gpu):
    return torch.matmul(user_features, components_gpu)

async def compute_mf_scores(user_inner_id, user_factors, all_movie_factors, all_movie_biases):
    return torch.matmul(user_factors, all_movie_factors.T) + svd_bu[user_inner_id] + all_movie_biases + svd_mu

async def hybrid_recommender(user_id, movie_title, n_recommendations=10):
    # Get movie index for content-based
    movie_idx = movies_with_tags[movies_with_tags['title'] == movie_title].index[0]
    
    # Get user features for collaborative
    user_idx = user_movie_matrix.index.get_loc(user_id)
    user_features = latent_matrix_2_gpu[user_idx]
    user_ratings = torch.tensor(user_movie_matrix.loc[user_id].values, device=device, dtype=torch.float32)
    
    # Get all valid movie indices
    all_movies = set(svd_model.trainset._raw2inner_id_items.keys())
    rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    unrated_movies = np.array(list(all_movies - set(rated_movies)))
    
    # Map movie IDs to indices
    movie_id_to_idx = {mid: idx for idx, mid in enumerate(movies_df['movieId'])}
    movie_indices = np.array([movie_id_to_idx[mid] for mid in unrated_movies])
    
    # Get all movie features first
    all_movie_features = latent_matrix_gpu
    
    # Run all predictions concurrently
    user_inner_id = svd_model.trainset.to_inner_uid(user_id)
    user_factors = svd_pu[user_inner_id]
    all_movie_factors = svd_qi
    all_movie_biases = svd_bi
    
    content_task = compute_content_scores(movie_idx, all_movie_features)
    collab_task = compute_collab_scores(user_features, components_gpu)
    mf_task = compute_mf_scores(user_inner_id, user_factors, all_movie_factors, all_movie_biases)
    
    # Wait for all tasks to complete
    all_content_similarities, all_collab_predictions, all_mf_predictions = await asyncio.gather(
        content_task, collab_task, mf_task
    )
    
    # Extract scores for unrated movies only
    content_similarities = all_content_similarities[movie_indices]
    collab_predictions = all_collab_predictions[movie_indices]
    mf_predictions = all_mf_predictions[movie_indices]
    
    # Verify tensor sizes match
    print(f"Sizes - Content: {content_similarities.size()}, Collab: {collab_predictions.size()}, MF: {mf_predictions.size()}")
    
    # Normalize scores to [0,1] range for each set of predictions
    content_scores = (content_similarities - content_similarities.min()) / (content_similarities.max() - content_similarities.min())
    collab_scores = (collab_predictions - collab_predictions.min()) / (collab_predictions.max() - collab_predictions.min())
    mf_scores = (mf_predictions - mf_predictions.min()) / (mf_predictions.max() - mf_predictions.min())
    
    # Combine scores with weights
    combined_scores = content_scores * 0.3 + collab_scores * 0.4 + mf_scores * 0.3
    
    # Get top recommendations
    _, top_indices = combined_scores.topk(n_recommendations)
    top_indices = top_indices.cpu().numpy()
    
    # Get recommended movies
    recommended_movie_ids = unrated_movies[top_indices]
    recommended_movies = movies_df[movies_df['movieId'].isin(recommended_movie_ids)].copy()
    recommended_movies['scores'] = combined_scores[top_indices].cpu().numpy()
    recommended_movies['content_score'] = content_scores[top_indices].cpu().numpy()
    recommended_movies['collab_score'] = collab_scores[top_indices].cpu().numpy()
    recommended_movies['mf_score'] = mf_scores[top_indices].cpu().numpy()
    
    # Cache the results
    cache.set(f"hybrid_{user_id}_{movie_title}", recommended_movie_ids)
    
    return recommended_movies.sort_values('scores', ascending=False)

## 7. Evaluation Metrics

The following metrics show predicted ratings for recommended movies to help evaluate recommendation quality.

In [89]:
# Set pandas display options for wide tables
from tabulate import tabulate
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

def predict_ratings(user_id, movie_ids):
    """Predict ratings for given user and movies using SVD model"""
    predictions = []
    for movie_id in movie_ids:
        try:
            pred = svd_model.predict(user_id, movie_id).est
            predictions.append(round(pred, 2))
        except:
            predictions.append(None)
    return predictions

def display_recommendations(df, show_index=False, user_id=None):
    """Display recommendations with predicted ratings if user_id is provided"""
    if user_id is not None and 'movieId' in df.columns:
        df = df.copy()
        df['predicted_rating'] = predict_ratings(user_id, df['movieId'])
    print(tabulate(df, headers='keys', tablefmt='pipe', showindex=show_index))

# Example usage with recommendations and predicted ratings
user_id = 1
movie_title = 'Toy Story (1995)'

print("Popular Movies:")
display_recommendations(popularity_recommender(), user_id=user_id)

print("\nContent-based Recommendations for 'Toy Story (1995)':")
display_recommendations(content_based_recommender(movie_title), user_id=user_id)

print("\nCollaborative Filtering Recommendations for user 1:")
display_recommendations(collaborative_recommender(user_id), user_id=user_id)

print("\nMatrix Factorization Recommendations for user 1:")
display_recommendations(matrix_factorization_recommender(user_id), user_id=user_id)

print("\nHybrid Recommendations for user 1 and 'Toy Story (1995)':")
result = asyncio.run(hybrid_recommender(user_id, movie_title))
display_recommendations(result, user_id=user_id)

Popular Movies:
|   movieId | title                            |   rating_mean |   rating_count | genres                                  |   predicted_rating |
|----------:|:---------------------------------|--------------:|---------------:|:----------------------------------------|-------------------:|
|       318 | Shawshank Redemption, The (1994) |       4.42902 |            317 | Crime|Drama                             |               5    |
|       858 | Godfather, The (1972)            |       4.28906 |            192 | Crime|Drama                             |               5    |
|      2959 | Fight Club (1999)                |       4.27294 |            218 | Action|Crime|Drama|Thriller             |               4.93 |
|      1221 | Godfather: Part II, The (1974)   |       4.25969 |            129 | Crime|Drama                             |               4.97 |
|     48516 | Departed, The (2006)             |       4.25234 |            107 | Crime|Drama|Thriller            

AttributeError: 'csr_matrix' object has no attribute 'columns'