# Movie Recommender Systems

This notebook implements various movie recommendation approaches:
1. Popularity-based
2. Content-based Filtering
3. Collaborative Filtering
4. Matrix Factorization
5. Hybrid Approach

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import torch
import warnings
warnings.filterwarnings('ignore')

# Device selection: tensors are placed on CUDA when PyTorch can see a GPU,
# otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 1. Data Loading and Preprocessing

In [18]:
# Load the raw tables from the `ml-latest-small` directory (paths are relative
# to the notebook's working directory).
movies_df = pd.read_csv('ml-latest-small/movies.csv')
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
tags_df = pd.read_csv('ml-latest-small/tags.csv')

# Collapse every tag applied to a movie into one space-separated string.
# Missing tags are dropped and the remaining ones cast to str first, because
# str.join raises TypeError on NaN / non-string values.
tags_grouped = (
    tags_df.dropna(subset=['tag'])
    .astype({'tag': str})
    .groupby('movieId')['tag']
    .agg(' '.join)
    .reset_index()
)

# Left-join so that movies without any tag are kept; they get an empty tag
# string instead of NaN so the text vectorizer can consume the column directly.
movies_with_tags = (
    movies_df.merge(tags_grouped, on='movieId', how='left')
    .fillna({'tag': ''})
)

## 2. Popularity-based Recommender

In [19]:
def popularity_recommender(n_recommendations=10, min_ratings=100):
    """Return the best-rated movies among those with enough ratings.

    Reads the module-level ``ratings_df`` and ``movies_df`` frames.

    Args:
        n_recommendations: Maximum number of movies to return.
        min_ratings: Minimum number of ratings a movie needs to be considered.
            Defaults to 100, the threshold that was previously hard-coded.

    Returns:
        DataFrame with columns ``title``, ``rating_mean`` and ``rating_count``,
        ordered by mean rating and then rating count, both descending. It is
        empty when no movie reaches ``min_ratings``.
    """
    # Per-movie rating count and mean rating.
    movie_stats = (
        ratings_df.groupby('movieId')['rating']
        .agg(rating_count='count', rating_mean='mean')
        .reset_index()
    )

    # A mean computed from only a handful of ratings is not trustworthy, so
    # movies below the threshold are discarded before ranking.
    popular_movies = movie_stats[movie_stats['rating_count'] >= min_ratings]
    popular_movies = popular_movies.sort_values(
        ['rating_mean', 'rating_count'], ascending=[False, False]
    )

    # The inner merge keeps the left (ranked) key order while attaching titles.
    recommendations = pd.merge(popular_movies, movies_df, on='movieId')

    return recommendations[['title', 'rating_mean', 'rating_count']].head(n_recommendations)

## 3. Content-based Filtering

In [20]:
# Turn each movie's tag string into a TF-IDF vector (English stop words removed).
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_with_tags['tag'])

# Compress the sparse TF-IDF matrix to 100 latent dimensions. TruncatedSVD's
# default solver is randomized, so it is seeded (same seed as the Surprise
# model in this notebook) to make the latent space, and therefore the
# recommendations, reproducible across kernel restarts.
svd = TruncatedSVD(n_components=100, random_state=42)
latent_matrix_1 = svd.fit_transform(tfidf_matrix)

# Move the latent matrix to the selected device once, so each query does not
# pay for the transfer again. Row i corresponds to row i of movies_with_tags.
latent_matrix_gpu = torch.tensor(latent_matrix_1, device=device, dtype=torch.float32)

def content_based_recommender(movie_title, n_recommendations=10):
    """Recommend the movies whose tag profile is most similar to a given movie.

    Similarity is the cosine similarity between rows of the module-level
    ``latent_matrix_gpu``; row ``i`` of that matrix is taken to describe row
    ``i`` of ``movies_with_tags``.

    Args:
        movie_title: Exact title of the query movie, as stored in
            ``movies_with_tags['title']``. When several rows share the title,
            the first one is used.
        n_recommendations: Maximum number of movies to return.

    Returns:
        DataFrame with columns ``title`` and ``genres``, most similar first.
        The query movie itself is never included.

    Raises:
        IndexError: If no movie has the given title.
    """
    # Work with the row *position* (not the index label), because the position
    # is what addresses both the latent matrix and the final ``iloc`` lookup.
    matches = np.flatnonzero((movies_with_tags['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_idx = int(matches[0])

    # (1, d) query against the (N, d) matrix, reduced over the feature axis,
    # yields one similarity score per movie: shape (N,).
    query_vector = latent_matrix_gpu[movie_idx].unsqueeze(0)
    similarities = torch.nn.functional.cosine_similarity(
        query_vector, latent_matrix_gpu, dim=1
    )

    # Exclude the query movie explicitly. Relying on it being ranked first is
    # unsafe: ties, or an all-zero vector (movie without tags), can put another
    # movie at the top.
    similarities[movie_idx] = float('-inf')

    # topk raises when k exceeds the number of candidates, so clamp it.
    k = min(max(int(n_recommendations), 0), similarities.numel() - 1)
    if k == 0:
        return movies_with_tags.iloc[0:0][['title', 'genres']]

    _, similar_indices = similarities.topk(k)
    return movies_with_tags.iloc[similar_indices.cpu().numpy()][['title', 'genres']]

## 4. Collaborative Filtering

In [21]:
# Dense user x movie rating matrix. Missing ratings are filled with 0, which
# collaborative_recommender later uses as its "not rated yet" marker.
user_movie_matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Factorize the rating matrix into 100 latent factors. TruncatedSVD's default
# solver is randomized, so it is seeded (same seed as the Surprise model in
# this notebook) to keep the factors, and the resulting recommendations,
# reproducible across kernel restarts.
svd_collab = TruncatedSVD(n_components=100, random_state=42)
latent_matrix_2 = svd_collab.fit_transform(user_movie_matrix)

# Move both factor matrices to the selected device once:
#   latent_matrix_2_gpu : one row of latent features per user
#   components_gpu      : one column of latent loadings per movie
latent_matrix_2_gpu = torch.tensor(latent_matrix_2, device=device, dtype=torch.float32)
components_gpu = torch.tensor(svd_collab.components_, device=device, dtype=torch.float32)

def collaborative_recommender(user_id, n_recommendations=10):
    """Recommend unrated movies with the highest SVD-reconstructed rating.

    Uses the module-level ``user_movie_matrix`` together with its latent
    factorization (``latent_matrix_2_gpu`` and ``components_gpu``).

    Args:
        user_id: A label present in ``user_movie_matrix.index``.
        n_recommendations: Maximum number of movies to return.

    Returns:
        DataFrame with columns ``title`` and ``genres``, ordered from the
        highest to the lowest predicted rating. It holds fewer rows than
        requested when the user has fewer unrated movies, or when a movie id
        is missing from ``movies_df``.

    Raises:
        KeyError: If ``user_id`` is not in ``user_movie_matrix.index``.
    """
    # Latent features and raw ratings of the requested user.
    user_idx = user_movie_matrix.index.get_loc(user_id)
    user_features = latent_matrix_2_gpu[user_idx]
    user_ratings = torch.tensor(user_movie_matrix.loc[user_id].values, device=device)

    # Reconstructed score for every movie column.
    predicted_ratings = torch.matmul(user_features, components_gpu)

    # A stored rating of 0 means "not rated"; everything else is already seen
    # and is pushed to -inf so it can never be selected.
    already_rated = user_ratings != 0
    predictions = predicted_ratings.clone()
    predictions[already_rated] = float('-inf')

    # Never ask topk for more items than there are unrated movies: that would
    # either raise or hand back the -inf (already rated) entries.
    n_unrated = int((~already_rated).sum().item())
    k = min(max(int(n_recommendations), 0), n_unrated)
    if k == 0:
        return movies_df.iloc[0:0][['title', 'genres']]

    _, indices = torch.topk(predictions, k)
    top_movie_ids = user_movie_matrix.columns[indices.cpu().numpy()]

    # ``isin`` returns rows in movies_df order, so restore the ranking order
    # by sorting on each movie's position inside ``top_movie_ids``.
    recommended_movies = movies_df[movies_df['movieId'].isin(top_movie_ids)]
    rank = top_movie_ids.get_indexer(recommended_movies['movieId'])
    recommended_movies = recommended_movies.iloc[np.argsort(rank, kind='stable')]
    return recommended_movies[['title', 'genres']]

## 5. Matrix Factorization using Surprise

In [22]:
# Wrap the (user, movie, rating) triples in a Surprise dataset; the reader
# declares the rating scale as 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(
    ratings_df.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)

# Hold out a quarter of the ratings; the split is seeded.
trainset, testset = train_test_split(
    data,
    test_size=0.25,
    random_state=42,
)

# Fit a seeded 100-factor SVD model on the training portion only.
svd_model = SVD(
    n_factors=100,
    random_state=42,
)
svd_model.fit(trainset)

def matrix_factorization_recommender(user_id, n_recommendations=10):
    """Recommend unrated movies with the highest rating predicted by ``svd_model``.

    Every movie in ``movies_df`` that the user has no entry for in
    ``ratings_df`` is scored with ``svd_model.predict``.

    Args:
        user_id: Raw user id, as it appears in ``ratings_df['userId']``.
        n_recommendations: Maximum number of movies to return.

    Returns:
        DataFrame with columns ``title`` and ``genres``, ordered from the
        highest to the lowest predicted rating.
    """
    # Candidates are all known movies minus the ones the user already rated.
    all_movies = movies_df['movieId'].unique()
    rated_movies = ratings_df.loc[ratings_df['userId'] == user_id, 'movieId'].unique()
    unrated_movies = np.setdiff1d(all_movies, rated_movies)

    # Score every candidate and keep the best ``n_recommendations``.
    predictions = [svd_model.predict(user_id, movie_id) for movie_id in unrated_movies]
    top_predictions = sorted(
        predictions, key=lambda pred: pred.est, reverse=True
    )[:n_recommendations]
    ranked_ids = pd.Index([pred.iid for pred in top_predictions])

    # ``isin`` returns rows in movies_df order, so restore the ranking order
    # by sorting on each movie's position inside ``ranked_ids``.
    recommended_movies = movies_df[movies_df['movieId'].isin(ranked_ids)]
    rank = ranked_ids.get_indexer(recommended_movies['movieId'])
    recommended_movies = recommended_movies.iloc[np.argsort(rank, kind='stable')]

    return recommended_movies[['title', 'genres']]

## 6. Hybrid Recommender

In [23]:
def hybrid_recommender(user_id, movie_title, n_recommendations=10):
    """Combine the content, collaborative and matrix-factorization recommenders.

    Each recommender is asked for ``n_recommendations`` movies; the results
    are pooled and ranked by how many different recommenders proposed them.

    Args:
        user_id: User passed to the collaborative and matrix-factorization
            recommenders.
        movie_title: Seed movie passed to the content-based recommender.
        n_recommendations: Number of movies requested from each recommender
            and maximum number of rows returned.

    Returns:
        DataFrame with columns ``title``, ``genres``, ``method`` (a
        comma-separated list of the recommenders that proposed the movie) and
        ``recommendation_strength`` (the number of distinct recommenders,
        1 to 3), sorted by strength descending. Ties are ordered
        alphabetically by title.
    """
    # Collect the candidates of every method.
    content_recs = content_based_recommender(movie_title, n_recommendations)
    collab_recs = collaborative_recommender(user_id, n_recommendations)
    mf_recs = matrix_factorization_recommender(user_id, n_recommendations)

    # Pool them, tagging each row with the method that produced it.
    all_recs = pd.concat([
        content_recs.assign(method='content'),
        collab_recs.assign(method='collaborative'),
        mf_recs.assign(method='matrix_factorization')
    ])

    # One row per title. dict.fromkeys de-duplicates while keeping order, so a
    # title that a single method returns twice is not counted twice.
    final_recs = all_recs.groupby('title').agg({
        'genres': 'first',
        'method': lambda methods: ', '.join(dict.fromkeys(methods))
    }).reset_index()

    # Number of methods = number of separators + 1, so a movie proposed by a
    # single method has strength 1 rather than 0.
    final_recs['recommendation_strength'] = final_recs['method'].str.count(',') + 1

    # A stable sort keeps ties in a deterministic order (the alphabetical
    # title order produced by groupby).
    ranked = final_recs.sort_values(
        'recommendation_strength', ascending=False, kind='mergesort'
    )
    return ranked.head(n_recommendations)

## 7. Example Usage

In [None]:
# Demo run: call every recommender once and print its result under a heading.
# Each call is wrapped in a lambda so the heading is printed before the
# corresponding recommender is executed.
for heading, run in (
    ("Popular Movies:",
     lambda: popularity_recommender()),
    ("\nContent-based Recommendations for 'Toy Story (1995)':",
     lambda: content_based_recommender('Toy Story (1995)')),
    ("\nCollaborative Filtering Recommendations for user 1:",
     lambda: collaborative_recommender(1)),
    ("\nMatrix Factorization Recommendations for user 1:",
     lambda: matrix_factorization_recommender(1)),
    ("\nHybrid Recommendations for user 1 and 'Toy Story (1995)':",
     lambda: hybrid_recommender(1, 'Toy Story (1995)')),
):
    print(heading)
    print(run())