In [440]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Load cleaned ratings data
ratings = pd.read_csv('../data/cleaned_remapped_ratings.csv')
movies = pd.read_csv('../data/cleaned_remapped_movies.csv')

print("Ratings Shape:", ratings.shape)
print("Movies Shape:", movies.shape)


Ratings Shape: (31842705, 4)
Movies Shape: (87382, 6)


In [446]:
# Generate the User-Item Interaction Matrix

# Create a sparse user-item interaction matrix
user_movie_matrix = csr_matrix(
    (ratings['rating'], (ratings['userId'] - 1, ratings['movieId'] - ratings['movieId'].min()))
)
print(user_movie_matrix.shape)

(200948, 31961)


In [456]:
# Apply Truncated SVD
n_components = 50  # Adjust based on dataset and memory constraints
svd = TruncatedSVD(n_components=n_components, random_state=42)
user_factors = svd.fit_transform(user_movie_matrix)  # User latent factors
movie_factors = svd.components_.T  # Movie latent factors

print("::::::::::: Movie factors shape => ", movie_factors.shape)
print("::::::::::: User factors shape => ", user_factors.shape)

# Save the user and movie factors for reuse
with open("../models/user_factors.pkl", "wb") as f:
    pickle.dump(user_factors, f)
with open("../models/movie_factors.pkl", "wb") as f:
    pickle.dump(movie_factors, f)

# Create a Series to map movie titles to their indices
movie_indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
print("::::::::::: Movie indices length => ", len(movie_indices))

# print(movie_indices.head(10))
# Save the movie indices mapping
with open('../models/movie_indices.pkl', 'wb') as f:
    pickle.dump(movie_indices, f)

print(":::::: PKL files saved !!!")

::::::::::: Movie factors shape =>  (31961, 50)
::::::::::: User factors shape =>  (200948, 50)
::::::::::: Movie indices length =>  87382
Saved remapped movies indices !!!
:::::: PKL files saved !!!


In [532]:
# Implement the Collaborative Recommendation Function
def recommend_collaborative(user_id, num_recommendations=5):
    """
    Recommend movies for a user based on collaborative filtering.

    Args:
        user_id (int): The ID of the user for whom to recommend.
        num_recommendations (int): The number of recommendations to return.

    Returns:
        list: Recommended movie titles.
    """
    # Load user and movie factors
    with open("../models/user_factors.pkl", "rb") as f:
        user_factors = pickle.load(f)
    with open("../models/movie_factors.pkl", "rb") as f:
        movie_factors = pickle.load(f)

    # Validate user_id
    if user_id is None:
        raise ValueError("Invalid user_id: user_id cannot be empty")
    if user_id < 1 or user_id > user_factors.shape[0] or user_id is None:
        raise ValueError(f"Invalid user_id: {user_id}. Must be between 1 and {user_factors.shape[0]}.")

    # Retrieve the user's latent factor vector
    user_vector = user_factors[user_id - 1]

    # Compute similarity scores for all movies
    scores = np.dot(movie_factors, user_vector)

    # Get indices of the top movie scores
    recommended_movie_indices = np.argsort(scores)[::-1][:num_recommendations]

    # Map indices to movie titles
    recommended_titles = movies.loc[movies.index.isin(recommended_movie_indices), 'title'].tolist()
    return recommended_titles

In [590]:
def recommend_content_based(title, num_recommendations=10, h5_file='../models/cosine_sim.h5', min_similarity=0.5):
    """
        Adding a min_similarity threshhold dramatically improves this content-based recommendation especially when used
        along in a hybrid recommendation system
    """
    
    # Normalize the movie title
    normalized_title = title.lower().strip()
    
    # Get the index of the input movie
    if normalized_title not in movie_indices:
        raise ValueError(f"Movie '{title}' not found in the dataset.")
    idx = movie_indices[normalized_title]
    
    # Open the HDF5 file and retrieve the relevant row (cosine similarity scores)
    # Instead of loading the entire cosine_sim matrix into memory, retrieve only the relevant row using the index idx.
    with h5py.File(h5_file, 'r') as f:
        sim_scores = f['cosine_sim'][idx]  # Retrieve the row corresponding to the movie

    # Process the similarity scores to get recommendations
    sim_scores = list(enumerate(sim_scores))

    # Filter by similarity threshold
    sim_scores = [(i, score) for i, score in sim_scores if score >= min_similarity]
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num_recommendations+1]  # Skip the movie itself
    recommended_indices = [i[0] for i in sim_scores]
    
    # Retrieve recommended movie titles and convert to title case
    recommended_titles = movies['title'].iloc[recommended_indices].str.title()
    return recommended_titles

In [592]:
def recommend_hybrid(title, user_id=None, content_weight=0.5, collab_weight=0.5, num_recommendations=10, num_content_rec = 100):
    """
    Hybrid recommendation combining content-based and collaborative filtering.
    
    Args:
        title (str): Input movie title for content-based recommendations.
        user_id (int): User ID for collaborative filtering recommendations.
        content_weight (float): Weight for content-based recommendations.
        collab_weight (float): Weight for collaborative filtering recommendations.
        num_recommendations (int): Number of recommendations to return.
    
    Returns:
        list: Recommended movie titles.
    """
    # Validate weights
    if content_weight + collab_weight != 1.0:
        raise ValueError("Content weight and collaboration weight must sum to 1.0.")
        
    # Content-based recommendations
    content_recommendations = recommend_content_based(title, num_recommendations=num_content_rec)

    # Collaborative recommendations
    if user_id is None:
        collaborative_recommendations = []
    else:
        collaborative_recommendations = recommend_collaborative(user_id, num_recommendations=100)

    # Combine scores (assume both return lists of movie titles)
    combined_recommendations = (
        content_weight * pd.Series(content_recommendations).value_counts(normalize=True) +
        collab_weight * pd.Series(collaborative_recommendations).value_counts(normalize=True)
    ).sort_values(ascending=False)

    # Return top-N recommendations
    return combined_recommendations.head(num_recommendations).index.tolist()

In [610]:
recommendations = recommend_hybrid(
    title="Toy Story (1995)",
    user_id=None,  # No user ID
    content_weight=1.0,
    collab_weight=0.0,
    num_recommendations=10, 
    num_content_rec = 100
)
print(recommendations)
print("\n\n")

recommendations = recommend_hybrid(
    title="Toy Story (1995)",
    user_id=None,  # No user ID
    content_weight=0.5,
    collab_weight=0.5,
    num_recommendations=10, 
    num_content_rec = 100
)
print(recommendations)
print("\n\n")

recommendations = recommend_hybrid(
    title="Toy Story (1995)",
    user_id=1,  # No user ID
    content_weight=0.5,
    collab_weight=0.5,
    num_recommendations=10, 
    num_content_rec = 100
)
print(recommendations)
print("\n\n")

recommendations = recommend_hybrid(
    title="Toy Story (1995)",
    user_id=2,  # No user ID
    content_weight=0.3,
    collab_weight=0.7,
    num_recommendations=10, 
    num_content_rec = 100
)
print(recommendations)
print("\n\n")

['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)', 'Toy Story Toons: Hawaiian Vacation (2011)', 'Toy Story Toons: Small Fry (2011)', 'Toy, The (1982)']



['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)', 'Toy Story Toons: Hawaiian Vacation (2011)', 'Toy Story Toons: Small Fry (2011)', 'Toy, The (1982)']



['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)', 'Toy Story Toons: Hawaiian Vacation (2011)', 'Toy Story Toons: Small Fry (2011)', 'Toy, The (1982)', 'across the sea of time (1995)', 'american werewolf in london, an (1981)']



['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)', 'Toy Story Toons: Hawaiian Vacation (2011)', 'Toy Story Toons: Small Fry (201

In [612]:
# Test recommend_collaborative
# Test valid user ID
try:
    recommendations = recommend_collaborative(user_id=1, num_recommendations=5)
    print(f"Recommendations for User 1: {recommendations}")
except Exception as e:
    print(f"Error for User 1: {e}")
print("\n\n")

# Test user at upper bound
try:
    max_user_id = user_factors.shape[0]
    recommendations = recommend_collaborative(user_id=max_user_id, num_recommendations=5)
    print(f"Recommendations for User {max_user_id}: {recommendations}")
except Exception as e:
    print(f"Error for User {max_user_id}: {e}")
print("\n\n")

# Test user beyond upper bound
try:
    max_user_id = user_factors.shape[0]
    recommendations = recommend_collaborative(user_id=max_user_id + 1, num_recommendations=5)
    print(f"Recommendations for User {max_user_id}: {recommendations}")
except Exception as e:
    print(f"Error for User {max_user_id}: {e}")
print("\n\n")

# Test invalid user ID
try:
    recommendations = recommend_collaborative(user_id=-1, num_recommendations=5)
    print(f"Recommendations for User -1: {recommendations}")
except Exception as e:
    print(f"Expected error for User -1: {e}")

Recommendations for User 1: ['sudden death (1995)', 'casino (1995)', 'powder (1995)', 'big green, the (1995)', 'eye for an eye (1996)']



Recommendations for User 200948: ['boys on the side (1995)', 'man of the house (1995)', 'beverly hills cop iii (1994)', 'inkwell, the (1994)', 'robocop 3 (1993)']



Error for User 200948: Invalid user_id: 200949. Must be between 1 and 200948.



Expected error for User -1: Invalid user_id: -1. Must be between 1 and 200948.


In [614]:
# # Test recommend_hybrid
# # Test valid title and user
try:
    recommendations = recommend_hybrid(
        title="Toy Story (1995)",
        user_id=1,
        content_weight=0.5,
        collab_weight=0.5,
        num_recommendations=5
    )
    print(f"Hybrid Recommendations for 'Toy Story (1995)', User 1: {recommendations}")
except Exception as e:
    print(f"Error for 'Toy Story (1995)', User 1: {e}")
print("\n\n")

# Test with invalid title
try:
    recommendations = recommend_hybrid(
        title="Nonexistent Movie",
        user_id=1,
        content_weight=0.5,
        collab_weight=0.5,
        num_recommendations=5
    )
    print(f"Recommendations for 'Nonexistent Movie', User 1: {recommendations}")
except Exception as e:
    print(f"Expected error for 'Nonexistent Movie': {e}")
print("\n\n")

# Test with invalid user ID
try:
    recommendations = recommend_hybrid(
        title="Toy Story (1995)",
        user_id=-1,
        content_weight=0.5,
        collab_weight=0.5,
        num_recommendations=5
    )
    print(f"Recommendations for 'Toy Story (1995)', User -1: {recommendations}")
except Exception as e:
    print(f"Expected error for User -1: {e}")
print("\n\n")

# Test extreme weights

try:
    print(":::::: Test extreme weights - content only ::::::::::")
    content_only = recommend_hybrid(
        title="Toy Story (1995)",
        user_id=1,
        content_weight=1.0,
        collab_weight=0.0,
        num_recommendations=5
    )
    print(f"Content-Based Only Recommendations: {content_only}")

    print("\n:::::: Test extreme weights - collaborative only ::::::::::")
    collaborative_only = recommend_hybrid(
        title="Toy Story (1995)",
        user_id=1,
        content_weight=0.0,
        collab_weight=1.0,
        num_recommendations=5
    )
    print(f"Collaborative Only Recommendations: {collaborative_only}")
except Exception as e:
    print(f"Error during weight testing: {e}")
print("\n\n")

# Purely content-based recommendations
print("Purely content-based recommendations:")
try:
    recommendations = recommend_hybrid(
        title="Toy Story (1995)",
        user_id=None,  # No user ID
        content_weight=1.0,
        collab_weight=0.0,
        num_recommendations=5
    )
    print(recommendations)
except Exception as e:
    print(f"Error during purely content-based testing: {e}")
print("\n\n")

# Hybrid recommendation with only content-based weight(Hybrid with user_id=None and Custom Weights)
print("Recommendations with user_id=None and content focus:")
# try:
recommendations = recommend_hybrid(
    title="Toy Story (1995)",
    user_id=None,
    content_weight=0.7,  # Adjust weights as needed
    collab_weight=0.3,  # Collaborative filtering weight (ignored due to user_id=None)
    num_recommendations=5
)
print(recommendations)
# except Exception as e:
    # print(f"Error during recommendations with user_id=None and content focus testing: {e}")


Hybrid Recommendations for 'Toy Story (1995)', User 1: ['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)']



Expected error for 'Nonexistent Movie': Movie 'Nonexistent Movie' not found in the dataset.



Expected error for User -1: Invalid user_id: -1. Must be between 1 and 200948.



:::::: Test extreme weights - content only ::::::::::
Content-Based Only Recommendations: ['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)']

:::::: Test extreme weights - collaborative only ::::::::::
Collaborative Only Recommendations: ['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)']



Purely content-based recommendations:
['Toy Story 2 (1999)', 'Toy Story 3 (2010)', 'Toy Story 4 (2019)', 'Toy Story Of Terror (2013)', 'Toy Story That Time Forgot (2014)']

