In [3]:
import pandas as pd

# Read the pre-cleaned ratings table from disk into a DataFrame
ratings = pd.read_csv(filepath_or_buffer='data/cleaned_ratings.csv')

# Report (rows, columns) as a quick sanity check on the load
print("Ratings Shape:", ratings.shape)


Ratings Shape: (31842705, 4)


In [7]:
# Count the distinct user and movie identifiers present in the ratings table.
# These are counts of distinct IDs, not the largest ID value.
num_users, num_movies = (
    ratings[id_column].nunique() for id_column in ('userId', 'movieId')
)

print("Shape of user-item interaction matrix:", (num_users, num_movies))

Shape of user-item interaction matrix: (200948, 31961)


In [11]:
from scipy.sparse import csr_matrix

# Build the sparse user-item interaction matrix from (value, (row, col)) triplets.
# Raw userId values are used directly as row positions and raw movieId values as
# column positions, so row r holds the ratings of userId == r and column c holds
# the ratings of movieId == c. (A dense `ratings.pivot(...).fillna(0)` was the
# earlier, now-disabled alternative to this sparse construction.)
user_movie_matrix = csr_matrix(
    (
        ratings['rating'].to_numpy(),
        (ratings['userId'].to_numpy(), ratings['movieId'].to_numpy()),
    )
)

print("User-Movie Interaction Matrix Shape:", user_movie_matrix.shape)

User-Movie Interaction Matrix Shape: (200949, 292350)


In [27]:
import os
import pickle

import numpy as np
from sklearn.decomposition import TruncatedSVD

# This cell calls np.save, so numpy is imported here rather than relying on a
# later cell having been run first (which fails under Restart & Run All).

# Make sure the output directory exists before anything is written into it
os.makedirs("models", exist_ok=True)

# Perform SVD
svd = TruncatedSVD(n_components=50, random_state=42)  # 50 latent factors

# fit_transform returns one row of latent factors per row of user_movie_matrix
user_factors = svd.fit_transform(user_movie_matrix)
# Save user factors
np.save("models/user_factors.npy", user_factors)
print("Saved user latent factors to 'models/user_factors.npy'.")


# components_ is (n_components, n_columns); transpose to get one row per
# column of user_movie_matrix
movie_factors = svd.components_.T
# Save movie factors
np.save("models/movie_factors.npy", movie_factors)
print("Saved movie latent factors to 'models/movie_factors.npy'.")

print("User factors shape:", user_factors.shape)
print("Movie factors shape:", movie_factors.shape)


# Save the SVD model to a file
# NOTE(review): only unpickle this file from a trusted location; pickle.load
# can execute arbitrary code.
with open("models/svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

print("SVD model saved as 'models/svd_model.pkl'.")

Saved user latent factors to 'models/user_factors.npy'.
Saved movie latent factors to 'models/movie_factors.npy'.
User factors shape: (200949, 50)
Movie factors shape: (292350, 50)
SVD model saved as 'models/svd_model.pkl'.


In [17]:
import numpy as np

def recommend_collaborative(user_id, num_recommendations=10):
    """Recommend movie titles for a user from the SVD latent factors.

    The interaction matrix in this notebook is built with raw ``userId``
    values as row positions and raw ``movieId`` values as column positions.
    Consequently row ``user_id`` of ``user_factors`` belongs to that user and
    row ``m`` of ``movie_factors`` belongs to ``movieId == m``; neither is a
    position in the ``movies`` DataFrame.

    Parameters
    ----------
    user_id : int
        Raw userId as it appears in the ratings data.
    num_recommendations : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles taken from ``movies``, ordered from highest to lowest score.
        Only movies whose ``movieId`` has a row in ``movie_factors`` are
        considered.

    Raises
    ------
    IndexError
        If ``user_id`` does not address a row of ``user_factors``.
    """
    if not 0 <= user_id < len(user_factors):
        raise IndexError(f"user_id {user_id} is out of range for user_factors")

    # Row index == raw userId (no "- 1" shift)
    user_vector = user_factors[user_id]

    # One score per matrix column, i.e. indexed by raw movieId
    scores_by_movie_id = np.dot(movie_factors, user_vector)

    # Look up the score of every catalogued movie through its movieId.
    # Assumes movies['movieId'] is an integer column -- TODO confirm.
    movie_ids = movies['movieId'].to_numpy()
    has_factors = (movie_ids >= 0) & (movie_ids < len(scores_by_movie_id))
    candidates = movies[has_factors]
    candidate_scores = scores_by_movie_id[movie_ids[has_factors]]

    # Rank candidates by descending score and keep that order in the result
    top_positions = np.argsort(candidate_scores)[::-1][:num_recommendations]
    return candidates.iloc[top_positions]['title']

In [21]:
# Blend content-based and collaborative filtering outputs using weighted scores
def recommend_hybrid(title, user_id, num_recommendations=10, content_weight=0.5, collab_weight=0.5):
    """Blend content-based and collaborative scores into one ranked title list.

    Content scores come from the ``cosine_sim`` row selected through
    ``movie_indices[title]`` and are treated as positional over ``movies``
    (the result is read back with ``movies.iloc``). Collaborative scores come
    from the SVD factors, whose rows are addressed by raw ``userId`` /
    ``movieId`` because the interaction matrix uses the raw IDs as indices.
    The collaborative scores are therefore re-aligned to ``movies`` row order
    before the two are combined.

    Parameters
    ----------
    title : str
        Title used to look up the content-similarity row.
    user_id : int
        Raw userId as it appears in the ratings data.
    num_recommendations : int, default 10
        Maximum number of titles to return.
    content_weight, collab_weight : float, default 0.5
        Multipliers applied to the content and collaborative scores.

    Returns
    -------
    pandas.Series
        Titles from ``movies`` ordered from highest to lowest combined score.

    Raises
    ------
    IndexError
        If ``user_id`` does not address a row of ``user_factors``.
    """
    # Content-based scores: one value per row of `movies`
    # (assumes cosine_sim rows follow `movies` row order -- TODO confirm)
    content_position = movie_indices[title]
    content_scores = cosine_sim[content_position]

    # Collaborative scores, indexed by raw movieId
    if not 0 <= user_id < len(user_factors):
        raise IndexError(f"user_id {user_id} is out of range for user_factors")
    user_vector = user_factors[user_id]  # row index == raw userId
    scores_by_movie_id = np.dot(movie_factors, user_vector)

    # Re-align to `movies` row order; movies without a factor row score 0.
    # Assumes movies['movieId'] is an integer column -- TODO confirm.
    movie_ids = movies['movieId'].to_numpy()
    has_factors = (movie_ids >= 0) & (movie_ids < len(scores_by_movie_id))
    collab_scores = np.zeros(len(movies))
    collab_scores[has_factors] = scores_by_movie_id[movie_ids[has_factors]]

    # Weighted blend; both arrays are now positional over `movies`
    combined_scores = content_weight * content_scores + collab_weight * collab_scores

    # Highest combined score first
    top_positions = np.argsort(combined_scores)[::-1][:num_recommendations]
    return movies.iloc[top_positions]['title']