# User-based Collaborative Filtering

Core idea:
"If Alice and Bob have rated movies similarly in the past, then we can recommend to Alice the movies that Bob liked (but Alice hasn’t seen yet)."

How it works:

Find users similar to the target user (e.g., Alice).

Look at what those similar users liked.

Recommend those movies to Alice.

Example:
If Alice and Bob both liked Inception and The Matrix, and Bob also liked Interstellar, then Interstellar might be recommended to Alice.

In [1]:
import pandas as pd
import os
base_path = r'C:\Users\Sara\Documents\python proj'

User-based Collaborative Filtering

In [2]:
# Load ratings and movie info
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Merge to get movie titles (optional, for display)
ratings_merged = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

# Drop missing values (just in case)
ratings_merged.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)



In [3]:
# Check the structure
ratings_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1147878820,Underground (1995)
4,1,899,3.5,1147868510,Singin' in the Rain (1952)


In [4]:
len(ratings_merged)

25000095

In [5]:
ratings_merged['userId'].nunique()

162541

In [6]:
ratings_merged['movieId'].nunique()

59047

Option 1: Filter to “Active” Users and/or “Popular” Movies

We do this, because the full dataset is too computationally expensive for personal laptops.

In [7]:
# Keep users with at least 500 ratings
user_counts = ratings_merged['userId'].value_counts()
active_users = user_counts[user_counts >= 500].index

# Keep movies with at least 1000 ratings
movie_counts = ratings_merged['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 1000].index

# Filter the DataFrame
filtered = ratings_merged[
    ratings_merged['userId'].isin(active_users) &
    ratings_merged['movieId'].isin(popular_movies)
]

In [8]:
len(filtered)

7127698

In [9]:
# Assuming 'filtered' is your cleaned dataset with userId, movieId, rating, and title
user_item_matrix = filtered.pivot_table(index='userId', columns='movieId', values='rating')

# Mean-center and fill NaNs with 0 (for cosine similarity)
user_item_centered = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0).fillna(0)

# Save for later use in prediction
user_ids = user_item_centered.index.tolist()
movie_ids = user_item_centered.columns.tolist()


In [10]:
user_item_centered 

movieId,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.302050,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
12,0.663313,-1.336687,-1.336687,0.000000,0.000000,0.000000,-0.336687,0.0,0.0,-0.336687,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
72,0.000000,-1.371912,0.000000,0.000000,0.000000,0.628088,0.628088,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
80,0.000000,-1.561441,0.000000,0.000000,0.000000,2.438559,0.000000,0.0,0.0,2.438559,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
120,0.748718,0.000000,0.000000,0.000000,0.000000,0.000000,-0.251282,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,0.406332,-0.093668,0.000000,-0.093668,-1.093668,0.000000,0.000000,0.0,0.0,-0.093668,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162495,0.091423,0.091423,0.591423,0.000000,-0.908577,0.000000,0.000000,0.0,0.0,1.091423,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162508,1.294654,0.000000,0.000000,0.000000,0.000000,-0.205346,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162516,1.220662,-0.779338,-2.779338,-1.279338,0.000000,1.220662,-1.279338,0.0,0.0,0.220662,...,0.220662,0.0,0.0,0.0,0.0,0.0,0.720662,0.220662,0.220662,0.0


# Annoy

In [11]:
from annoy import AnnoyIndex

# Number of movies (features per user)
f = len(movie_ids)
annoy_index = AnnoyIndex(f, metric='angular')  # angular ≈ cosine

user_id_map = {}               # userId -> index in annoy
reverse_user_id_map = {}       # index in annoy -> userId

for i, user_id in enumerate(user_ids):
    vector = user_item_centered.loc[user_id].values
    annoy_index.add_item(i, vector)
    user_id_map[user_id] = i
    reverse_user_id_map[i] = user_id

# Build with 10 trees (balance speed/accuracy)
annoy_index.build(n_trees=10) # will deliver randomized results. Optionally we can save the index

True

In [12]:
def predict_rating_annoy(user_id, movie_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, k=30, min_raters=3):
    if user_id not in user_id_map or movie_id not in user_item_matrix.columns:
        return None

    u_idx = user_id_map[user_id]
    neighbors = annoy_index.get_nns_by_item(u_idx, k, include_distances=False)
    neighbor_ids = [reverse_user_id_map[i] for i in neighbors if reverse_user_id_map[i] != user_id]

    ratings = []
    for neighbor_id in neighbor_ids:
        rating = user_item_matrix.loc[neighbor_id, movie_id]
        if not np.isnan(rating):
            ratings.append(rating)

    if len(ratings) < min_raters:
        return user_item_matrix.loc[user_id].mean()  # fallback

    return np.mean(ratings)

In [13]:
def recommend_top_n_annoy(user_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, movies_df, n=5, k_neighbors=30):
    if user_id not in user_item_matrix.index:
        return []

    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isna()]
    predictions = []

    for movie_id in unrated_movies:
        pred = predict_rating_annoy(user_id, movie_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, k=k_neighbors)
        if pred is not None:
            predictions.append((movie_id, pred))

    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return [(mid, movies_df.loc[movies_df.movieId == mid, 'title'].values[0], score) for mid, score in top_n]

In [14]:
import numpy as np

example_user = user_item_matrix.index[0]
example_user1 = user_item_matrix.index[1]
example_user2 = user_item_matrix.index[2]

top_recs = recommend_top_n_annoy(
    user_id=example_user,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)

top_recs1 = recommend_top_n_annoy(
    user_id=example_user1,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)
top_recs2 = recommend_top_n_annoy(
    user_id=example_user2,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)

# Print recommendations
print(f"\nTop 5 Recommendations for User {example_user}:")
for movie_id, title, score in top_recs:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

print(f"\nTop 5 Recommendations for User {example_user1}:")
for movie_id, title, score in top_recs1:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

print(f"\nTop 5 Recommendations for User {example_user2}:")
for movie_id, title, score in top_recs2:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")


Top 5 Recommendations for User 3:
In the Mood For Love (Fa yeung nin wa) (2000) (Movie ID: 4144) — Predicted Rating: 4.83
Persona (1966) (Movie ID: 7327) — Predicted Rating: 4.75
On the Waterfront (1954) (Movie ID: 1945) — Predicted Rating: 4.67
Fanny and Alexander (Fanny och Alexander) (1982) (Movie ID: 2068) — Predicted Rating: 4.62
12 Angry Men (1957) (Movie ID: 1203) — Predicted Rating: 4.53

Top 5 Recommendations for User 12:
400 Blows, The (Les quatre cents coups) (1959) (Movie ID: 2731) — Predicted Rating: 4.71
Seven Samurai (Shichinin no samurai) (1954) (Movie ID: 2019) — Predicted Rating: 4.68
Breaking the Waves (1996) (Movie ID: 1354) — Predicted Rating: 4.67
Stranger Than Paradise (1984) (Movie ID: 3925) — Predicted Rating: 4.67
Hearts of Darkness: A Filmmakers Apocalypse (1991) (Movie ID: 26729) — Predicted Rating: 4.67

Top 5 Recommendations for User 72:
Touch of Evil (1958) (Movie ID: 1248) — Predicted Rating: 4.67
Grand Illusion (La grande illusion) (1937) (Movie ID: 31

Evaluation of Annoy

In [15]:
from sklearn.model_selection import train_test_split

# Split the original filtered ratings
train_df, test_df = train_test_split(filtered, test_size=0.2, random_state=42)

In [16]:
from sklearn.model_selection import train_test_split

# Split your filtered data
train_df, test_df = train_test_split(filtered, test_size=0.2, random_state=42)

# Only keep users from the Annoy index
active_user_ids = user_item_matrix.index
test_sample = test_df[test_df['userId'].isin(active_user_ids)].sample(n=10, random_state=42)


In [17]:
test_sample

Unnamed: 0,userId,movieId,rating,timestamp,title
2797770,18551,54286,5.0,1445901557,"Bourne Ultimatum, The (2007)"
21679295,140913,4310,2.5,1112679231,Pearl Harbor (2001)
10683697,69398,48780,4.0,1298477439,"Prestige, The (2006)"
23209387,150681,2701,4.0,1115165140,Wild Wild West (1999)
20355220,132358,180031,3.0,1535534994,The Shape of Water (2017)
11573153,75054,3101,5.0,997377038,Fatal Attraction (1987)
16642645,107945,3730,4.5,1548871485,"Conversation, The (1974)"
19719637,128103,47,4.5,1156923416,Seven (a.k.a. Se7en) (1995)
13356390,86427,368,5.0,966640776,Maverick (1994)
19810534,128740,8641,3.5,1230161589,Anchorman: The Legend of Ron Burgundy (2004)


In [18]:
from sklearn.metrics import root_mean_squared_error

true_ratings = []
predicted_ratings = []

for _, row in test_sample.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    true_rating = row['rating']
    
    pred = predict_rating_annoy(
        user_id, movie_id,
        user_item_matrix=user_item_matrix,
        annoy_index=annoy_index,
        user_id_map=user_id_map,
        reverse_user_id_map=reverse_user_id_map,
        k=30
    )
    
    if pred is not None:
        true_ratings.append(true_rating)
        predicted_ratings.append(pred)

# Final RMSE
rmse = root_mean_squared_error(true_ratings, predicted_ratings)
print(f"✅ RMSE on 100 samples: {rmse:.4f}")

✅ RMSE on 100 samples: 1.0779


# FAISS (Facebook AI Similarity Search)

In [19]:
# Load ratings and movie info
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Merge to get movie titles (optional, for display)
ratings_merged = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

# Drop missing values (just in case)
ratings_merged.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)

# Keep users with at least 500 ratings
user_counts = ratings_merged['userId'].value_counts()
active_users = user_counts[user_counts >= 500].index

# Filter the DataFrame to keep only active users
filtered = ratings_merged[ratings_merged['userId'].isin(active_users)]

# Keep movies with at least 500 ratings
movie_counts = ratings_merged['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 200].index

# Filter the dataset to include only the popular movies
filtered = filtered[filtered['movieId'].isin(popular_movies)]

# Check the filtered dataset size
print(f"Filtered dataset shape: {filtered.shape}")


Filtered dataset shape: (8305815, 5)


In [20]:
filtered["movieId"].nunique(), filtered["userId"].nunique()

(7984, 9713)

In [21]:
import faiss
from sklearn.preprocessing import normalize

# Create user-item matrix
user_item_matrix = filtered.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
user_vectors = normalize(user_item_matrix.values.astype('float32'))

# Build FAISS index (cosine similarity ≈ dot product on normalized vectors)
index = faiss.IndexFlatIP(user_vectors.shape[1])  # IP = Inner Product
index.add(user_vectors)

In [22]:
index

<faiss.swigfaiss_avx2.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x0000029DAA5FBE40> >

In [23]:
target_user_id = 3
target_idx = user_item_matrix.index.get_loc(target_user_id)
D, I = index.search(np.expand_dims(user_vectors[target_idx], axis=0), k=5)

# Output: indices of similar users + similarity scores
similar_users = user_item_matrix.index[I[0]]
print(similar_users)

Index([3, 124561, 149968, 159639, 91040], dtype='int64', name='userId')


In [24]:
def normalize_user_item_matrix(user_item_matrix):
    # Normalize the user-item matrix by subtracting each user's average rating
    # Calculate the mean of each user's ratings (ignoring NaN values)
    user_means = user_item_matrix.mean(axis=1)
    
    # Subtract the user's mean rating from each of their ratings
    user_item_matrix_normalized = user_item_matrix.sub(user_means, axis=0)
    
    return user_item_matrix_normalized

In [25]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,201588,201646,201749,201773,202103,202429,202439,203222,204698,205383
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,2.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,3.5,3.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162495,3.0,3.0,3.5,0.0,2.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162508,4.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
def recommend_top_n_faiss(user_id, user_item_matrix, index, user_vectors, movies_df, n=5, k=5):
    """
    Recommend top-N movies for a given user using FAISS-based nearest neighbors.

    Parameters:
    - user_id: ID of the target user.
    - user_item_matrix: DataFrame containing user-item ratings.
    - index: FAISS index built from user vectors.
    - user_vectors: Numpy array of user vectors.
    - movies_df: DataFrame containing movie information with 'movieId' and 'title' columns.
    - n: Number of top recommendations to return.
    - k: Number of nearest neighbors to consider.

    Returns:
    - List of tuples: (movieId, title, predicted_rating)
    """
    if user_id not in user_item_matrix.index:
        return []

    # Normalize the user-item matrix by subtracting the user's average rating
    user_item_matrix_normalized = normalize_user_item_matrix(user_item_matrix)

    # Get the index of the target user
    target_idx = user_item_matrix.index.get_loc(user_id)

    # Search for k+1 nearest neighbors (including the user themselves)
    D, I = index.search(np.expand_dims(user_vectors[target_idx], axis=0), k=k+1)

    # Retrieve neighbor indices and distances
    neighbor_indices = I[0]
    distances = D[0]

    # Map indices to user IDs
    neighbor_ids = user_item_matrix.index[neighbor_indices]

    # Exclude the target user from neighbors
    mask = neighbor_ids != user_id
    neighbor_ids = neighbor_ids[mask]
    distances = distances[mask]

    # Identify movies not yet rated by the target user (those rated as 0)
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id] == 0]
    
    if unrated_movies.empty:
        print(f"User {user_id} has already rated all movies. No unrated movies left to recommend.")
        return []  # Return empty list if no unrated movies are left
    
    predictions = []

    for movie_id in unrated_movies:
        # Retrieve ratings from neighbors for the current movie (from the normalized matrix)
        neighbor_ratings = user_item_matrix_normalized.loc[neighbor_ids, movie_id]

        # Drop ratings that are NaN (neighbors who haven't rated the movie)
        valid_ratings = neighbor_ratings.dropna()

        if valid_ratings.empty:
            continue  # Skip if no neighbor has rated the movie

        # Compute similarities (inverse of distances)
        similarities = 1 / (distances[:len(valid_ratings)] + 1e-10)  # Add epsilon to avoid division by zero
        
        # Normalize similarities to ensure they are not too skewed
        if np.sum(similarities) > 0:
            similarities = similarities / np.sum(similarities)

        # Compute weighted average of neighbor ratings
        weighted_sum = np.dot(valid_ratings.values, similarities)
        sum_of_weights = np.sum(similarities)

        # Calculate predicted rating in the normalized scale
        predicted_rating_normalized = weighted_sum / sum_of_weights

        # Get the user's average rating to "de-normalize" the prediction
        user_avg_rating = user_item_matrix.loc[user_id].mean()

        # Add the user's average back to the predicted rating
        predicted_rating = predicted_rating_normalized + user_avg_rating
        
        # Clamp predictions between 1 and 5
        predicted_rating = min(5, max(1, predicted_rating))

        predictions.append((movie_id, predicted_rating))

    if not predictions:
        print("No predictions made.")
        return []

    # Sort predictions by predicted rating in descending order
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]

    # Retrieve movie titles
    recommendations = []
    for movie_id, score in top_n:
        title = movies_df.loc[movies_df['movieId'] == movie_id, 'title'].values
        title = title[0] if len(title) > 0 else "Unknown Title"
        recommendations.append((movie_id, title, score))

    return recommendations


In [27]:
user_id = 3  # Replace with the target user's ID
recommendations = recommend_top_n_faiss(user_id, user_item_matrix, index, user_vectors, movies, n=5, k=5)

print(f"Top 5 Recommendations for User {user_id}:")
for movie_id, title, score in recommendations:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

Top 5 Recommendations for User 3:
Green Mile, The (1999) (Movie ID: 3147) — Predicted Rating: 4.27
Indiana Jones and the Last Crusade (1989) (Movie ID: 1291) — Predicted Rating: 4.17
Deadpool (2016) (Movie ID: 122904) — Predicted Rating: 4.07
John Wick (2014) (Movie ID: 115149) — Predicted Rating: 4.07
Love Actually (2003) (Movie ID: 6942) — Predicted Rating: 3.97


In [28]:
# Get the movies that the user has rated
rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]  # Non-zero ratings

print(f"Movies rated by User {user_id}:")
for movie_id, rating in rated_movies.items():
    title = movies.loc[movies['movieId'] == movie_id, 'title'].values
    title = title[0] if len(title) > 0 else "Unknown Title"
    print(f"{title} (Movie ID: {movie_id}) — Rating: {rating:.2f}")

Movies rated by User 3:
Toy Story (1995) (Movie ID: 1) — Rating: 4.00
City of Lost Children, The (Cité des enfants perdus, La) (1995) (Movie ID: 29) — Rating: 4.50
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (Movie ID: 32) — Rating: 4.50
Usual Suspects, The (1995) (Movie ID: 50) — Rating: 5.00
Taxi Driver (1976) (Movie ID: 111) — Rating: 4.00
Johnny Mnemonic (1995) (Movie ID: 172) — Rating: 4.00
Judge Dredd (1995) (Movie ID: 173) — Rating: 3.00
Before the Rain (Pred dozhdot) (1994) (Movie ID: 214) — Rating: 5.00
Star Wars: Episode IV - A New Hope (1977) (Movie ID: 260) — Rating: 4.00
Léon: The Professional (a.k.a. The Professional) (Léon) (1994) (Movie ID: 293) — Rating: 5.00
Pulp Fiction (1994) (Movie ID: 296) — Rating: 5.00
Shawshank Redemption, The (1994) (Movie ID: 318) — Rating: 4.00
Forrest Gump (1994) (Movie ID: 356) — Rating: 4.00
Demolition Man (1993) (Movie ID: 442) — Rating: 3.50
Jurassic Park (1993) (Movie ID: 480) — Rating: 2.00
Schindler's List (1993) (Movie ID: 527) — Rati

## Full dense Matrix 
- Saving sparse matrix (so minimal RAM Usage), but computing Faiss on dense matrix since it requires this to run

In [29]:
from scipy.sparse import csr_matrix

user_ids = ratings_merged['userId'].unique()
movie_ids = ratings_merged['movieId'].unique()

user_map = {uid: i for i, uid in enumerate(user_ids)}
movie_map = {mid: i for i, mid in enumerate(movie_ids)}
reverse_user_map = {i: uid for uid, i in user_map.items()}
reverse_movie_map = {i: mid for mid, i in movie_map.items()}

row = ratings_merged['userId'].map(user_map)
col = ratings_merged['movieId'].map(movie_map)
data = ratings_merged['rating']

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_ids), len(movie_ids)))

In [30]:
122 in movie_map

True

The following code combines all users and all movies from the sparse_matrix

In [31]:
from sklearn.preprocessing import normalize
import faiss
import numpy as np

# Batch size definition
batch_size = 10000
index = faiss.IndexFlatIP(sparse_matrix.shape[1])  # FAISS expects dense matrix

user_indices = np.arange(sparse_matrix.shape[0])

for i in range(0, len(user_indices), batch_size):
    batch_rows = user_indices[i:i+batch_size]
    dense_batch = sparse_matrix[batch_rows].toarray().astype('float32')
    dense_batch = normalize(dense_batch)
    index.add(dense_batch)

In [32]:
print("FAISS index dimension:", index.d)
print("Number of users indexed:", index.ntotal)
print("Sparse matrix shape:", sparse_matrix.shape)

FAISS index dimension: 59047
Number of users indexed: 162541
Sparse matrix shape: (162541, 59047)


In [33]:
target_idx = user_map[3]  # e.g., userId = 3
query_vector = normalize(sparse_matrix[target_idx].toarray().astype('float32'))

D, I = index.search(query_vector, k=5)

similar_user_ids = [reverse_user_map[i] for i in I[0]]
print(similar_user_ids)

[np.int64(3), np.int64(124561), np.int64(149968), np.int64(51328), np.int64(159639)]


In [34]:
sparse_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 25000095 stored elements and shape (162541, 59047)>

In [35]:
def predict_rating_faiss(user_id, movie_id, sparse_matrix, faiss_index, user_map, movie_map, k=5):
    if user_id not in user_map or movie_id not in movie_map:
        return ("no user/movie")

    user_idx = user_map[user_id]
    movie_idx = movie_map[movie_id]

    user_vector = sparse_matrix[user_idx].toarray().astype('float32')
    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]  # exclude self

    ratings = []
    for neighbor in neighbor_indices:
        neighbor_rating = sparse_matrix[neighbor, movie_idx]
        if neighbor_rating != 0:
            ratings.append(neighbor_rating)

    if not ratings:
        global_avg = sparse_matrix.data.mean()
    return float(global_avg)

    return float(np.mean(ratings))

In [36]:
def recommend_top_n_faiss(user_id, sparse_matrix, faiss_index, user_map, movie_map, reverse_movie_map, movies_df, n=5, k=10):
    if user_id not in user_map:
        return []

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx].toarray().astype('float32')

    if np.count_nonzero(user_vector) == 0:
        return []  # cold-start user

    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    unrated_movie_indices = np.where(user_vector[0] == 0)[0]
    predictions = []

    for movie_idx in unrated_movie_indices:
        weighted_ratings = []
        weights = []

        for sim, neighbor in zip(similarities, neighbor_indices):
            rating = sparse_matrix[neighbor, movie_idx]
            if rating != 0:
                weighted_ratings.append(rating * sim)
                weights.append(sim)

        if weights and np.sum(weights) > 0:
            pred = np.sum(weighted_ratings) / (np.sum(weights) + 1e-10)
            movie_id = reverse_movie_map[movie_idx]
            title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
            title = title_row.values[0] if not title_row.empty else "Unknown"
            predictions.append((movie_id, title, pred))

    top_n = sorted(predictions, key=lambda x: x[2], reverse=True)[:n]
    return top_n


In [37]:
# Predict rating
movies_df = ratings_merged[['movieId', 'title']].drop_duplicates()
rating = predict_rating_faiss(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map
)
print(f"Predicted rating: {rating}")

# Top-N recommendations
top_recs = recommend_top_n_faiss(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    n=5
)

for mid, title, score in top_recs:
    print(f"{title} (Movie ID: {mid}) — Predicted Rating: {score:.2f}")

Predicted rating: 3.533854451353085
Talk to Her (Hable con Ella) (2002) (Movie ID: 5878) — Predicted Rating: 5.00
Planet Earth II (2016) (Movie ID: 171011) — Predicted Rating: 5.00
Cinderella (1950) (Movie ID: 1022) — Predicted Rating: 5.00
Peter Pan (1953) (Movie ID: 2087) — Predicted Rating: 5.00
My Cousin Vinny (1992) (Movie ID: 2302) — Predicted Rating: 5.00


In [38]:
rating = predict_rating_faiss(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map
)
print(f"Predicted rating: {rating}")

Predicted rating: 3.533854451353085


None of the neighbors have rated the movie 122, so we use the fallback float(np.mean(ratings))

## Fallback + Minimal Overlap between Neighbors (i.e., minimum movies the target user and its nearest neighbors have both rated)

In [39]:
def predict_rating_faiss_hybrid(user_id, movie_id, sparse_matrix, faiss_index, user_map, movie_map, k=20, min_overlap=3):
    if user_id not in user_map or movie_id not in movie_map:
        return "no user/movie"

    user_idx = user_map[user_id]
    movie_idx = movie_map[movie_id]

    user_vector = sparse_matrix[user_idx].toarray().astype('float32')
    if np.count_nonzero(user_vector) == 0:
        return float(sparse_matrix.data.mean())  # cold-start user → global avg

    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    ratings = []
    weights = []

    for sim, neighbor_idx in zip(similarities, neighbor_indices):
        neighbor_vector = sparse_matrix[neighbor_idx].toarray().astype('float32')
        both_rated = (user_vector != 0) & (neighbor_vector != 0)
        overlap = np.sum(both_rated)

        rating = sparse_matrix[neighbor_idx, movie_idx]
        if rating != 0 and overlap >= min_overlap:
            adjusted_weight = sim * (overlap / (np.count_nonzero(user_vector) + 1e-10))
            ratings.append(rating * adjusted_weight)
            weights.append(adjusted_weight)

    if ratings and np.sum(weights) > 0:
        pred = np.sum(ratings) / np.sum(weights)
        return float(np.clip(pred, 0.5, 5.0))

    # Fallbacks
    movie_ratings = sparse_matrix[:, movie_idx].data
    if len(movie_ratings) > 0:
        return float(np.clip(np.mean(movie_ratings), 0.5, 5.0))

    user_ratings = sparse_matrix[user_idx].data
    if len(user_ratings) > 0:
        return float(np.clip(np.mean(user_ratings), 0.5, 5.0))

    return float(np.clip(sparse_matrix.data.mean(), 0.5, 5.0))  # Final fallback



In [40]:
pred = predict_rating_faiss_hybrid(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    k=20
)
print(f"Predicted rating: {pred}")

Predicted rating: 2.8593924191750277


## Defining Minimum Neighbors on which the decisions are made (we want more evidence - predicting only on one neighbor = too low confidence)

In [41]:
def recommend_top_n_faiss_hybrid_fast(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
):
    if user_id not in user_map:
        return []

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return []  # cold-start user

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    neighbor_vectors = {
        idx: sparse_matrix[idx].toarray().astype('float32')
        for idx in neighbor_indices
    }

    unrated_indices = np.where(user_dense[0] == 0)[0]
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    predictions = []

    for movie_idx in unrated_indices:
        weighted_scores = []
        weights = []
        true_ratings = []

        for sim, neighbor_idx in zip(similarities, neighbor_indices):
            neighbor_vec = neighbor_vectors[neighbor_idx]
            neighbor_rating = neighbor_vec[0, movie_idx]

            if neighbor_rating == 0:
                continue

            overlap = np.sum((user_rated_mask) & (neighbor_vec[0] != 0))
            if overlap < min_overlap:
                continue

            weight = sim * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(neighbor_rating * weight)
            weights.append(weight)
            true_ratings.append(neighbor_rating)

        # Filter: Require enough neighbors
        if len(true_ratings) < min_neighbors:
            continue

        pred = np.sum(weighted_scores) / np.sum(weights)
        pred = float(np.clip(pred, 0.5, 5.0))

        movie_id = reverse_movie_map[movie_idx]
        title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
        title = title_row.values[0] if not title_row.empty else "Unknown"

        predictions.append((movie_id, title, pred, true_ratings))

    top_n = sorted(predictions, key=lambda x: x[2], reverse=True)[:n]

    for movie_id, title, score, ratings_used in top_n:
        print(f"{title} (Movie ID: {movie_id})")
        print(f"  → Predicted Rating: {score:.2f}")
        print(f"  → Neighbors Used: {len(ratings_used)}")
        print(f"  → Ratings Used: {[float(r) for r in ratings_used]}")
        print("-" * 60)

    return [(movie_id, title, score) for movie_id, title, score, _ in top_n]


In [42]:
top_recs = recommend_top_n_faiss_hybrid_fast(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    n=5
)


Planet Earth II (2016) (Movie ID: 171011)
  → Predicted Rating: 5.00
  → Neighbors Used: 3
  → Ratings Used: [5.0, 5.0, 5.0]
------------------------------------------------------------
Planet Earth (2006) (Movie ID: 159817)
  → Predicted Rating: 4.67
  → Neighbors Used: 7
  → Ratings Used: [5.0, 4.0, 5.0, 5.0, 3.5, 5.0, 5.0]
------------------------------------------------------------
Paperman (2012) (Movie ID: 98491)
  → Predicted Rating: 4.58
  → Neighbors Used: 4
  → Ratings Used: [4.5, 5.0, 4.0, 5.0]
------------------------------------------------------------
Blue Planet II (2017) (Movie ID: 179135)
  → Predicted Rating: 4.53
  → Neighbors Used: 5
  → Ratings Used: [4.0, 5.0, 4.5, 4.0, 5.0]
------------------------------------------------------------
Darjeeling Limited, The (2007) (Movie ID: 55269)
  → Predicted Rating: 4.46
  → Neighbors Used: 6
  → Ratings Used: [4.0, 5.0, 4.5, 5.0, 4.5, 3.5]
------------------------------------------------------------


## Optimal k-value (tbd)
- k most similar users to the target user (based on cosine similarity)
-   Comparing top recommendations across different k values

In [43]:
def grid_search_k_for_user(user_id, sparse_matrix, faiss_index, user_map, movie_map, reverse_movie_map, movies_df, k_values=[5, 10, 20, 30], min_overlap=3, n=5, min_neighbors=3):
    results = {}

    for k in k_values:
        print(f"\nRunning recommend_top_n_faiss_hybrid_fast with k={k}...")
        top_recs = recommend_top_n_faiss_hybrid_fast(
            user_id=user_id,
            sparse_matrix=sparse_matrix,
            faiss_index=index,
            user_map=user_map,
            movie_map=movie_map,
            reverse_movie_map=reverse_movie_map,
            movies_df=movies_df,
            n=n,
            k=k,
            min_overlap=min_overlap,
            min_neighbors=min_neighbors
        )
        results[k] = top_recs

        print(f"\nTop {n} Recommendations for k={k}:")
        for mid, title, score in top_recs:
            print(f"{title} (Movie ID: {mid}) — Predicted Rating: {score:.2f}")
        print("-" * 60)

    return results

In [44]:
grid_results = grid_search_k_for_user(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    k_values=[5, 10, 20, 30],  # test different k values
    min_overlap=3,
    n=5,
    min_neighbors=3  # ensure quality predictions only
)


Running recommend_top_n_faiss_hybrid_fast with k=5...
Warrior (2011) (Movie ID: 89774)
  → Predicted Rating: 4.83
  → Neighbors Used: 3
  → Ratings Used: [5.0, 5.0, 4.5]
------------------------------------------------------------
Rocky (1976) (Movie ID: 1954)
  → Predicted Rating: 4.67
  → Neighbors Used: 3
  → Ratings Used: [5.0, 4.5, 4.5]
------------------------------------------------------------
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) (Movie ID: 4973)
  → Predicted Rating: 4.57
  → Neighbors Used: 4
  → Ratings Used: [5.0, 5.0, 5.0, 3.5]
------------------------------------------------------------
Darjeeling Limited, The (2007) (Movie ID: 55269)
  → Predicted Rating: 4.53
  → Neighbors Used: 3
  → Ratings Used: [4.0, 5.0, 4.5]
------------------------------------------------------------
Blue Planet II (2017) (Movie ID: 179135)
  → Predicted Rating: 4.53
  → Neighbors Used: 3
  → Ratings Used: [4.0, 5.0, 4.5]
----------------------------------------------------------

## Structured Output Generation

In [45]:
def recommend_top_n_faiss_hybrid_fast_structured(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
):
    import pandas as pd

    if user_id not in user_map:
        return pd.DataFrame()

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return pd.DataFrame()  # cold-start user

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    neighbor_vectors = {
        idx: sparse_matrix[idx].toarray().astype('float32')
        for idx in neighbor_indices
    }

    unrated_indices = np.where(user_dense[0] == 0)[0]
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    predictions = []

    for movie_idx in unrated_indices:
        weighted_scores = []
        weights = []
        true_ratings = []

        for sim, neighbor_idx in zip(similarities, neighbor_indices):
            neighbor_vec = neighbor_vectors[neighbor_idx]
            neighbor_rating = neighbor_vec[0, movie_idx]

            if neighbor_rating == 0:
                continue

            overlap = np.sum((user_rated_mask) & (neighbor_vec[0] != 0))
            if overlap < min_overlap:
                continue

            weight = sim * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(neighbor_rating * weight)
            weights.append(weight)
            true_ratings.append(neighbor_rating)

        if len(true_ratings) < min_neighbors:
            continue

        pred = np.sum(weighted_scores) / np.sum(weights)
        pred = float(np.clip(pred, 0.5, 5.0))

        movie_id = reverse_movie_map[movie_idx]
        title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
        genres_row = movies_df.loc[movies_df['movieId'] == movie_id, 'genres']
        title = title_row.values[0] if not title_row.empty else "Unknown"
        genres = genres_row.values[0] if not genres_row.empty else "Unknown"

        predictions.append({
            'userId': user_id,
            'movieId': movie_id,
            'title': title,
            'genres': genres,
            'predicted_rating': round(pred, 2),
            'neighbors_used': len(true_ratings)
        })

    top_n_df = pd.DataFrame(predictions).sort_values(by='predicted_rating', ascending=False).head(n)
    return top_n_df.reset_index(drop=True)


In [46]:
user3_recs_df = recommend_top_n_faiss_hybrid_fast_structured(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies, 
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
)

user3_recs_df

Unnamed: 0,userId,movieId,title,genres,predicted_rating,neighbors_used
0,3,171011,Planet Earth II (2016),Documentary,5.0,3
1,3,159817,Planet Earth (2006),Documentary,4.67,7
2,3,98491,Paperman (2012),Animation|Comedy|Romance,4.58,4
3,3,179135,Blue Planet II (2017),Documentary,4.53,5
4,3,55269,"Darjeeling Limited, The (2007)",Adventure|Comedy|Drama,4.46,6


## Optimization (tbd - hardware limitations)

In [47]:
def recommend_top_n_faiss_hybrid_optimization(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
):
    import pandas as pd
    import numpy as np

    if user_id not in user_map:
        return pd.DataFrame()

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return pd.DataFrame()

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    # Vectorized neighbor matrix
    neighbor_matrix = sparse_matrix[neighbor_indices].toarray().astype('float32')

    unrated_indices = np.where(user_dense[0] == 0)[0]
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    predictions = []

    for movie_idx in unrated_indices:
        weighted_scores = []
        weights = []
        true_ratings = []

        for i, neighbor_vec in enumerate(neighbor_matrix):
            rating = neighbor_vec[movie_idx]
            if rating == 0:
                continue

            overlap = np.sum(user_rated_mask & (neighbor_vec != 0))
            if overlap < min_overlap:
                continue

            weight = similarities[i] * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(rating * weight)
            weights.append(weight)
            true_ratings.append(rating)

        if len(true_ratings) < min_neighbors:
            continue

        pred = np.sum(weighted_scores) / np.sum(weights)
        pred = float(np.clip(pred, 0.5, 5.0))

        movie_id = reverse_movie_map[movie_idx]
        title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
        genres_row = movies_df.loc[movies_df['movieId'] == movie_id, 'genres']
        title = title_row.values[0] if not title_row.empty else "Unknown"
        genres = genres_row.values[0] if not genres_row.empty else "Unknown"

        print(f"{title} (Movie ID: {movie_id})")
        print(f"  → Predicted Rating: {pred:.2f}")
        print(f"  → Neighbors Used: {len(true_ratings)}")
        print(f"  → Ratings Used: {[float(r) for r in true_ratings]}")
        print("-" * 60)

        predictions.append({
            'userId': user_id,
            'movieId': movie_id,
            'title': title,
            'genres': genres,
            'predicted_rating': round(pred, 2),
            'neighbors_used': len(true_ratings)
        })

    top_n_df = pd.DataFrame(predictions).sort_values(by='predicted_rating', ascending=False).head(n)
    return top_n_df.reset_index(drop=True)


In [48]:
user3_recs_df = recommend_top_n_faiss_hybrid_optimization(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies,  
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
)

Back to the Future Part II (1989) (Movie ID: 2011)
  → Predicted Rating: 3.81
  → Neighbors Used: 12
  → Ratings Used: [3.0, 5.0, 4.0, 4.0, 4.0, 3.0, 3.5, 3.0, 4.0, 3.5, 3.5, 5.0]
------------------------------------------------------------
Back to the Future Part III (1990) (Movie ID: 2012)
  → Predicted Rating: 3.73
  → Neighbors Used: 9
  → Ratings Used: [3.5, 4.0, 3.0, 3.5, 3.0, 4.0, 2.5, 5.0, 5.0]
------------------------------------------------------------
NeverEnding Story, The (1984) (Movie ID: 2161)
  → Predicted Rating: 2.95
  → Neighbors Used: 3
  → Ratings Used: [4.0, 4.0, 0.5]
------------------------------------------------------------
Good Morning, Vietnam (1987) (Movie ID: 3448)
  → Predicted Rating: 3.34
  → Neighbors Used: 5
  → Ratings Used: [4.0, 3.5, 3.0, 4.0, 2.0]
------------------------------------------------------------
Requiem for a Dream (2000) (Movie ID: 3949)
  → Predicted Rating: 3.52
  → Neighbors Used: 16
  → Ratings Used: [3.5, 4.0, 3.5, 3.5, 3.5, 4.5,

In [49]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np

def recommend_top_n_faiss_hybrid_parallel(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3,
    max_threads=8
):
    if user_id not in user_map:
        return pd.DataFrame()

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return pd.DataFrame()

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    neighbor_matrix = sparse_matrix[neighbor_indices].toarray().astype('float32')
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)
    unrated_indices = np.where(user_dense[0] == 0)[0]

    # Define movie rating prediction for threading
    def predict_for_movie(movie_idx):
        weighted_scores, weights, true_ratings = [], [], []

        for i, neighbor_vec in enumerate(neighbor_matrix):
            rating = neighbor_vec[movie_idx]
            if rating == 0:
                continue
            overlap = np.sum(user_rated_mask & (neighbor_vec != 0))
            if overlap < min_overlap:
                continue

            weight = similarities[i] * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(rating * weight)
            weights.append(weight)
            true_ratings.append(rating)

        if len(true_ratings) < min_neighbors:
            return None

        pred = np.sum(weighted_scores) / np.sum(weights)
        pred = float(np.clip(pred, 0.5, 5.0))

        movie_id = reverse_movie_map[movie_idx]
        title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
        genres_row = movies_df.loc[movies_df['movieId'] == movie_id, 'genres']
        title = title_row.values[0] if not title_row.empty else "Unknown"
        genres = genres_row.values[0] if not genres_row.empty else "Unknown"

        return {
            'userId': user_id,
            'movieId': movie_id,
            'title': title,
            'genres': genres,
            'predicted_rating': round(pred, 2),
            'neighbors_used': len(true_ratings)
        }

    # ⏱️ Run parallel movie prediction
    results = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {executor.submit(predict_for_movie, idx): idx for idx in unrated_indices}
        for future in as_completed(futures):
            result = future.result()
            if result is not None:
                results.append(result)

    top_n_df = pd.DataFrame(results).sort_values(by='predicted_rating', ascending=False).head(n)
    return top_n_df.reset_index(drop=True)


In [50]:
user3_recs_df = recommend_top_n_faiss_hybrid_parallel(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3,
    max_threads=8  # or 4 depending on your CPU
)


In [51]:
user3_recs_df

Unnamed: 0,userId,movieId,title,genres,predicted_rating,neighbors_used
0,3,171011,Planet Earth II (2016),Documentary,5.0,3
1,3,159817,Planet Earth (2006),Documentary,4.67,7
2,3,98491,Paperman (2012),Animation|Comedy|Romance,4.58,4
3,3,179135,Blue Planet II (2017),Documentary,4.53,5
4,3,55269,"Darjeeling Limited, The (2007)",Adventure|Comedy|Drama,4.46,6


## Leave-one-out Evaluation

In [52]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
import faiss
from tqdm import tqdm

# 1. Leave-One-Out Train/Test Split
def create_leave_one_out_split(ratings_df):
    test_rows = []
    train_rows = []

    grouped = ratings_df.groupby('userId')
    for user_id, group in grouped:
        if len(group) < 2:
            continue
        test_row = group.sample(1, random_state=42)
        train_rows.append(group.drop(test_row.index))
        test_rows.append(test_row)

    train_df = pd.concat(train_rows).reset_index(drop=True)
    test_df = pd.concat(test_rows).reset_index(drop=True)
    return train_df, test_df

# 2. Sparse Matrix Builder
def build_sparse_matrix(train_df):
    user_ids = train_df['userId'].unique()
    movie_ids = train_df['movieId'].unique()

    user_map = {uid: i for i, uid in enumerate(user_ids)}
    movie_map = {mid: i for i, mid in enumerate(movie_ids)}
    reverse_movie_map = {i: mid for mid, i in movie_map.items()}

    rows = train_df['userId'].map(user_map).values
    cols = train_df['movieId'].map(movie_map).values
    data = train_df['rating'].values

    sparse_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))
    return sparse_matrix, user_map, movie_map, reverse_movie_map

# 3. FAISS Index Builder (Batched to prevent MemoryError)
def build_faiss_index_batched(sparse_matrix, batch_size=10000):
    index = faiss.IndexFlatIP(sparse_matrix.shape[1])
    user_indices = np.arange(sparse_matrix.shape[0])

    for i in range(0, len(user_indices), batch_size):
        batch_rows = user_indices[i:i + batch_size]
        dense_batch = sparse_matrix[batch_rows].toarray().astype('float32')
        dense_batch = normalize(dense_batch)
        index.add(dense_batch)

    return index

# 4. Predict rating for one user–movie pair
def predict_rating(user_id, movie_id, sparse_matrix, index, user_map, movie_map, k=20, min_overlap=3):
    if user_id not in user_map or movie_id not in movie_map:
        return None

    user_idx = user_map[user_id]
    movie_idx = movie_map[movie_id]

    user_vector = sparse_matrix[user_idx].toarray().astype('float32')
    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    user_rated_mask = user_vector[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    ratings, weights = [], []

    for i, neighbor_idx in enumerate(neighbor_indices):
        neighbor_vector = sparse_matrix[neighbor_idx].toarray().astype('float32')[0]
        rating = neighbor_vector[movie_idx]
        if rating == 0:
            continue
        overlap = np.sum(user_rated_mask & (neighbor_vector != 0))
        if overlap < min_overlap:
            continue
        weight = similarities[i] * (overlap / (user_rated_count + 1e-10))
        ratings.append(rating * weight)
        weights.append(weight)

    if ratings and np.sum(weights) > 0:
        return float(np.clip(np.sum(ratings) / np.sum(weights), 0.5, 5.0))
    else:
        return None

# 5. Evaluate RMSE over test set
def evaluate_rmse(test_df, sparse_matrix, index, user_map, movie_map, k=20):
    actuals, preds = [], []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        pred = predict_rating(
            user_id=row['userId'],
            movie_id=row['movieId'],
            sparse_matrix=sparse_matrix,
            index=index,
            user_map=user_map,
            movie_map=movie_map,
            k=k
        )
        if pred is not None:
            actuals.append(row['rating'])
            preds.append(pred)

    actuals = np.array(actuals)
    preds = np.array(preds)
    rmse = np.sqrt(np.mean((actuals - preds) ** 2))
    return rmse


In [55]:
# 1. Split into train/test (leave-one-out)
train_df, test_df = create_leave_one_out_split(ratings_merged)

# 2. Sampling users to avoid memory overload
sample_users = train_df['userId'].value_counts().loc[lambda x: x > 1].sample(2000, random_state=42).index
train_df_sample = train_df[train_df['userId'].isin(sample_users)].copy()
test_df_sample = test_df[test_df['userId'].isin(sample_users)].copy()

# 3. Building user-item matrix and FAISS index
sparse_matrix1, user_map, movie_map, reverse_movie_map = build_sparse_matrix(train_df_sample)
faiss_index = build_faiss_index_batched(sparse_matrix1)

# 4. Evaluate
rmse = evaluate_rmse(test_df_sample, sparse_matrix1, faiss_index, user_map, movie_map, k=20)
print(f"\nLeave-One-Out RMSE (sample of 2000 users): {rmse:.4f}")


Evaluating: 100%|██████████| 2000/2000 [00:10<00:00, 195.99it/s]


Leave-One-Out RMSE (sample of 2000 users): 1.0331





In [56]:
print("FAISS index dimension:", index.d)
print("Number of users indexed:", index.ntotal)
print("Sparse matrix shape:", sparse_matrix.shape)

FAISS index dimension: 59047
Number of users indexed: 162541
Sparse matrix shape: (162541, 59047)


### Saving the Index
The file is very large - around 37GB.

In [57]:
#faiss.write_index(index, "faiss_index.index")

In [58]:
# Loading the index
# faiss_index = faiss.read_index("faiss_index.index")

### Saving Sparse Matrix (CSR format)

In [59]:
from scipy.sparse import save_npz, load_npz

save_npz("sparse_matrix.npz", sparse_matrix)

In [60]:
# Loading Matrix
#from scipy.sparse import load_npz

#sparse_matrix = load_npz("sparse_matrix.npz")

### Saving Mapping Dictionaries (user_map, movie_map, reverse_movie_map)

In [61]:
import pickle

with open("user_movie_maps.pkl", "wb") as f:
    pickle.dump((user_map, movie_map, reverse_movie_map), f)

In [62]:
# Loading Mapping Dictionaries

#with open("user_movie_maps.pkl", "rb") as f:
#    user_map, movie_map, reverse_movie_map = pickle.load(f)