# User-based Collaborative Filtering

Core idea:
"If Alice and Bob have rated movies similarly in the past, then we can recommend to Alice the movies that Bob liked (but Alice hasn’t seen yet)."

How it works:

Find users similar to the target user (e.g., Alice).

Look at what those similar users liked.

Recommend those movies to Alice.

Example:
If Alice and Bob both liked Inception and The Matrix, and Bob also liked Interstellar, then Interstellar might be recommended to Alice.

In [3]:
import pandas as pd
import os
base_path = r'C:\Users\Sara\Documents\python proj'

User-based Collaborative Filtering

In [4]:
# Load ratings and movie info
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Merge to get movie titles (optional, for display)
ratings_merged = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

# Drop missing values (just in case)
ratings_merged.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)



In [12]:
# Check the structure
ratings_merged.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1147878820,Underground (1995)
4,1,899,3.5,1147868510,Singin' in the Rain (1952)


In [10]:
len(ratings_merged)

25000095

Option 1: Filter to “Active” Users and/or “Popular” Movies

We do this, because the full dataset is too computationally expensive for personal laptops.

In [11]:
# Keep users with at least 500 ratings
user_counts = ratings_merged['userId'].value_counts()
active_users = user_counts[user_counts >= 500].index

# Keep movies with at least 1000 ratings
movie_counts = ratings_merged['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 1000].index

# Filter the DataFrame
filtered = ratings_merged[
    ratings_merged['userId'].isin(active_users) &
    ratings_merged['movieId'].isin(popular_movies)
]

In [12]:
len(filtered)

7127698

In [13]:
# Assuming 'filtered' is your cleaned dataset with userId, movieId, rating, and title
user_item_matrix = filtered.pivot_table(index='userId', columns='movieId', values='rating')

# Mean-center and fill NaNs with 0 (for cosine similarity)
user_item_centered = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0).fillna(0)

# Save for later use in prediction
user_ids = user_item_centered.index.tolist()
movie_ids = user_item_centered.columns.tolist()


In [14]:
user_item_centered 

movieId,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.302050,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
12,0.663313,-1.336687,-1.336687,0.000000,0.000000,0.000000,-0.336687,0.0,0.0,-0.336687,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
72,0.000000,-1.371912,0.000000,0.000000,0.000000,0.628088,0.628088,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
80,0.000000,-1.561441,0.000000,0.000000,0.000000,2.438559,0.000000,0.0,0.0,2.438559,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
120,0.748718,0.000000,0.000000,0.000000,0.000000,0.000000,-0.251282,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,0.406332,-0.093668,0.000000,-0.093668,-1.093668,0.000000,0.000000,0.0,0.0,-0.093668,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162495,0.091423,0.091423,0.591423,0.000000,-0.908577,0.000000,0.000000,0.0,0.0,1.091423,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162508,1.294654,0.000000,0.000000,0.000000,0.000000,-0.205346,0.000000,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
162516,1.220662,-0.779338,-2.779338,-1.279338,0.000000,1.220662,-1.279338,0.0,0.0,0.220662,...,0.220662,0.0,0.0,0.0,0.0,0.0,0.720662,0.220662,0.220662,0.0


In [15]:
from annoy import AnnoyIndex

# Number of movies (features per user)
f = len(movie_ids)
annoy_index = AnnoyIndex(f, metric='angular')  # angular ≈ cosine

user_id_map = {}               # userId -> index in annoy
reverse_user_id_map = {}       # index in annoy -> userId

for i, user_id in enumerate(user_ids):
    vector = user_item_centered.loc[user_id].values
    annoy_index.add_item(i, vector)
    user_id_map[user_id] = i
    reverse_user_id_map[i] = user_id

# Build with 10 trees (balance speed/accuracy)
annoy_index.build(n_trees=10) # will deliver randomized results. Optionally we can save the index

True

In [16]:
def predict_rating_annoy(user_id, movie_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, k=30, min_raters=3):
    if user_id not in user_id_map or movie_id not in user_item_matrix.columns:
        return None

    u_idx = user_id_map[user_id]
    neighbors = annoy_index.get_nns_by_item(u_idx, k, include_distances=False)
    neighbor_ids = [reverse_user_id_map[i] for i in neighbors if reverse_user_id_map[i] != user_id]

    ratings = []
    for neighbor_id in neighbor_ids:
        rating = user_item_matrix.loc[neighbor_id, movie_id]
        if not np.isnan(rating):
            ratings.append(rating)

    if len(ratings) < min_raters:
        return user_item_matrix.loc[user_id].mean()  # fallback

    return np.mean(ratings)

In [17]:
def recommend_top_n_annoy(user_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, movies_df, n=5, k_neighbors=30):
    if user_id not in user_item_matrix.index:
        return []

    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id].isna()]
    predictions = []

    for movie_id in unrated_movies:
        pred = predict_rating_annoy(user_id, movie_id, user_item_matrix, annoy_index, user_id_map, reverse_user_id_map, k=k_neighbors)
        if pred is not None:
            predictions.append((movie_id, pred))

    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]
    return [(mid, movies_df.loc[movies_df.movieId == mid, 'title'].values[0], score) for mid, score in top_n]

In [18]:
import numpy as np

example_user = user_item_matrix.index[0]
example_user1 = user_item_matrix.index[1]
example_user2 = user_item_matrix.index[2]

top_recs = recommend_top_n_annoy(
    user_id=example_user,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)

top_recs1 = recommend_top_n_annoy(
    user_id=example_user1,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)
top_recs2 = recommend_top_n_annoy(
    user_id=example_user2,
    user_item_matrix=user_item_matrix,
    annoy_index=annoy_index,
    user_id_map=user_id_map,
    reverse_user_id_map=reverse_user_id_map,
    movies_df=filtered,
    n=5
)

# Print recommendations
print(f"\nTop 5 Recommendations for User {example_user}:")
for movie_id, title, score in top_recs:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

print(f"\nTop 5 Recommendations for User {example_user1}:")
for movie_id, title, score in top_recs1:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

print(f"\nTop 5 Recommendations for User {example_user2}:")
for movie_id, title, score in top_recs2:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")


Top 5 Recommendations for User 3:
In the Mood For Love (Fa yeung nin wa) (2000) (Movie ID: 4144) — Predicted Rating: 4.83
Persona (1966) (Movie ID: 7327) — Predicted Rating: 4.75
On the Waterfront (1954) (Movie ID: 1945) — Predicted Rating: 4.67
Fanny and Alexander (Fanny och Alexander) (1982) (Movie ID: 2068) — Predicted Rating: 4.62
12 Angry Men (1957) (Movie ID: 1203) — Predicted Rating: 4.53

Top 5 Recommendations for User 12:
400 Blows, The (Les quatre cents coups) (1959) (Movie ID: 2731) — Predicted Rating: 4.71
Seven Samurai (Shichinin no samurai) (1954) (Movie ID: 2019) — Predicted Rating: 4.68
Breaking the Waves (1996) (Movie ID: 1354) — Predicted Rating: 4.67
Stranger Than Paradise (1984) (Movie ID: 3925) — Predicted Rating: 4.67
Hearts of Darkness: A Filmmakers Apocalypse (1991) (Movie ID: 26729) — Predicted Rating: 4.67

Top 5 Recommendations for User 72:
Touch of Evil (1958) (Movie ID: 1248) — Predicted Rating: 4.67
Grand Illusion (La grande illusion) (1937) (Movie ID: 31

Evaluation

In [19]:
from sklearn.model_selection import train_test_split

# Split the original filtered ratings
train_df, test_df = train_test_split(filtered, test_size=0.2, random_state=42)

In [20]:
from sklearn.model_selection import train_test_split

# Split your filtered data
train_df, test_df = train_test_split(filtered, test_size=0.2, random_state=42)

# Only keep users from the Annoy index
active_user_ids = user_item_matrix.index
test_sample = test_df[test_df['userId'].isin(active_user_ids)].sample(n=10, random_state=42)


In [21]:
test_sample

Unnamed: 0,userId,movieId,rating,timestamp,title
2797770,18551,54286,5.0,1445901557,"Bourne Ultimatum, The (2007)"
21679295,140913,4310,2.5,1112679231,Pearl Harbor (2001)
10683697,69398,48780,4.0,1298477439,"Prestige, The (2006)"
23209387,150681,2701,4.0,1115165140,Wild Wild West (1999)
20355220,132358,180031,3.0,1535534994,The Shape of Water (2017)
11573153,75054,3101,5.0,997377038,Fatal Attraction (1987)
16642645,107945,3730,4.5,1548871485,"Conversation, The (1974)"
19719637,128103,47,4.5,1156923416,Seven (a.k.a. Se7en) (1995)
13356390,86427,368,5.0,966640776,Maverick (1994)
19810534,128740,8641,3.5,1230161589,Anchorman: The Legend of Ron Burgundy (2004)


In [22]:
from sklearn.metrics import root_mean_squared_error

true_ratings = []
predicted_ratings = []

for _, row in test_sample.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    true_rating = row['rating']
    
    pred = predict_rating_annoy(
        user_id, movie_id,
        user_item_matrix=user_item_matrix,
        annoy_index=annoy_index,
        user_id_map=user_id_map,
        reverse_user_id_map=reverse_user_id_map,
        k=30
    )
    
    if pred is not None:
        true_ratings.append(true_rating)
        predicted_ratings.append(pred)

# Final RMSE
rmse = root_mean_squared_error(true_ratings, predicted_ratings)
print(f"✅ RMSE on 100 samples: {rmse:.4f}")

✅ RMSE on 100 samples: 1.0779


Option 2: Use a Sparse Matrix Instead of Full pivot_table

In [23]:
from scipy.sparse import csr_matrix

# Reindex for compatibility
user_mapper = {uid: i for i, uid in enumerate(ratings_merged['userId'].unique())}
movie_mapper = {mid: i for i, mid in enumerate(ratings_merged['movieId'].unique())}

ratings_merged['user_index'] = ratings_merged['userId'].map(user_mapper)
ratings_merged['movie_index'] = ratings_merged['movieId'].map(movie_mapper)

# Create sparse matrix
rows = ratings_merged['user_index']
cols = ratings_merged['movie_index']
data = ratings_merged['rating']

sparse_user_item = csr_matrix((data, (rows, cols)))

SCIKIT-SURPRISE

In [24]:
# Sample 2,000 unique users from the filtered dataset
sampled_users = filtered['userId'].drop_duplicates().sample(n=2000, random_state=42)
sampled = filtered[filtered['userId'].isin(sampled_users)]


In [25]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# Create a Surprise dataset from the sample
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(sampled[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


ModuleNotFoundError: No module named 'surprise'

In [None]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2a70bb6b770>

In [None]:
predictions = algo.test(testset)
rmse(predictions)

RMSE: 0.9187


0.9186541593271833

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid in top_n:
        top_n[uid] = sorted(top_n[uid], key=lambda x: x[1], reverse=True)[:n]
    return top_n

top_n = get_top_n(predictions, n=5)

In [None]:
user_id = list(top_n.keys())[0]

print(f"\nTop 5 Recommendations for User {user_id}:")
for movie_id, score in top_n[user_id]:
    title = sampled.loc[sampled.movieId == movie_id, 'title'].drop_duplicates().values[0]
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")


Top 5 Recommendations for User 12319:
Godfather: Part II, The (1974) (Movie ID: 1221) — Predicted Rating: 4.58
Matrix, The (1999) (Movie ID: 2571) — Predicted Rating: 4.57
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) (Movie ID: 4973) — Predicted Rating: 4.55
Seven (a.k.a. Se7en) (1995) (Movie ID: 47) — Predicted Rating: 4.43
Saving Private Ryan (1998) (Movie ID: 2028) — Predicted Rating: 4.39


In [None]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from collections import defaultdict

# 2. Load your filtered DataFrame
# Assumes 'filtered' contains: userId, movieId, rating, title
ratings = filtered[['userId', 'movieId', 'rating']].copy()

# 3. Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings, reader)

# 4. Split into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 5. Build the User-Based CF model
sim_options = {
    'name': 'cosine',
    'user_based': True  # Set to False for item-based
}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

# 6. Evaluate on test set
predictions = algo.test(testset)
rmse(predictions)

In [None]:
import numpy
print(numpy.__version__)

1.26.4


Faiss

In [26]:
# Load ratings and movie info
movies = pd.read_csv(os.path.join(base_path, 'movies.csv'))
ratings = pd.read_csv(os.path.join(base_path, 'ratings.csv'))

# Merge to get movie titles (optional, for display)
ratings_merged = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

# Drop missing values (just in case)
ratings_merged.dropna(subset=['userId', 'movieId', 'rating'], inplace=True)

# Keep users with at least 500 ratings
user_counts = ratings_merged['userId'].value_counts()
active_users = user_counts[user_counts >= 500].index

# Filter the DataFrame to keep only active users
filtered = ratings_merged[ratings_merged['userId'].isin(active_users)]

# Keep movies with at least 500 ratings
movie_counts = ratings_merged['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 200].index

# Filter the dataset to include only the popular movies
filtered = filtered[filtered['movieId'].isin(popular_movies)]

# Check the filtered dataset size
print(f"Filtered dataset shape: {filtered.shape}")


Filtered dataset shape: (8305815, 5)


In [27]:
filtered["movieId"].nunique(), filtered["userId"].nunique()

(7984, 9713)

In [29]:
import faiss
from sklearn.preprocessing import normalize

# Create user-item matrix
user_item_matrix = filtered.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
user_vectors = normalize(user_item_matrix.values.astype('float32'))

# Build FAISS index (cosine similarity ≈ dot product on normalized vectors)
index = faiss.IndexFlatIP(user_vectors.shape[1])  # IP = Inner Product
index.add(user_vectors)

In [30]:
index

<faiss.swigfaiss_avx2.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x0000013C1CF99050> >

In [31]:
target_user_id = 3
target_idx = user_item_matrix.index.get_loc(target_user_id)
D, I = index.search(np.expand_dims(user_vectors[target_idx], axis=0), k=5)

# Output: indices of similar users + similarity scores
similar_users = user_item_matrix.index[I[0]]
print(similar_users)

Index([3, 124561, 149968, 159639, 91040], dtype='int64', name='userId')


In [32]:
def normalize_user_item_matrix(user_item_matrix):
    # Normalize the user-item matrix by subtracting each user's average rating
    # Calculate the mean of each user's ratings (ignoring NaN values)
    user_means = user_item_matrix.mean(axis=1)
    
    # Subtract the user's mean rating from each of their ratings
    user_item_matrix_normalized = user_item_matrix.sub(user_means, axis=0)
    
    return user_item_matrix_normalized

In [33]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,201588,201646,201749,201773,202103,202429,202439,203222,204698,205383
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,2.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,3.5,3.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162495,3.0,3.0,3.5,0.0,2.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162508,4.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
def recommend_top_n_faiss(user_id, user_item_matrix, index, user_vectors, movies_df, n=5, k=5):
    """
    Recommend top-N movies for a given user using FAISS-based nearest neighbors.

    Parameters:
    - user_id: ID of the target user.
    - user_item_matrix: DataFrame containing user-item ratings.
    - index: FAISS index built from user vectors.
    - user_vectors: Numpy array of user vectors.
    - movies_df: DataFrame containing movie information with 'movieId' and 'title' columns.
    - n: Number of top recommendations to return.
    - k: Number of nearest neighbors to consider.

    Returns:
    - List of tuples: (movieId, title, predicted_rating)
    """
    if user_id not in user_item_matrix.index:
        return []

    # Normalize the user-item matrix by subtracting the user's average rating
    user_item_matrix_normalized = normalize_user_item_matrix(user_item_matrix)

    # Get the index of the target user
    target_idx = user_item_matrix.index.get_loc(user_id)

    # Search for k+1 nearest neighbors (including the user themselves)
    D, I = index.search(np.expand_dims(user_vectors[target_idx], axis=0), k=k+1)

    # Retrieve neighbor indices and distances
    neighbor_indices = I[0]
    distances = D[0]

    # Map indices to user IDs
    neighbor_ids = user_item_matrix.index[neighbor_indices]

    # Exclude the target user from neighbors
    mask = neighbor_ids != user_id
    neighbor_ids = neighbor_ids[mask]
    distances = distances[mask]

    # Identify movies not yet rated by the target user (those rated as 0)
    unrated_movies = user_item_matrix.columns[user_item_matrix.loc[user_id] == 0]
    
    if unrated_movies.empty:
        print(f"User {user_id} has already rated all movies. No unrated movies left to recommend.")
        return []  # Return empty list if no unrated movies are left
    
    predictions = []

    for movie_id in unrated_movies:
        # Retrieve ratings from neighbors for the current movie (from the normalized matrix)
        neighbor_ratings = user_item_matrix_normalized.loc[neighbor_ids, movie_id]

        # Drop ratings that are NaN (neighbors who haven't rated the movie)
        valid_ratings = neighbor_ratings.dropna()

        if valid_ratings.empty:
            continue  # Skip if no neighbor has rated the movie

        # Compute similarities (inverse of distances)
        similarities = 1 / (distances[:len(valid_ratings)] + 1e-10)  # Add epsilon to avoid division by zero
        
        # Normalize similarities to ensure they are not too skewed
        if np.sum(similarities) > 0:
            similarities = similarities / np.sum(similarities)

        # Compute weighted average of neighbor ratings
        weighted_sum = np.dot(valid_ratings.values, similarities)
        sum_of_weights = np.sum(similarities)

        # Calculate predicted rating in the normalized scale
        predicted_rating_normalized = weighted_sum / sum_of_weights

        # Get the user's average rating to "de-normalize" the prediction
        user_avg_rating = user_item_matrix.loc[user_id].mean()

        # Add the user's average back to the predicted rating
        predicted_rating = predicted_rating_normalized + user_avg_rating
        
        # Clamp predictions between 1 and 5
        predicted_rating = min(5, max(1, predicted_rating))

        predictions.append((movie_id, predicted_rating))

    if not predictions:
        print("No predictions made.")
        return []

    # Sort predictions by predicted rating in descending order
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:n]

    # Retrieve movie titles
    recommendations = []
    for movie_id, score in top_n:
        title = movies_df.loc[movies_df['movieId'] == movie_id, 'title'].values
        title = title[0] if len(title) > 0 else "Unknown Title"
        recommendations.append((movie_id, title, score))

    return recommendations


In [35]:
user_id = 12  # Replace with the target user's ID
recommendations = recommend_top_n_faiss(user_id, user_item_matrix, index, user_vectors, movies, n=5, k=5)

print(f"Top 5 Recommendations for User {user_id}:")
for movie_id, title, score in recommendations:
    print(f"{title} (Movie ID: {movie_id}) — Predicted Rating: {score:.2f}")

Top 5 Recommendations for User 12:
Graduate, The (1967) (Movie ID: 1247) — Predicted Rating: 4.22
Chinatown (1974) (Movie ID: 1252) — Predicted Rating: 4.01
Do the Right Thing (1989) (Movie ID: 3424) — Predicted Rating: 3.98
Sense and Sensibility (1995) (Movie ID: 17) — Predicted Rating: 3.90
Taxi Driver (1976) (Movie ID: 111) — Predicted Rating: 3.88


In [36]:
# Get the movies that the user has rated
rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] != 0]  # Non-zero ratings

print(f"Movies rated by User {user_id}:")
for movie_id, rating in rated_movies.items():
    title = movies.loc[movies['movieId'] == movie_id, 'title'].values
    title = title[0] if len(title) > 0 else "Unknown Title"
    print(f"{title} (Movie ID: {movie_id}) — Rating: {rating:.2f}")

Movies rated by User 12:
Toy Story (1995) (Movie ID: 1) — Rating: 4.00
Jumanji (1995) (Movie ID: 2) — Rating: 2.00
Grumpier Old Men (1995) (Movie ID: 3) — Rating: 2.00
Sabrina (1995) (Movie ID: 7) — Rating: 3.00
GoldenEye (1995) (Movie ID: 10) — Rating: 3.00
Casino (1995) (Movie ID: 16) — Rating: 5.00
Four Rooms (1995) (Movie ID: 18) — Rating: 4.00
Money Train (1995) (Movie ID: 20) — Rating: 1.00
Get Shorty (1995) (Movie ID: 21) — Rating: 2.00
Copycat (1995) (Movie ID: 22) — Rating: 3.00
Leaving Las Vegas (1995) (Movie ID: 25) — Rating: 3.00
Now and Then (1995) (Movie ID: 27) — Rating: 2.00
City of Lost Children, The (Cité des enfants perdus, La) (1995) (Movie ID: 29) — Rating: 4.00
Dangerous Minds (1995) (Movie ID: 31) — Rating: 2.00
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (Movie ID: 32) — Rating: 2.00
Dead Man Walking (1995) (Movie ID: 36) — Rating: 4.00
Clueless (1995) (Movie ID: 39) — Rating: 4.00
To Die For (1995) (Movie ID: 45) — Rating: 2.00
Seven (a.k.a. Se7en) (1995) (Movie 

Full dense Matrix Testing

In [37]:
from scipy.sparse import csr_matrix

user_ids = ratings_merged['userId'].unique()
movie_ids = ratings_merged['movieId'].unique()

user_map = {uid: i for i, uid in enumerate(user_ids)}
movie_map = {mid: i for i, mid in enumerate(movie_ids)}
reverse_user_map = {i: uid for uid, i in user_map.items()}
reverse_movie_map = {i: mid for mid, i in movie_map.items()}

row = ratings_merged['userId'].map(user_map)
col = ratings_merged['movieId'].map(movie_map)
data = ratings_merged['rating']

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_ids), len(movie_ids)))

In [64]:
122 in movie_map


True

In [38]:
from sklearn.preprocessing import normalize
import faiss
import numpy as np

# Define batch size (e.g. 10,000 users at a time)
batch_size = 10000
index = faiss.IndexFlatIP(sparse_matrix.shape[1])  # FAISS expects dense matrix

user_indices = np.arange(sparse_matrix.shape[0])

for i in range(0, len(user_indices), batch_size):
    batch_rows = user_indices[i:i+batch_size]
    dense_batch = sparse_matrix[batch_rows].toarray().astype('float32')
    dense_batch = normalize(dense_batch)
    index.add(dense_batch)

In [39]:
target_idx = user_map[3]  # e.g., userId = 3
query_vector = normalize(sparse_matrix[target_idx].toarray().astype('float32'))

D, I = index.search(query_vector, k=5)

similar_user_ids = [reverse_user_map[i] for i in I[0]]
print(similar_user_ids)

[np.int64(3), np.int64(124561), np.int64(149968), np.int64(51328), np.int64(159639)]


In [65]:
def predict_rating_faiss(user_id, movie_id, sparse_matrix, faiss_index, user_map, movie_map, k=5):
    if user_id not in user_map or movie_id not in movie_map:
        return ("no user/movie")

    user_idx = user_map[user_id]
    movie_idx = movie_map[movie_id]

    user_vector = sparse_matrix[user_idx].toarray().astype('float32')
    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]  # exclude self

    ratings = []
    for neighbor in neighbor_indices:
        neighbor_rating = sparse_matrix[neighbor, movie_idx]
        if neighbor_rating != 0:
            ratings.append(neighbor_rating)

    if not ratings:
        global_avg = sparse_matrix.data.mean()
    return float(global_avg)

    return float(np.mean(ratings))

In [None]:
def recommend_top_n_faiss(user_id, sparse_matrix, faiss_index, user_map, movie_map, reverse_movie_map, movies_df, n=5, k=10):
    if user_id not in user_map:
        return []

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx].toarray().astype('float32')

    if np.count_nonzero(user_vector) == 0:
        return []  # cold-start user

    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    unrated_movie_indices = np.where(user_vector[0] == 0)[0]
    predictions = []

    for movie_idx in unrated_movie_indices:
        weighted_ratings = []
        weights = []

        for sim, neighbor in zip(similarities, neighbor_indices):
            rating = sparse_matrix[neighbor, movie_idx]
            if rating != 0:
                weighted_ratings.append(rating * sim)
                weights.append(sim)

        if weights and np.sum(weights) > 0:
            pred = np.sum(weighted_ratings) / (np.sum(weights) + 1e-10)
            movie_id = reverse_movie_map[movie_idx]
            title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
            title = title_row.values[0] if not title_row.empty else "Unknown"
            predictions.append((movie_id, title, pred))

    top_n = sorted(predictions, key=lambda x: x[2], reverse=True)[:n]
    return top_n


In [66]:
# Predict rating
movies_df = ratings_merged[['movieId', 'title']].drop_duplicates()
rating = predict_rating_faiss(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map
)
print(f"Predicted rating: {rating}")

# Top-N recommendations
top_recs = recommend_top_n_faiss(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    n=5
)

for mid, title, score in top_recs:
    print(f"{title} (Movie ID: {mid}) — Predicted Rating: {score:.2f}")

Predicted rating: 3.533854451353085
Talk to Her (Hable con Ella) (2002) (Movie ID: 5878) — Predicted Rating: 5.00
Planet Earth II (2016) (Movie ID: 171011) — Predicted Rating: 5.00
Cinderella (1950) (Movie ID: 1022) — Predicted Rating: 5.00
Peter Pan (1953) (Movie ID: 2087) — Predicted Rating: 5.00
My Cousin Vinny (1992) (Movie ID: 2302) — Predicted Rating: 5.00


In [46]:
rating = predict_rating_faiss(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map
)
print(f"Predicted rating: {rating}")

Predicted rating: None


In [73]:
def predict_rating_faiss_hybrid(user_id, movie_id, sparse_matrix, faiss_index, user_map, movie_map, k=20, min_overlap=3):
    if user_id not in user_map or movie_id not in movie_map:
        return "no user/movie"

    user_idx = user_map[user_id]
    movie_idx = movie_map[movie_id]

    user_vector = sparse_matrix[user_idx].toarray().astype('float32')
    if np.count_nonzero(user_vector) == 0:
        return float(sparse_matrix.data.mean())  # cold-start user → global avg

    norm_vector = user_vector / (np.linalg.norm(user_vector) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    ratings = []
    weights = []

    for sim, neighbor_idx in zip(similarities, neighbor_indices):
        neighbor_vector = sparse_matrix[neighbor_idx].toarray().astype('float32')
        both_rated = (user_vector != 0) & (neighbor_vector != 0)
        overlap = np.sum(both_rated)

        rating = sparse_matrix[neighbor_idx, movie_idx]
        if rating != 0 and overlap >= min_overlap:
            adjusted_weight = sim * (overlap / (np.count_nonzero(user_vector) + 1e-10))
            ratings.append(rating * adjusted_weight)
            weights.append(adjusted_weight)

    if ratings and np.sum(weights) > 0:
        pred = np.sum(ratings) / np.sum(weights)
        return float(np.clip(pred, 0.5, 5.0))

    # 🔁 Fallbacks
    movie_ratings = sparse_matrix[:, movie_idx].data
    if len(movie_ratings) > 0:
        return float(np.clip(np.mean(movie_ratings), 0.5, 5.0))

    user_ratings = sparse_matrix[user_idx].data
    if len(user_ratings) > 0:
        return float(np.clip(np.mean(user_ratings), 0.5, 5.0))

    return float(np.clip(sparse_matrix.data.mean(), 0.5, 5.0))  # Final fallback



In [74]:
pred = predict_rating_faiss_hybrid(
    user_id=3,
    movie_id=122,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    k=20
)
print(f"Predicted rating: {pred}")

Predicted rating: 2.8593924191750277


In [None]:
def recommend_top_n_faiss_hybrid_fast(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3
):
    if user_id not in user_map:
        return []

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return []  # cold-start user

    # Normalize dense vector for FAISS
    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    # Cache neighbor vectors
    neighbor_vectors = {
        idx: sparse_matrix[idx].toarray().astype('float32')
        for idx in neighbor_indices
    }

    unrated_indices = np.where(user_dense[0] == 0)[0]
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    predictions = []

    for movie_idx in unrated_indices:
        weighted_scores = []
        weights = []
        true_ratings = []

        for sim, neighbor_idx in zip(similarities, neighbor_indices):
            neighbor_vec = neighbor_vectors[neighbor_idx]
            neighbor_rating = neighbor_vec[0, movie_idx]

            if neighbor_rating == 0:
                continue

            overlap = np.sdef recommend_top_n_faiss_hybrid_fast(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3
):
    if user_id not in user_map:
        return []

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return []  # cold-start user

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    D, I = faiss_index.search(norm_vector, k=k+1)
    neighbor_indices = I[0][1:]
    similarities = D[0][1:]

    neighbor_vectors = {
        idx: sparse_matrix[idx].toarray().astype('float32')
        for idx in neighbor_indices
    }

    unrated_indices = np.where(user_dense[0] == 0)[0]
    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    predictions = []

    for movie_idx in unrated_indices:
        weighted_scores = []
        weights = []
        true_ratings = []

        for sim, neighbor_idx in zip(similarities, neighbor_indices):
            neighbor_vec = neighbor_vectors[neighbor_idx]
            neighbor_rating = neighbor_vec[0, movie_idx]

            if neighbor_rating == 0:
                continue

            overlap = np.sum((user_rated_mask) & (neighbor_vec[0] != 0))
            if overlap < min_overlap:
                continue

            weight = sim * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(neighbor_rating * weight)
            weights.append(weight)
            true_ratings.append(neighbor_rating)

        if weighted_scores:
            pred = np.sum(weighted_scores) / np.sum(weights)
            pred = float(np.clip(pred, 0.5, 5.0))

            movie_id = reverse_movie_map[movie_idx]
            title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
            title = title_row.values[0] if not title_row.empty else "Unknown"

            predictions.append((movie_id, title, pred, true_ratings))

    # Sort by predicted rating
    top_n = sorted(predictions, key=lambda x: x[2], reverse=True)[:n]

    # 📢 Print results with extra info
    for movie_id, title, score, ratings_used in top_n:
        print(f"{title} (Movie ID: {movie_id})")
        print(f"  → Predicted Rating: {score:.2f}")
        print(f"  → Neighbors Used: {len(ratings_used)}")
        print(f"  → Ratings Used: {[float(r) for r in ratings_used]}")
        print("-" * 60)

    return [(movie_id, title, score) for movie_id, title, score, _ in top_n]
um((user_rated_mask) & (neighbor_vec[0] != 0))
            if overlap < min_overlap:
                continue

            weight = sim * (overlap / (user_rated_count + 1e-10))
            weighted_scores.append(neighbor_rating * weight)
            weights.append(weight)
            true_ratings.append(neighbor_rating)

        if weighted_scores:
            pred = np.sum(weighted_scores) / (np.sum(weights) + 1e-10)
        else:
            # Fallback to movie mean or global mean
            movie_ratings = sparse_matrix[:, movie_idx].data
            pred = movie_ratings.mean() if len(movie_ratings) > 0 else sparse_matrix.data.mean()

        pred = float(np.clip(pred, 0.5, 5.0))
        movie_id = reverse_movie_map[movie_idx]
        title_row = movies_df.loc[movies_df['movieId'] == movie_id, 'title']
        title = title_row.values[0] if not title_row.empty else "Unknown"

        # 🔍 Debug output for transparency
        print(f"[Debug] Movie: {movie_id}, Title: {title}, Ratings used: {true_ratings}, Weights: {weights}")
        print(f"[→] Final Predicted: {pred:.2f} using {len(true_ratings)} neighbors")

        predictions.append((movie_id, title, pred))

    top_n = sorted(predictions, key=lambda x: x[2], reverse=True)[:n]
    return top_n

In [85]:
top_recs = recommend_top_n_faiss_hybrid_fast(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    n=5
)

for mid, title, score in top_recs:
    print(f"Movie: {movie_id}, Ratings: {ratings}, Weights: {weights}")


[Debug] Movie: 899, Title: Singin' in the Rain (1952), Ratings: [np.float64(1.4716454504464394), np.float64(0.9365835264323714)], Weights: [np.float64(0.42047012869898265), np.float64(0.4682917632161857)]
[Debug] Movie: 1088, Title: Dirty Dancing (1987), Ratings: [np.float64(1.63902117125665)], Weights: [np.float64(0.4682917632161857)]
[Debug] Movie: 1175, Title: Delicatessen (1991), Ratings: [np.float64(0.8527300343279586)], Weights: [np.float64(0.3410920137311834)]
[Debug] Movie: 2011, Title: Back to the Future Part II (1989), Ratings: [np.float64(1.129838948751196), np.float64(2.2586808553552675), np.float64(1.2967669847532945), np.float64(1.6818805147959306), np.float64(1.8731670528647428), np.float64(1.0232760411935502), np.float64(0.8490376801387725), np.float64(0.9794348158485966), np.float64(1.2359484812106598), np.float64(1.115342996105745), np.float64(1.303499564846279), np.float64(1.3313880935309288)], Weights: [np.float64(0.3766129829170653), np.float64(0.45173617107105346)

NameError: name 'weights' is not defined

In [82]:
user_3_actual = ratings_merged[ratings_merged['userId'] == 3][['movieId', 'title', 'rating']]
user_3_actual = user_3_actual.sort_values(by='rating', ascending=False).reset_index(drop=True)

print(user_3_actual)

     movieId                                       title  rating
0        296                         Pulp Fiction (1994)     5.0
1        741  Ghost in the Shell (Kôkaku kidôtai) (1995)     5.0
2       1217                                  Ran (1985)     5.0
3       1252                            Chinatown (1974)     5.0
4       1221              Godfather: Part II, The (1974)     5.0
..       ...                                         ...     ...
651    79224                      Karate Kid, The (2010)     2.0
652     7325                      Starsky & Hutch (2004)     2.0
653     3646                    Big Momma's House (2000)     2.0
654     3825                          Coyote Ugly (2000)     2.0
655      480                        Jurassic Park (1993)     2.0

[656 rows x 3 columns]


In [89]:
def grid_search_k_for_user(user_id, sparse_matrix, faiss_index, user_map, movie_map, reverse_movie_map, movies_df, k_values=[5, 10, 20, 30], min_overlap=3, n=5):
    results = {}

    for k in k_values:
        print(f"\nRunning recommend_top_n_faiss_hybrid_fast with k={k}...")
        top_recs = recommend_top_n_faiss_hybrid_fast(
            user_id=user_id,
            sparse_matrix=sparse_matrix,
            faiss_index=faiss_index,
            user_map=user_map,
            movie_map=movie_map,
            reverse_movie_map=reverse_movie_map,
            movies_df=movies_df,
            n=n,
            k=k,
            min_overlap=min_overlap
        )
        results[k] = top_recs

        print(f"\nTop {n} for k={k}:")
        for mid, title, score in top_recs:
            print(f"{title} (Movie ID: {mid}) — Predicted Rating: {score:.2f}")
    return results


In [90]:
k_results = grid_search_k_for_user(
    user_id=3,
    sparse_matrix=sparse_matrix,
    faiss_index=index,
    user_map=user_map,
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    k_values=[5, 10, 20, 30],
    min_overlap=2,
    n=5
)



Running recommend_top_n_faiss_hybrid_fast with k=5...
[Debug] Movie: 306, Title: Three Colors: Red (Trois couleurs: Rouge) (1994), Ratings used: [], Weights: []
[→] Final Predicted: 4.07 using 0 neighbors
[Debug] Movie: 307, Title: Three Colors: Blue (Trois couleurs: Bleu) (1993), Ratings used: [], Weights: []
[→] Final Predicted: 3.98 using 0 neighbors
[Debug] Movie: 665, Title: Underground (1995), Ratings used: [], Weights: []
[→] Final Predicted: 3.95 using 0 neighbors
[Debug] Movie: 899, Title: Singin' in the Rain (1952), Ratings used: [np.float32(3.5), np.float32(2.0)], Weights: [np.float64(0.42047012869898265), np.float64(0.4682917632161857)]
[→] Final Predicted: 2.71 using 2 neighbors
[Debug] Movie: 1088, Title: Dirty Dancing (1987), Ratings used: [np.float32(3.5)], Weights: [np.float64(0.4682917632161857)]
[→] Final Predicted: 3.50 using 1 neighbors
[Debug] Movie: 1175, Title: Delicatessen (1991), Ratings used: [], Weights: []
[→] Final Predicted: 3.96 using 0 neighbors
[Debug

KeyboardInterrupt: 