# User-based Collaborative Filtering using Saved FAISS

Predicting recommendations for a new user

In [1]:
import faiss
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, csr_matrix, vstack
from sklearn.preprocessing import normalize
import pickle
import os
# NOTE(review): machine-specific absolute path — anyone else running this
# notebook has to edit it. In the cells shown here only movies.csv is read
# from this folder; the .npz/.pkl artifacts are opened by bare file name.
base_path = r'C:\Users\Sara\Documents\python proj'

In [2]:
# Load sparse matrix
# (bare file name: resolved against the current working directory, not base_path)
sparse_matrix = load_npz("sparse_matrix.npz")

# Load user/movie mappings
# SECURITY: pickle.load can execute arbitrary code while deserializing — only
# open a .pkl file that you produced yourself or otherwise trust.
# The pickle must hold exactly three objects, unpacked in this order. As used
# further down: user_map is user id -> matrix row, movie_map is
# movie id -> matrix column, reverse_movie_map is matrix column -> movie id.
with open("user_movie_maps.pkl", "rb") as f:
    user_map, movie_map, reverse_movie_map = pickle.load(f)

# Load movie metadata
# (the recommender reads the 'movieId', 'title' and 'genres' columns)
movies_df = pd.read_csv(os.path.join(base_path, 'movies.csv'))  # or the path you use

In [3]:
def build_faiss_index_batched(sparse_matrix, batch_size=10000):
    """Build a flat inner-product FAISS index from the rows of a sparse matrix.

    Every row is L2-normalized before it is added, so the inner product the
    index computes between two stored rows is their cosine similarity. Rows
    are added in matrix order; the rest of the notebook relies on the position
    at which a row was added being equal to its row index in the matrix.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix that supports row slicing (e.g. CSR)
        In this notebook: one row per user, one column per movie.
    batch_size : int, default 10000
        Number of rows densified at a time. It bounds the temporary dense
        array to ``batch_size x n_columns`` float32 values.

    Returns
    -------
    faiss.IndexFlatIP
        Index of dimension ``sparse_matrix.shape[1]`` with one vector per row.
    """
    num_rows, num_cols = sparse_matrix.shape
    index = faiss.IndexFlatIP(num_cols)  # inner product for cosine sim

    for start in range(0, num_rows, batch_size):
        # A contiguous slice selects the same rows as an explicit index array
        # without going through the fancy-indexing path.
        batch = sparse_matrix[start:start + batch_size]
        # Cast the stored entries before densifying so that no dense
        # intermediate wider than float32 is ever allocated.
        dense_batch = batch.astype('float32').toarray()
        # The dense batch is a private temporary: normalize it in place
        # instead of allocating a second dense copy.
        dense_batch = normalize(dense_batch, copy=False)
        index.add(dense_batch)

    print("FAISS index rebuilt")
    return index


In [4]:
def add_new_user(new_ratings, movie_map, faiss_index, sparse_matrix):
    """Append one user's ratings to the rating matrix and to the FAISS index.

    Parameters
    ----------
    new_ratings : iterable of (movie_id, rating)
        Ratings of the new user. Pairs whose ``movie_id`` is not a key of
        ``movie_map`` are skipped; if a movie occurs twice the last rating wins.
    movie_map : mapping
        Movie id -> column index of ``sparse_matrix``.
    faiss_index : object with an ``add`` method (e.g. ``faiss.IndexFlatIP``)
        Receives the L2-normalized rating vector as a (1, n_movies) float32
        array. The index is modified in place.
    sparse_matrix : scipy.sparse matrix
        Existing ratings, one row per user. The argument itself is not
        modified; a new, one-row-longer matrix is returned.

    Returns
    -------
    new_user_idx : int
        Row index of the new user in the returned matrix. It matches the
        position of the vector in ``faiss_index`` only if the index held
        exactly one vector per matrix row before the call.
    sparse_matrix : scipy.sparse matrix in CSR format
        The input rows followed by the new user's row.
    """
    num_movies = sparse_matrix.shape[1]
    new_vector = np.zeros(num_movies)

    for movie_id, rating in new_ratings:
        if movie_id in movie_map:
            new_vector[movie_map[movie_id]] = rating

    # Append to sparse matrix. The output format is forced to CSR: left to its
    # default, vstack returns COO whenever an input block is not CSR, and a COO
    # matrix cannot be row-indexed the way the recommender does.
    new_user_sparse = csr_matrix(new_vector)
    sparse_matrix = vstack([sparse_matrix, new_user_sparse], format='csr')

    # Normalize and add to FAISS
    new_vector_norm = normalize(new_vector.reshape(1, -1)).astype('float32')
    faiss_index.add(new_vector_norm)

    new_user_idx = sparse_matrix.shape[0] - 1
    return new_user_idx, sparse_matrix

In [5]:
def recommend_top_n_faiss_hybrid_fast_structured(
    user_id,
    sparse_matrix,
    faiss_index,
    user_map,
    movie_map,
    reverse_movie_map,
    movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=3
):
    """Recommend the ``n`` best unrated movies for a user from its FAISS neighbors.

    The user's rating row is L2-normalized and used to query ``faiss_index``
    for the ``k`` most similar other rows. A neighbor is eligible only if it
    shares at least ``min_overlap`` rated movies with the user; its weight is
    ``similarity * overlap / number_of_movies_the_user_rated``. For every movie
    the user has not rated, the prediction is the weighted average of the
    eligible neighbors' ratings, clipped to [0.5, 5.0] and rounded to two
    decimals. Movies rated by fewer than ``min_neighbors`` eligible neighbors,
    or by none at all, get no prediction.

    Parameters
    ----------
    user_id : hashable
        Key looked up in ``user_map``.
    sparse_matrix : scipy.sparse matrix supporting row indexing (e.g. CSR)
        Ratings, one row per user; 0 means "not rated".
    faiss_index : object with ``search(vectors, k=...)`` returning ``(D, I)``
        Expected to hold the normalized rows of ``sparse_matrix`` in row order.
    user_map : mapping
        User id -> row index of ``sparse_matrix``.
    movie_map : mapping
        Not used by this function; kept for signature compatibility.
    reverse_movie_map : mapping or sequence
        Column index of ``sparse_matrix`` -> movie id.
    movies_df : pandas.DataFrame
        Needs the columns 'movieId', 'title' and 'genres'.
    n : int, default 5
        Number of recommendations to return.
    k : int, default 20
        Number of neighbors to use.
    min_overlap : int, default 3
        Minimum number of movies rated by both the user and a neighbor.
    min_neighbors : int, default 3
        Minimum number of eligible neighbors that must have rated a movie.

    Returns
    -------
    pandas.DataFrame
        Columns 'userId', 'movieId', 'title', 'genres', 'predicted_rating',
        'neighbors_used', sorted by 'predicted_rating' descending, at most
        ``n`` rows. An empty DataFrame without columns is returned when the
        user is unknown, has no ratings, or no movie can be predicted.
    """
    if user_id not in user_map:
        return pd.DataFrame()

    user_idx = user_map[user_id]
    user_vector = sparse_matrix[user_idx]
    if user_vector.nnz == 0:
        return pd.DataFrame()  # cold-start user

    user_dense = user_vector.toarray().astype('float32')
    norm_vector = user_dense / (np.linalg.norm(user_dense) + 1e-10)

    # One extra hit is requested because the user's own row is normally among
    # the results.
    D, I = faiss_index.search(norm_vector, k=k + 1)

    # Remove the user by id, not by position: with tied similarities the user
    # is not guaranteed to be the first hit. Ids of -1 (returned when fewer
    # than k + 1 vectors are available) are dropped as well, otherwise they
    # would silently index the last matrix row.
    is_neighbor = (I[0] != user_idx) & (I[0] >= 0)
    neighbor_indices = I[0][is_neighbor][:k]
    similarities = D[0][is_neighbor][:k]

    user_rated_mask = user_dense[0] != 0
    user_rated_count = np.count_nonzero(user_rated_mask)

    # Overlap and weight depend only on the neighbor, never on the movie, so
    # they are computed once here and ineligible neighbors are dropped up front.
    neighbor_weights = []
    neighbor_ratings = []
    for sim, neighbor_idx in zip(similarities, neighbor_indices):
        ratings = sparse_matrix[neighbor_idx].toarray().astype('float32')[0]
        overlap = np.sum(user_rated_mask & (ratings != 0))
        if overlap < min_overlap:
            continue
        neighbor_weights.append(sim * (overlap / (user_rated_count + 1e-10)))
        neighbor_ratings.append(ratings)

    if not neighbor_ratings:
        return pd.DataFrame()

    # Only movies rated by at least one eligible neighbor can be predicted, so
    # the remaining unrated columns are never visited. flatnonzero keeps the
    # candidates in ascending column order.
    rated_by_neighbor = np.any(np.vstack(neighbor_ratings) != 0, axis=0)
    candidate_indices = np.flatnonzero(rated_by_neighbor & ~user_rated_mask)

    predictions = []
    for movie_idx in candidate_indices:
        weighted_scores = []
        weights = []
        for weight, ratings in zip(neighbor_weights, neighbor_ratings):
            neighbor_rating = ratings[movie_idx]
            if neighbor_rating == 0:
                continue
            weighted_scores.append(neighbor_rating * weight)
            weights.append(weight)

        if len(weights) < min_neighbors:
            continue

        total_weight = np.sum(weights)
        if total_weight == 0:
            continue  # the weighted average would be 0/0

        pred = float(np.clip(np.sum(weighted_scores) / total_weight, 0.5, 5.0))
        predictions.append({
            'movie_idx': movie_idx,
            'predicted_rating': round(pred, 2),
            'neighbors_used': len(weights)
        })

    if not predictions:
        # sort_values(by='predicted_rating') raises KeyError on a frame that
        # has no columns, so the empty case is answered before sorting.
        return pd.DataFrame()

    top = (
        pd.DataFrame(predictions)
        .sort_values(by='predicted_rating', ascending=False)
        .head(n)
    )

    # Title and genres are looked up for the selected rows only.
    records = []
    for row in top.itertuples(index=False):
        movie_id = reverse_movie_map[row.movie_idx]
        match = movies_df.loc[movies_df['movieId'] == movie_id]
        records.append({
            'userId': user_id,
            'movieId': movie_id,
            'title': match['title'].values[0] if not match.empty else "Unknown",
            'genres': match['genres'].values[0] if not match.empty else "Unknown",
            'predicted_rating': row.predicted_rating,
            'neighbors_used': row.neighbors_used
        })

    return pd.DataFrame(
        records,
        columns=['userId', 'movieId', 'title', 'genres',
                 'predicted_rating', 'neighbors_used']
    )


In [None]:
# Rebuild FAISS index
# The index is built from the loaded matrix here (no index file is read in the
# cells shown), so it starts out with one vector per matrix row, in row order.
faiss_index = build_faiss_index_batched(sparse_matrix)

FAISS index rebuilt


In [20]:
# Define new user ratings
# Each entry is a (movieId, rating) pair; add_new_user skips ids that are not
# keys of movie_map.
new_user_ratings = [
    (1, 4.0),     # Toy Story
    (318, 5.0),   # Shawshank Redemption
    (296, 4.0),   # Pulp Fiction
    (2571, 4.5),  # Matrix
    (260, 4.0)    # Star Wars
]

# Add new user
# NOTE: this cell is not idempotent. It rebinds sparse_matrix to a matrix with
# one extra row and adds one vector to faiss_index in place, so every re-run
# appends the same user again.
new_user_idx, sparse_matrix = add_new_user(
    new_user_ratings,
    movie_map,
    faiss_index,
    sparse_matrix
)

In [21]:
# Run recommendation
# add_new_user does not register the new user in the loaded user_map, and the
# recommender resolves user_id through user_map to a matrix row. A one-entry
# map from the new row index to itself is therefore passed instead.
top_recs = recommend_top_n_faiss_hybrid_fast_structured(
    user_id=new_user_idx,
    sparse_matrix=sparse_matrix,
    faiss_index=faiss_index,
    user_map={new_user_idx: new_user_idx},
    movie_map=movie_map,
    reverse_movie_map=reverse_movie_map,
    movies_df=movies_df,
    n=5,
    k=20,
    min_overlap=3,
    min_neighbors=2 # optimal number tbd
)

# Display results
# One block per recommended movie, in the order returned (highest prediction first).
for _, row in top_recs.iterrows():
    print(f"{row['title']} (Movie ID: {row['movieId']})")
    print(f"  → Predicted Rating: {row['predicted_rating']:.2f}")
    print(f"  → Neighbors Used: {row['neighbors_used']}")
    print(f"  → Genres: {row['genres']}")
    print("-" * 60)

Honey, I Shrunk the Kids (1989) (Movie ID: 2054)
  → Predicted Rating: 5.00
  → Neighbors Used: 2
  → Genres: Adventure|Children|Comedy|Fantasy|Sci-Fi
------------------------------------------------------------
Mummy, The (1999) (Movie ID: 2617)
  → Predicted Rating: 5.00
  → Neighbors Used: 2
  → Genres: Action|Adventure|Comedy|Fantasy|Horror|Thriller
------------------------------------------------------------
Swingers (1996) (Movie ID: 1060)
  → Predicted Rating: 4.83
  → Neighbors Used: 3
  → Genres: Comedy|Drama
------------------------------------------------------------
Star Wars: Episode I - The Phantom Menace (1999) (Movie ID: 2628)
  → Predicted Rating: 4.76
  → Neighbors Used: 2
  → Genres: Action|Adventure|Sci-Fi
------------------------------------------------------------
Rescuers Down Under, The (1990) (Movie ID: 2089)
  → Predicted Rating: 4.72
  → Neighbors Used: 2
  → Genres: Adventure|Animation|Children
------------------------------------------------------------


One refinement to consider here is **boosting rare high ratings** (for example, this user rated The Shawshank Redemption 5.0). If the user rated a movie very highly but only a few neighbors match on it, we could slightly increase its weight in the similarity, e.g. `weight = sim * (overlap / (user_rated_count + 1e-10)) * rating_boost`, where `rating_boost` would be a factor slightly greater than 1.

In [22]:
# Sanity check after adding the new user: the index dimension should equal the
# number of matrix columns, and the number of indexed vectors should equal the
# number of matrix rows (both gain exactly one entry per add_new_user call).
print("FAISS index dimension:", faiss_index.d)
print("Number of users indexed:", faiss_index.ntotal)
print("Sparse matrix shape:", sparse_matrix.shape)

FAISS index dimension: 59047
Number of users indexed: 162544
Sparse matrix shape: (162544, 59047)
