# User-Based Collaborative Filtering

## Import Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from src.user_cf import get_top_k_recommendations

Directories are Found.


## Load Train & Test Data

In [2]:
# Load train data
train = pd.read_csv("../data/curated/train.csv")

# Filter active users
active_users = train["user_id"].value_counts()
active_users = active_users[active_users >= 10].index
train = train[train["user_id"].isin(active_users)]

print(f"Filtered train to {len(train)} ratings from {len(active_users)} active users.")

# Load Test Data
test = pd.read_csv("../data/curated/test.csv")
test = test[test["rating"] >= 4]
print(f"Test size: {test.shape}")

Filtered train to 80367 ratings from 943 active users.
Test size: (10819, 3)


## Build User-Item Matrix & Cosine Similarity

In [3]:
user_item_matrix = train.pivot(index="user_id", columns="item_id", values="rating")

user_item_centered = user_item_matrix.sub(user_item_matrix.mean(axis=1), axis=0)

user_item_centered = user_item_centered.fillna(0)

similarity_matrix = pd.DataFrame(
    cosine_similarity(user_item_centered),
    index=user_item_matrix.index,
    columns=user_item_matrix.index
)

# Check
print(f"User Item Matrix Size: {user_item_matrix.shape}")
print(similarity_matrix.iloc[:5, :5])

User Item Matrix Size: (943, 1651)
user_id         1         2         3         4         5
user_id                                                  
1        1.000000  0.043419  0.010540  0.013470  0.118058
2        0.043419  1.000000  0.001380 -0.045285  0.015966
3        0.010540  0.001380  1.000000 -0.072125  0.021674
4        0.013470 -0.045285 -0.072125  1.000000 -0.001186
5        0.118058  0.015966  0.021674 -0.001186  1.000000


## Pick a Target User

In [4]:
# Keep only test users that are also in the train matrix
test = test[test["user_id"].isin(user_item_matrix.index)]
print(f"Filtered test set to {len(test)} entries with known users.")

target_user_id = test["user_id"].sample(1).iloc[0]
print(f"Target user: {target_user_id}")

Filtered test set to 10819 entries with known users.
Target user: 478


## Generate Top-K Recommendations

In [5]:
# Force reload the updated module to get the new function signature
import importlib
import src.user_cf

importlib.reload(src.user_cf)

from src.user_cf import get_top_k_recommendations

In [6]:
k_movies = 10
top_k_recs = get_top_k_recommendations(
    user_id=target_user_id,
    ratings=train,
    user_item_matrix=user_item_matrix.fillna(0),  # keep original for rating lookup
    similarity_matrix=similarity_matrix,
    k=5,
    top_n_neighbors=50
)
print(f"Top {k_movies} recommended items for user {target_user_id}:")
top_k_recs

Top 10 recommended items for user 478:


[(30, 5.000000000000001), (114, 5.0), (236, 5.0), (262, 5.0), (311, 5.0)]

## Evaluation

### Precision@K

In [32]:
from src.evaluation import evaluate_precision_at_k

precision = evaluate_precision_at_k(
    test_df=test,
    train_df=train,
    user_item_matrix=user_item_matrix,
    similarity_matrix=similarity_matrix,
    k=k_movies
)

print(f"Average Precision@{k_movies}: {precision * 100:.4f}%")

Average Precision@10: 0.4212%


### Recall@k

In [31]:
from src.evaluation import evaluate_recall_at_k

recall = evaluate_recall_at_k(
    test_df=test,
    train_df=train,
    user_item_matrix=user_item_matrix,
    similarity_matrix=similarity_matrix,
    k=k_movies,
    top_n_neighbors=50
)

print(f"Average Recall@{k_movies}: {recall * 100:.4f}%")

Average Recall@10: 0.2628%
