In [None]:
"""
Brendan Lauterborn
User-Based Collaborative Filtering for Movie Recommendations
Data Citation:
F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context.
ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.
https://doi.org/10.1145/2827872
"""

# Download the MovieLens "latest-small" archive via an IPython shell escape;
# curl writes it to ml-latest-small.zip in the current working directory.
# The download is unconditional, so it is repeated every time this cell runs.
!curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

# Unpack the archive into the current directory; the CSV files used below are
# read from the ml-latest-small/ folder.
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('.')


import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Load the ratings table (columns: userId, movieId, rating, timestamp) and
# preview the first 10 rows.
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
ratings_df.head(10)






  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1675k      0 --:--:-- --:--:-- --:--:-- 1675k


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [None]:
# Report the (rows, columns) shape of the ratings table.
print("Dimension: ", ratings_df.shape)

Dimension:  (100836, 4)


In [None]:
# User x movie rating matrix over ALL ratings: one row per userId, one column
# per movieId, NaN wherever the user has not rated the movie.
user_item_all = ratings_df.pivot_table(
    values="rating",
    index="userId",
    columns="movieId",
)

# Center every user's ratings on that user's own average rating
# (mean() skips the NaN cells, so only rated movies contribute).
user_means_all = user_item_all.mean(axis=1)
user_item_centered_all = user_item_all.sub(user_means_all, axis=0)

# Unrated cells become 0, i.e. "no deviation from the user's mean", so the
# matrix can be passed to cosine_similarity.
user_item_centered_filled_all = user_item_centered_all.fillna(0)
user_item_centered_filled_all.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.0,-0.366379,0.0,0.0,-0.366379,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.506369,1.506369,-0.493631,1.506369,0.506369,0.506369,-0.493631,0.0,-0.493631,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.269737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.425532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.574468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Spot-check one raw (uncentered) entry: the rating user 1 gave to movie 6.
user_item_all.loc[1, 6]

np.float64(4.0)

In [None]:

# Pairwise cosine similarity between the users' mean-centered rating rows.
user_similarity_all = cosine_similarity(user_item_centered_filled_all)

# Label the raw similarity array with userId on both axes so rows/columns can
# be looked up by user id.
user_similarity_df_all = pd.DataFrame(
    user_similarity_all,
    index=user_item_centered_filled_all.index,
    columns=user_item_centered_filled_all.index,
)

user_similarity_df_all.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.0062,0.047013,0.01951,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.0,0.0,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.0,0.003012,...,-0.050551,-0.031581,-0.001688,0.0,0.0,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.0,1.0,-0.01126,-0.031539,0.0048,0.0,-0.032471,0.0,0.0,...,-0.004904,-0.016117,0.017749,0.0,-0.001431,-0.037289,-0.007789,-0.013001,0.0,0.01955
4,0.048419,-0.017164,-0.01126,1.0,-0.02962,0.013956,0.058091,0.002065,-0.005874,0.05159,...,-0.037687,0.063122,0.02764,-0.013782,0.040037,0.02059,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.02962,1.0,0.009111,0.010117,-0.012284,0.0,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278


In [None]:
def get_similar_users_all(user_id, n):
    """Return the ``n`` users most similar to ``user_id``.

    Similarities are read from the module-level ``user_similarity_df_all``
    matrix; the user's own entry is removed before ranking.

    Parameters
    ----------
    user_id : label of a row in ``user_similarity_df_all``.
    n : number of neighbours to keep.

    Returns
    -------
    pandas.DataFrame with columns ``userID`` and ``similarity``, ordered from
    most to least similar.

    Raises
    ------
    KeyError if ``user_id`` is not a label of ``user_similarity_df_all``.
    """
    # Similarity of this user to every other user (self excluded).
    others = user_similarity_df_all.loc[user_id].drop(user_id)

    # Highest similarity first, truncated to the n best.
    top = others.sort_values(ascending=False).iloc[:n]

    return pd.DataFrame({'userID': top.index, 'similarity': top.values})
# Example: the 5 users most similar to user 1 (full-data similarity matrix).
get_similar_users_all(1, 5)


Unnamed: 0,userID,similarity
0,301,0.124799
1,597,0.102631
2,414,0.101348
3,477,0.099217
4,57,0.09907


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the individual rating rows as the test set (fixed seed so the
# split is repeatable).
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# User x movie matrix built from the TRAIN ratings only.
user_item_train = pd.pivot_table(
    train_df,
    values='rating',
    index='userId',
    columns='movieId',
)

# Per-user mean rating on train, then mean-center and zero-fill the gaps.
user_means = user_item_train.mean(axis=1)
user_item_centered = user_item_train.sub(user_means, axis=0)
user_item_centered_filled = user_item_centered.fillna(0)

# User-user cosine similarity on the centered train matrix, labelled by userId
# on both axes.
user_similarity = cosine_similarity(user_item_centered_filled)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_train.index,
    columns=user_item_train.index,
)



In [None]:
def predict_rating(user_id, movie_id, k_neighbors=50, rating_range=(0.5, 5.0)):
    """Predict the rating ``user_id`` would give ``movie_id`` (train split).

    Uses mean-centered user-based collaborative filtering over the module-level
    ``user_item_train``, ``user_means`` and ``user_similarity_df``:

        pred = mean(user) + sum(sim * (r_neighbor - mean(neighbor))) / sum(|sim|)

    where the sums run over the ``k_neighbors`` users with the largest absolute
    similarity among those who rated the movie in train.

    Parameters
    ----------
    user_id : user to predict for.
    movie_id : movie to predict.
    k_neighbors : maximum number of neighbours used in the weighted average.
    rating_range : ``(min, max)`` bounds the prediction is clipped to.

    Returns
    -------
    float
        The clipped prediction, or ``np.nan`` when no prediction can be made:
        the movie or the user is absent from the train matrix, nobody else
        rated the movie in train, or the selected neighbours' absolute
        similarities sum to zero.
    """
    # Cold start: movie or user has no row/column in the train matrix. The user
    # check avoids a KeyError on the similarity lookup below.
    if movie_id not in user_item_train.columns:
        return np.nan
    if user_id not in user_item_train.index:
        return np.nan

    sim_users = user_similarity_df.loc[user_id]
    ratings = user_item_train[movie_id]

    # Neighbours are users who rated this movie in train, excluding the target
    # user's own rating.
    is_valid = ratings.notna()
    is_valid.loc[user_id] = False
    if not is_valid.any():
        return np.nan

    neighbors = pd.DataFrame({
        'sim': sim_users[is_valid],
        # How far each neighbour's rating sits from that neighbour's own mean.
        'deviation': ratings[is_valid] - user_means[is_valid],
    })

    # Top-k by |similarity|: strongly negative neighbours are kept as well and
    # pull the prediction the opposite way through the signed weight.
    top_k = neighbors.sort_values('sim', key=np.abs, ascending=False).head(k_neighbors)

    denom = np.abs(top_k['sim']).sum()
    if denom == 0:
        return np.nan

    pred_dev = (top_k['sim'] * top_k['deviation']).sum() / denom
    pred = float(user_means.loc[user_id] + pred_dev)

    rmin, rmax = rating_range
    return float(np.clip(pred, rmin, rmax))

In [None]:
def recommend_movies(user_id, n=5, k=10):
    """Recommend the ``n`` unseen movies with the highest predicted rating.

    Candidates are the columns of the module-level ``user_item_train`` that the
    user has not rated in the train split. Each one is scored with
    ``predict_rating(user_id, movie_id, k_neighbors=k)``, and movies whose
    prediction is NaN are dropped.

    Parameters
    ----------
    user_id : user to recommend for.
    n : number of recommendations to return.
    k : neighbourhood size forwarded to ``predict_rating``.

    Returns
    -------
    pandas.DataFrame with columns ``movieId`` and ``predicted_rating``, sorted
    by predicted rating (highest first). Empty, with the same columns, when
    the user is not in the train matrix or no candidate could be scored.
    """
    result_columns = ['movieId', 'predicted_rating']

    # The user must exist in TRAIN, otherwise there is no row to read.
    if user_id not in user_item_train.index:
        return pd.DataFrame(columns=result_columns)

    # Movies the user has not rated in TRAIN. They come straight from the train
    # matrix columns, so no further membership check is needed.
    user_row = user_item_train.loc[user_id]
    candidate_movies = user_item_train.columns[user_row.isna()]

    preds = []
    for movie_id in candidate_movies:
        p = predict_rating(user_id, movie_id, k_neighbors=k)
        if not np.isnan(p):
            preds.append((movie_id, p))

    if not preds:
        return pd.DataFrame(columns=result_columns)

    scored = pd.DataFrame(preds, columns=result_columns)
    return scored.sort_values('predicted_rating', ascending=False).head(n)

In [None]:
# Make recommendations: score user 6's unseen movies, keep the top 5, and
# left-join movies.csv on movieId so each recommendation carries its title.
movies_df = pd.read_csv('ml-latest-small/movies.csv')
top_recommendations = recommend_movies(6, n=5).merge(movies_df, on='movieId', how='left')

# Show only the title and the predicted rating.
print(top_recommendations[['title', 'predicted_rating']].head(5))

                                 title  predicted_rating
0               Poltergeist III (1988)               5.0
1             Exorcist III, The (1990)               5.0
2      Exorcist II: The Heretic (1977)               5.0
3                     Cage Dive (2017)               5.0
4  Fast, Cheap & Out of Control (1997)               5.0


In [None]:
# from sklearn.metrics import mean_squared_error
# from math import sqrt

# # Collect predictions on the test set
# actuals = []
# preds = []

# for idx, row in test_df.iterrows():
#     uid = row['userId']
#     mid = row['movieId']
#     true_rating = row['rating']

#     # Skip if user/movie not seen in training
#     if uid not in user_item_train.index or mid not in user_item_train.columns:
#         continue

#     pred_rating = predict_rating(uid, mid, k_neighbors=10)
#     if not np.isnan(pred_rating):
#         actuals.append(true_rating)
#         preds.append(pred_rating)

# # Compute RMSE
# rmse = sqrt(mean_squared_error(actuals, preds))
# print(f"Test RMSE: {rmse:.4f}")
# print(f"Total test samples evaluated: {len(preds)}")

In [None]:
# Peek at the first few held-out (test-split) ratings for user 1.
test_df[test_df['userId']==1].head(5)


Unnamed: 0,userId,movieId,rating,timestamp
226,1,3740,4.0,964982417
149,1,2353,5.0,964983861
87,1,1278,5.0,964983414
35,1,596,5.0,964982838
134,1,2115,5.0,964982529


In [None]:
def precision_recall_map_ndcg_cached(test_df, user_topn_cache, k_list=(5, 10, 20), threshold=4.0):
    """Evaluate cached top-N recommendations against held-out ratings.

    A test rating counts as "relevant" when ``rating >= threshold``. For every
    cutoff ``K`` in ``k_list`` the metrics are averaged over the users that
    (a) appear in ``user_topn_cache`` with a non-empty list and (b) have at
    least one relevant movie in ``test_df``; all other users are skipped.

    Parameters
    ----------
    test_df : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    user_topn_cache : mapping from user id to that user's ranked recommended
        movie ids (best first). Any sliceable sequence works (list, tuple,
        numpy array, ...).
    k_list : iterable of cutoffs K to evaluate.
    threshold : minimum test rating for a movie to count as relevant.

    Returns
    -------
    pandas.DataFrame with one row per K and columns ``K``, ``Precision``,
    ``Recall``, ``MAP`` and ``NDCG``. A metric is 0.0 when no user qualified.

    Notes
    -----
    * Precision divides by K even when fewer than K recommendations exist.
    * The per-user average precision is the mean of precision@i over the hit
      positions only, i.e. it is normalized by the number of hits inside the
      top-K and not by the number of relevant movies.
    """
    users = test_df['userId'].unique()

    # Relevant set per user, built in a single pass over the liked rows
    # (not by re-filtering the whole frame once per user).
    liked_rows = test_df.loc[test_df['rating'] >= threshold]
    test_likes_by_user = {
        u: set(movie_ids)
        for u, movie_ids in liked_rows.groupby('userId')['movieId']
    }

    results = []
    for K in k_list:
        precisions, recalls, maps, ndcgs = [], [], [], []

        for u in users:
            if u not in user_topn_cache:
                continue
            # Materialize the slice so emptiness is checked by length; truth-
            # testing a numpy array or Series would raise instead.
            recs = list(user_topn_cache[u][:K])
            if len(recs) == 0:
                continue

            test_liked = test_likes_by_user.get(u, set())
            if not test_liked:
                continue

            hits = [1 if mid in test_liked else 0 for mid in recs]
            n_hits = int(np.sum(hits))

            # precision@K, recall@K
            precisions.append(n_hits / K)
            recalls.append(n_hits / len(test_liked))

            # MAP@K: mean precision@i taken at each hit position i.
            if n_hits > 0:
                cum_hits = np.cumsum(hits)
                precisions_at_i = [cum_hits[i] / (i + 1) for i, h in enumerate(hits) if h == 1]
                maps.append(np.mean(precisions_at_i))
            else:
                maps.append(0.0)

            # NDCG@K with binary gains; the ideal ranking puts every relevant
            # movie (at most K of them) at the top.
            dcg = np.sum([h / np.log2(i + 2) for i, h in enumerate(hits)])
            idcg = np.sum([1.0 / np.log2(i + 2) for i in range(min(len(test_liked), K))])
            ndcgs.append(dcg / idcg if idcg > 0 else 0.0)

        results.append({
            "K": K,
            "Precision": float(np.mean(precisions)) if precisions else 0.0,
            "Recall": float(np.mean(recalls)) if recalls else 0.0,
            "MAP": float(np.mean(maps)) if maps else 0.0,
            "NDCG": float(np.mean(ndcgs)) if ndcgs else 0.0
        })

    return pd.DataFrame(results)

# Run the cached evaluation at K = 5, 10 and 20, counting test ratings >= 4.0
# as relevant.
# NOTE(review): `user_topn_cache` is not defined in any cell visible here; it
# presumably maps userId -> ranked list of recommended movieIds and has to be
# built before this cell can run — confirm.
metrics_df = precision_recall_map_ndcg_cached(test_df, user_topn_cache, k_list=[5,10,20], threshold=4.0)
display(metrics_df)


KeyboardInterrupt: 