# Item-based Collaborative Filtering

Core idea
“If two movies get similar rating patterns from many users, then someone who liked one of those movies will probably like the other as well.”

How it works
  1. For every movie the target user has rated, find similar movies (e.g., by cosine similarity of rating vectors).
  2. Score those similar movies—weight by how much the user liked the original movie and by the similarity strength.
  3. Rank the unseen movies by the aggregated scores.
  4. Recommend the top-ranked ones to the user.

Example
Many users who liked Inception also liked Interstellar and The Matrix.
Alice rated Inception and The Matrix highly but hasn’t watched Interstellar.
Because both of Alice’s liked movies point to Interstellar as a close neighbour, the system recommends Interstellar to Alice.

In [1]:
# Load datasets
import pandas as pd
movies = pd.read_csv("../data/csv/movies.csv")
ratings = pd.read_csv("../data/csv/ratings.csv")

In [2]:
# Merge ratings with movie titles
movies_ratings = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

print(movies_ratings.shape)
movies_ratings.head()

(25000095, 5)


Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1147878820,Underground (1995)
4,1,899,3.5,1147868510,Singin' in the Rain (1952)


## Option 1: Filter to “Active” Users and/or “Popular” Movies

We do this, because the full dataset is too computationally expensive for personal laptops.

In [3]:
# Keep users with at least 500 ratings
user_counts = movies_ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 100].index

# Keep movies with at least 1000 ratings
movie_counts = movies_ratings['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 500].index

# Filter the DataFrame
movies_ratings_filtered = movies_ratings[
    movies_ratings['userId'].isin(active_users) &
    movies_ratings['movieId'].isin(popular_movies)
]

print(movies_ratings_filtered.shape)
movies_ratings_filtered.head()

(18772328, 5)


Unnamed: 0,userId,movieId,rating,timestamp,title
70,2,1,3.5,1141415820,Toy Story (1995)
71,2,62,0.5,1141417130,Mr. Holland's Opus (1995)
72,2,110,5.0,1141416589,Braveheart (1995)
73,2,150,4.0,1141415790,Apollo 13 (1995)
74,2,151,4.5,1141415643,Rob Roy (1995)


### Lenskit implementation

In [4]:
from lenskit.data import from_interactions_df
from lenskit.knn import ItemKNNScorer
from lenskit.splitting import sample_records
from lenskit import pipeline, recommend

# convert df to a Dataset (new in LensKit 2025.2.0)
lk_dataset = from_interactions_df(movies_ratings_filtered, 
                                   user_col='userId', 
                                   item_col='movieId', 
                                   rating_col='rating', 
                                   timestamp_col='timestamp')
lk_dataset
pd_lk_dataset = lk_dataset.interaction_matrix(format='pandas')
pd_lk_dataset

Unnamed: 0,user_num,item_num,rating,timestamp,title
0,0,0,3.5,1141415820,Toy Story (1995)
1,0,54,0.5,1141417130,Mr. Holland's Opus (1995)
2,0,92,5.0,1141416589,Braveheart (1995)
3,0,113,4.0,1141415790,Apollo 13 (1995)
4,0,114,4.5,1141415643,Rob Roy (1995)
...,...,...,...,...,...
18772323,63891,4027,4.5,1240953372,Ratatouille (2007)
18772324,63891,4139,2.5,1240951998,Bee Movie (2007)
18772325,63891,4153,2.0,1240950697,Alvin and the Chipmunks (2007)
18772326,63891,4198,4.0,1240953434,"Dark Knight, The (2008)"


In [5]:
# we also can get some statistics from the Dataset object 
lk_dataset.item_stats()
# lk_dataset.user_stats()

  stats.loc[stats["count"] == 0, "first_time"] = pd.NaT
  stats.loc[stats["count"] == 0, "last_time"] = pd.NaT


Unnamed: 0_level_0,record_count,user_count,rating_count,mean_rating,count,first_time,last_time
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,36139,36139,36139,3.867235,36139,826437752,1574285022
2,18422,18422,18422,3.168983,18422,822873600,1574276821
3,6572,6572,6572,3.027998,6572,825582938,1573439445
4,1925,1925,1925,2.774286,1925,825582938,1574213055
5,6355,6355,6355,2.935169,6355,826962728,1573033018
...,...,...,...,...,...,...,...
197711,689,689,689,3.537736,689,1552750151,1574294960
200818,662,662,662,3.619335,662,1554956153,1574290269
201773,930,930,930,3.680108,930,1558380688,1574280869
202429,731,731,731,3.876881,731,1563592457,1574219415


In [6]:
# split into test and train sets
from lenskit.splitting import sample_records
test_size = round(lk_dataset.interaction_count * 0.2)
split = sample_records(lk_dataset, test_size) # creates test split with 3000 records and train split with the rest

In [7]:
# train
pipe = pipeline.topn_pipeline(ItemKNNScorer(), n=20)
pipe.train(split.train)

  return torch.sparse_csr_tensor(


In [8]:
# bratch recommend to users in test set
from lenskit.batch import recommend as batch_recommend

recs = batch_recommend(pipe, list(split.test.keys()), n=10) 

In [9]:
# convert to dataframes
df_recs = recs.to_df()
df_test = split.test_df

In [10]:
# validate recommendations
from sklearn.metrics import mean_squared_error

# keep only the columns we need and join on user & item
merged = (
  df_test[['user_id', 'item_id', 'rating']]
    .merge(df_recs[['user_id', 'item_id', 'score']],
           on=['user_id', 'item_id'],
           how='inner')          # drop pairs without predictions
)

mse  = mean_squared_error(merged['rating'], merged['score'])
rmse = mse ** 0.5 

print('MSE:', mse)
print('RMSE:', rmse)

MSE: 0.32750095382607425
RMSE: 0.5722769904740835


In [11]:
# test recommendations for a specific user
user_id = lk_dataset.users.index[3]
recs = recommend(pipe, user_id, n=15)
recs_df = recs.to_df()

print("Recommendations for user", user_id)
movies_ratings[movies_ratings['movieId'].isin(recs.ids())]

Recommendations for user 5


Unnamed: 0,userId,movieId,rating,timestamp,title
79,2,318,5.0,1141417181,"Shawshank Redemption, The (1994)"
153,2,2324,1.5,1141417726,Life Is Beautiful (La Vita è bella) (1997)
257,3,50,5.0,1439474391,"Usual Suspects, The (1995)"
265,3,318,4.0,1439472424,"Shawshank Redemption, The (1994)"
315,3,2329,5.0,1439474463,American History X (1998)
...,...,...,...,...,...
24999882,162540,49530,5.0,1248855507,Blood Diamond (2006)
24999916,162541,50,5.0,1240953428,"Usual Suspects, The (1995)"
24999927,162541,318,4.0,1240952695,"Shawshank Redemption, The (1994)"
25000002,162541,2324,4.5,1240953595,Life Is Beautiful (La Vita è bella) (1997)


In [12]:
# movies the user already watched
movies_ratings[movies_ratings['userId'] == user_id].sort_values('rating', ascending=False).head(10)

Unnamed: 0,userId,movieId,rating,timestamp,title
1154,5,32,5.0,830786277,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
1155,5,36,5.0,830786409,Dead Man Walking (1995)
1164,5,141,5.0,858625837,"Birdcage, The (1996)"
1158,5,50,5.0,831900118,"Usual Suspects, The (1995)"
1157,5,47,5.0,833146729,Seven (a.k.a. Se7en) (1995)
1166,5,150,5.0,830786155,Apollo 13 (1995)
1188,5,337,5.0,830786345,What's Eating Gilbert Grape (1993)
1183,5,292,5.0,830786263,Outbreak (1995)
1165,5,147,5.0,830786700,"Basketball Diaries, The (1995)"
1179,5,260,5.0,858625863,Star Wars: Episode IV - A New Hope (1977)


In [13]:
# perform a cross-validation 
from lenskit.data import ItemListCollection, UserIDKey
from lenskit.splitting import crossfold_records, SampleFrac
from lenskit.metrics import Precision, Recall, NDCG, RunAnalysis
from lenskit.metrics import RunAnalysis, RBP, TestItemCount, ListLength, MAE, RMSE, Precision, Recall, Hit
from lenskit.basic import UserTrainingHistoryLookup


base = pipeline.topn_pipeline(ItemKNNScorer(), n=23) 

# Set up containers to collect test interactions and recommendations for all folds
all_test = ItemListCollection(UserIDKey)           # test interactions by user
all_recs = ItemListCollection(UserIDKey)           # recommended item lists by user

folds = 5
# Perform 5-fold cross-validation
for fold, split in enumerate(crossfold_records(lk_dataset, rng=9, partitions=folds)):
    print(f"Processing fold {fold}...")
    all_test.add_from(split.test)  

    # Clone and train the pipeline on the training portion of this fold
    algo = base.clone() 
    algo.train(split.train)

    # Generate top-10 recommendations for each user in the test set of this fold
    recs = batch_recommend(algo, list(split.test.keys()), n=10) 
    all_recs.add_from(recs)  


Processing fold 0...
Processing fold 1...
Processing fold 2...
Processing fold 3...
Processing fold 4...


In [None]:
# Set up the evaluation with desired metrics
analysis = RunAnalysis()
analysis.add_metric(Precision())
analysis.add_metric(Recall())
analysis.add_metric(NDCG())
analysis.add_metric(Hit())

# Measure the recommendations against the test data
results = analysis.measure(all_recs, all_test)

metrics_df = results.list_metrics()
avg_metrics = metrics_df.mean()

print(f"metrics over {folds}-fold CV over {len(lk_dataset)} samples:")
print(avg_metrics)

metrics over 5-fold CV:
Precision    0.000419
Recall       0.000005
NDCG         0.000019
Hit          0.000419
dtype: float64


In [12]:
rec_df = all_recs.to_df()
test_df = all_test.to_df()


  rec_df = all_recs.to_df()


In [None]:
# test_df.shape
# rec_df.shape
# test_df[(test_df['user_id'] == 3) & (test_df['item_id'] == 77561)]

(3754465, 5)

In [None]:
# recommend
# user_id = 
# recs = recommend(pipe, user_id, n=15)
# recs_df = recs.to_df()

# print("Recommendations for user", user_id)
# movies_ratings[movies_ratings['movieId'].isin(recs.ids())]

recs = recommend(base, lk_dataset.users.index[5], n=10) 
recs.to_df().head()

[2m2025-05-31 23:51:55[0m [[31m[1merror    [0m] [1mfailed to run node            [0m [36mnode[0m=[35mhistory-lookup[0m



[2m2025-05-31 23:51:55[0m [[31m[1merror    [0m] [1mfailed to run node            [0m [36mnode[0m=[35mscorer[0m



[2m2025-05-31 23:51:56[0m [[31m[1merror    [0m] [1mfailed to run node            [0m [36mnode[0m=[35mranker[0m



AttributeError: 'UserTrainingHistoryLookup' object has no attribute 'interactions'

Unnamed: 0,item_id,item_num,score,rank
