# Item-based Collaborative Filtering

Core idea
“If two movies get similar rating patterns from many users, then someone who liked one of those movies will probably like the other as well.”

How it works
  1. For every movie the target user has rated, find similar movies (e.g., by cosine similarity of rating vectors).
  2. Score those similar movies—weight by how much the user liked the original movie and by the similarity strength.
  3. Rank the unseen movies by the aggregated scores.
  4. Recommend the top-ranked ones to the user.

Example
Many users who liked Inception also liked Interstellar and The Matrix.
Alice rated Inception and The Matrix highly but hasn’t watched Interstellar.
Because both of Alice’s liked movies point to Interstellar as a close neighbour, the system recommends Interstellar to Alice.

In [None]:
# Load datasets
import pandas as pd
movies = pd.read_csv("../data/csv/movies.csv")
ratings = pd.read_csv("../data/csv/ratings.csv")

In [None]:
# Merge ratings with movie titles
movies_ratings = ratings.merge(movies[['movieId', 'title']], on='movieId', how='left')

print(movies_ratings.shape)
movies_ratings.head()

(25000095, 5)


Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,296,5.0,1147880044,Pulp Fiction (1994)
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,1,665,5.0,1147878820,Underground (1995)
4,1,899,3.5,1147868510,Singin' in the Rain (1952)


## Option 1: Filter to “Active” Users and/or “Popular” Movies

We do this, because the full dataset is too computationally expensive for personal laptops.

In [None]:
# Keep users with at least 500 ratings
user_counts = movies_ratings['userId'].value_counts()
active_users = user_counts[user_counts >= 500].index

# Keep movies with at least 1000 ratings
movie_counts = movies_ratings['movieId'].value_counts()
popular_movies = movie_counts[movie_counts >= 1000].index

# Filter the DataFrame
movies_ratings_filtered = movies_ratings[
    movies_ratings['userId'].isin(active_users) &
    movies_ratings['movieId'].isin(popular_movies)
]

print(movies_ratings_filtered.shape)
movies_ratings_filtered.head()

(7127698, 5)


Unnamed: 0,userId,movieId,rating,timestamp,title
254,3,1,4.0,1439472215,Toy Story (1995)
255,3,29,4.5,1484754967,"City of Lost Children, The (Cité des enfants p..."
256,3,32,4.5,1439474635,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
257,3,50,5.0,1439474391,"Usual Suspects, The (1995)"
258,3,111,4.0,1484753849,Taxi Driver (1976)


### Lenskit implementation

In [None]:
from lenskit.data import from_interactions_df
from lenskit.knn import ItemKNNScorer
from lenskit import recommend, pipeline

# convert df to a Dataset (new in LensKit 2025.2.0)
lk_dataset = from_interactions_df(movies_ratings_filtered, 
                                   user_col='userId', 
                                   item_col='movieId', 
                                   rating_col='rating', 
                                   timestamp_col='timestamp')
lk_dataset
pd_lk_dataset = lk_dataset.interaction_matrix(format='pandas')
pd_lk_dataset

Unnamed: 0,user_num,item_num,rating,timestamp,title
0,0,292,5.0,1147880044,Pulp Fiction (1994)
1,0,302,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994)
2,0,303,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993)
3,0,654,5.0,1147878820,Underground (1995)
4,0,878,3.5,1147868510,Singin' in the Rain (1952)
...,...,...,...,...,...
25000090,162540,11359,4.5,1240953372,Ratatouille (2007)
25000091,162540,11925,2.5,1240951998,Bee Movie (2007)
25000092,162540,11972,2.0,1240950697,Alvin and the Chipmunks (2007)
25000093,162540,12216,4.0,1240953434,"Dark Knight, The (2008)"


In [64]:
# we also can get some statistics from the Dataset object 
lk_dataset.item_stats()
# lk_dataset.user_stats()

  stats.loc[stats["count"] == 0, "first_time"] = pd.NaT
  stats.loc[stats["count"] == 0, "last_time"] = pd.NaT


Unnamed: 0_level_0,record_count,user_count,rating_count,mean_rating,count,first_time,last_time
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,57309,57309,57309,3.893708,57309,822873600,1574285022
2,24228,24228,24228,3.251527,24228,822873600,1574276821
3,11804,11804,11804,3.142028,11804,823185228,1573439445
4,2523,2523,2523,2.853547,2523,823185225,1574213055
5,11714,11714,11714,3.058434,11714,823185224,1573033018
...,...,...,...,...,...,...,...
209157,1,1,1,1.500000,1,1574280748,1574280748
209159,1,1,1,3.000000,1,1574280985,1574280985
209163,1,1,1,4.500000,1,1574284913,1574284913
209169,1,1,1,3.000000,1,1574291826,1574291826


In [65]:

pipe = pipeline.topn_pipeline(ItemKNNScorer())
pipe.train(lk_dataset)

In [66]:
recs     = recommend(pipe, lk_dataset.users.index[0], n=15)
recs_df  = recs.to_df()
recs_df
# pd_lk_dataset[pd_lk_dataset["item_num"].isin(recs.numbers())][['item_num','title']]

Unnamed: 0,item_id,item_num,score,rank
0,79866,15083,5.592944,1
1,171299,43082,5.56413,2
2,156535,37074,5.336547,3
3,137010,29632,5.323746,4
4,196965,54440,5.322197,5
5,195889,53960,5.283384,6
6,166574,41040,5.267812,7
7,162360,39343,5.205526,8
8,169346,42235,5.203886,9
9,157931,37594,5.188014,10


In [67]:
movies[movies['movieId'].isin(recs.ids())]

Unnamed: 0,movieId,title,genres
15096,79866,Schmatta: Rags to Riches to Rags (2009),Documentary
30779,137010,The Nightmare Nanny (2013),Drama|Thriller
30795,137042,When Friendship Kills (1996),Drama
36003,149218,"Totò, Peppino e i fuorilegge (1956)",Comedy
39146,156535,High Society (1955),Comedy
39761,157931,La Cage (1975),Drama
41744,162360,Downhill (2014),Comedy
43517,166229,Allerleirauh (2012),Children|Fantasy
43535,166267,Finnish Blood Swedish Heart (2012),(no genres listed)
43669,166574,Skewered (2013),Comedy|Drama


In [None]:

from lenskit import Recommender, batch
from lenskit.algorithms.item_knn import ItemItem
from lenskit.crossfold import partition_users, SampleN  # add SampleN
from lenskit import batch
from lenskit.metrics import predict as lm

# 1. long-format ratings -----------------------------------------
ratings = (
    df_ratings_filtered[['userId', 'movieId', 'rating']]
      .rename(columns={'userId': 'user', 'movieId': 'item'})
      .astype({'rating': 'float64'})
)

# hold out 5 ratings per user for testing
train, test = next(
  partition_users(ratings, partitions=1, method=SampleN(5), rng_spec=42)
)

algo = Recommender.adapt(ItemItem(nnbrs=40, center=True))
algo.fit(train)

preds = batch.predict(algo, test[['user', 'item']])

# --- make the two Series share a MultiIndex (user, item) -------------
pred  = preds.set_index(['user', 'item'])['prediction']
truth = test .set_index(['user', 'item'])['rating']

# if you only want the pairs that overlap:
rmse = lm.rmse(pred, truth, missing='ignore')
mae  = lm.mae (pred, truth, missing='ignore')

print(f'RMSE={rmse:.3f}, MAE={mae:.3f}')

recs = batch.recommend(algo, test['user'].unique(), n=10, n_jobs=6)

from lenskit.metrics import topn as lt
prec  = lt.precision(recs, test)      # Precision@10
rec   = lt.recall   (recs, test)      # Recall@10
ndcg  = lt.ndcg     (recs, test)      # Ranking quality

print(f'P@10={prec:.3f}, R@10={rec:.3f}, NDCG@10={ndcg:.3f}')


# # 2. build predictor → adapt to recommender ----------------------
# pred = ItemItem(nnbrs=40, center=True)   # multithreaded inside
# algo = Recommender.adapt(pred)           # <— wrap as Top-N recommender
# algo.fit(ratings)

# # 3. batch recommendations (set n_jobs to the core count you want)
# recs = batch.recommend(algo,
#                        ratings['user'].unique(),
#                        n=10,
#                        n_jobs=5)


RMSE=0.737, MAE=0.554
P@10=0.000, R@10=0.000, NDCG@10=0.000


In [13]:
# create ratings matrix
df_user_movie_rating_matrix = df_ratings_filtered.pivot_table(
  index='userId',
  columns='movieId',
  values='rating'
)

df_user_movie_rating_matrix_centered = df_user_movie_rating_matrix.sub(df_user_movie_rating_matrix.mean(axis=1), axis=0).fillna(0)

user_ids = df_user_movie_rating_matrix_centered.index.tolist()
movie_ids = df_user_movie_rating_matrix_centered.columns.tolist()

df_user_movie_rating_matrix_centered.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.30205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.663313,-1.336687,-1.336687,0.0,0.0,0.0,-0.336687,0.0,0.0,-0.336687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,-1.371912,0.0,0.0,0.0,0.628088,0.628088,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,-1.561441,0.0,0.0,0.0,2.438559,0.0,0.0,0.0,2.438559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,0.748718,0.0,0.0,0.0,0.0,0.0,-0.251282,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# 1. cosine similarity between movie columns
item_sim = pd.DataFrame(
  cosine_similarity(df_user_movie_rating_matrix_centered.T),
  index=movie_ids,
  columns=movie_ids
)
item_sim.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,188301,189203,189333,189713,192385,192389,192803,194448,195159,201773
1,1.0,-0.067601,-0.122692,-0.091522,-0.136504,0.154613,-0.069073,-0.074651,-0.12944,0.024005,...,0.021024,0.022142,0.055981,0.051502,0.013065,-0.068166,0.00517,0.028304,0.150893,0.03976
2,-0.067601,1.0,0.147217,0.057336,0.164892,-0.160565,0.100846,0.075296,0.089625,0.068987,...,-0.00074,0.014005,-0.015524,-0.011813,-0.012584,0.014457,0.013575,-0.006389,-0.011294,0.007353
3,-0.122692,0.147217,1.0,0.098735,0.249488,-0.100236,0.126705,0.068694,0.117283,0.059977,...,-0.00239,-0.009327,-0.002086,0.000274,-0.000938,0.00897,-0.011119,-0.017175,-0.025578,-0.007115
4,-0.091522,0.057336,0.098735,1.0,0.118768,-0.05631,0.077967,0.05877,0.073362,0.006649,...,-0.002318,-0.012092,-0.002768,-0.021746,0.001161,7.5e-05,-0.013281,-0.010296,-0.018573,-0.003908
5,-0.136504,0.164892,0.249488,0.118768,1.0,-0.133113,0.166815,0.107024,0.107257,0.057012,...,-0.00312,-0.022872,-0.014928,-0.020771,-0.019195,0.00735,-0.014749,-0.015771,-0.028175,-0.011247


In [15]:
# K-nearest items for each movie (already computed → topk)
K = 40
topk = {
  m: item_sim[m].nlargest(K + 1).iloc[1:]     # drop self-similarity
  for m in item_sim.columns
}

def recommend(uid, n_rec=10):
  """Top-N item-based CF recommendations for one user."""
  seen   = df_user_movie_rating_matrix.loc[uid].dropna()     # movies + own ratings
  score  = {}          # movie → weighted sum
  weight = {}          # movie → sum|sim|

  for mid, r in seen.items():
    for nbr, sim in topk[mid].items():
      if nbr in seen:           # skip already-rated
        continue
      score[nbr]  = score.get(nbr, 0)  + sim * r
      weight[nbr] = weight.get(nbr, 0) + abs(sim)

  preds = {m: score[m] / weight[m] for m in score if weight[m] > 0}
  # return N best unseen movies
  return sorted(preds.items(), key=lambda x: x[1], reverse=True)[:n_rec]

# example: 10 suggestions for user 3
print(recommend(3, n_rec=10))

[(1079, 5.0), (1278, 5.0), (3089, 5.0), (3307, 5.0), (3462, 5.0), (104283, 5.0), (1223, 4.839036565364222), (3429, 4.8236116882661575), (720, 4.817987958357368), (3022, 4.755251341107619)]
