In [1]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

--2025-04-01 03:49:07--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2025-04-01 03:49:08 (4.95 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]



In [2]:
!unzip ml-100k.zip -d ./ml-100k

Archive:  ml-100k.zip
replace ./ml-100k/ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/allbut.pl  
replace ./ml-100k/ml-100k/mku.sh? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/mku.sh  
replace ./ml-100k/ml-100k/README? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/README  
replace ./ml-100k/ml-100k/u.data? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/u.data  
replace ./ml-100k/ml-100k/u.genre? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/u.genre  
replace ./ml-100k/ml-100k/u.info? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/u.info  
replace ./ml-100k/ml-100k/u.item? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/u.item  
replace ./ml-100k/ml-100k/u.occupation? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ./ml-100k/ml-100k/u.occupation  
replace ./ml-100k/ml-100k/u.user? [y]es, [n]o,

In [3]:
import pandas as pd

In [4]:
ratings = pd.read_csv("./ml-100k/ml-100k/u.data", sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
print(ratings.head())

movies = pd.read_csv("./ml-100k/ml-100k/u.item", sep="|", encoding="latin-1", header=None,
                     names=["movie_id", "title", "release_date", "video_release_date", "IMDb_URL"] + [f"genre_{i}" for i in range(19)])
print(movies.head())

print("Number of users:", ratings["user_id"].nunique())
print("Number of items:", ratings["item_id"].nunique())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  genre_0  genre_1  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0        0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0        1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0        0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0      

In [5]:
user_item_matrix = ratings.pivot(index="user_id", columns="item_id", values="rating").fillna(0)
print(user_item_matrix.head())

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5         4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

item_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                              
1         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5         0.0   0.0   0.0   0.0   0.0   0.0   0

In [6]:
# 1st method is matrix factorization using svd
# https://machinelearningmastery.com/building-a-recommender-system-from-scratch-with-matrix-factorization-in-python/
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

In [7]:
# data prepare for surprise
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(ratings[["user_id", "item_id", "rating"]], reader)

# svd for matrix factorization
svd = SVD(n_epochs=20, lr_all=0.005, reg_all=0.02)
cross_val_results = cross_validate(svd, data_surprise, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9316  0.9435  0.9362  0.9433  0.9241  0.9357  0.0073  
MAE (testset)     0.7367  0.7436  0.7375  0.7415  0.7279  0.7374  0.0054  
Fit time          1.45    1.48    1.36    2.16    1.35    1.56    0.30    
Test time         0.19    0.19    0.10    0.31    0.13    0.18    0.07    


In [8]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2


In [9]:
# 2nd method is als using implicit
import numpy as np
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [10]:
# https://benfred.github.io/implicit/
# convert user item to sparse
sparse_user_item = csr_matrix(user_item_matrix.values)

# init and train
als_model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)
als_model.fit(sparse_user_item.T)

# compute recommendations for sample user
user_id = 1
user_items = sparse_user_item[user_id-1]
recommendations = als_model.recommend(user_id-1, user_items)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

In [11]:
print("Recommendations (item indices, scores):", recommendations)

Recommendations (item indices, scores): (array([891, 621, 619, 653, 867, 401, 636, 707, 312, 820], dtype=int32), array([1.2905084, 1.2798402, 1.2796443, 1.2756964, 1.2743874, 1.2659827,
       1.251369 , 1.2221775, 1.2148552, 1.1994851], dtype=float32))
