In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, accuracy, Reader, SVD, SVDpp, NMF, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SlopeOne, NormalPredictor
from surprise.model_selection import train_test_split, cross_validate

# Data

In [2]:
# Use the zip file
from zipfile import ZipFile
with ZipFile('data/ml-100k.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('./data/ml-100k/u.user', sep='|', names=u_cols,  encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating']
ratings_df = pd.read_csv('./data/ml-100k/u.data', sep='\t', usecols=[0,1,2], names=r_cols, encoding='latin-1')

i_cols = ['movie_id', 'movie_title']
items = pd.read_csv('./data/ml-100k/u.item', sep='|', usecols=[0, 1], names=i_cols, encoding='latin-1')

ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


# EDA

In [3]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = ratings_df['rating'].value_counts().sort_index(ascending=False)

trace = go.Bar(
    x = data.index,
     text = ['{:.1f} %'.format(val) for val in (data.values / ratings_df.shape[0] * 100)],
     textposition = 'auto',
     textfont = dict(color = '#000000'),
     y = data.values
)

layout = dict(
     title='Distribution of Ratings', 
     xaxis = dict(title = 'Rating'),
     yaxis = dict(title = 'Count')
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [4]:
data = ratings_df.groupby('movie_id')['rating'].count()

trace = go.Histogram(
    x = data.values,
    name = 'Ratings',
    xbins = dict(
        start = 0,
        end = 100,
        size = 2
    )
)

layout = go.Layout(
    title = 'Distribution of Ratings per Movie',
    xaxis = dict(title = 'Number of Ratings per movie'),
    yaxis = dict(title = 'Count'),
    bargap = 0.1
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [5]:
data = ratings_df.groupby('user_id')['rating'].count()

trace = go.Histogram(
    x = data.values,
    name = 'Ratings',
    xbins = dict(
        start = 0,
        end = 100,
        size = 2
    )
)

layout = go.Layout(
    title = 'Distribution of Ratings per User',
    xaxis = dict(title = 'Ratings per user'),
    yaxis = dict(title = 'Count'),
    bargap = 0.1
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Benchmark Algorithms

In [6]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

In [7]:
benchmark = []

for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])])
    benchmark.append(tmp)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.926315,16.614837,5.770494
KNNBaseline,0.935688,0.57893,6.192411
SVD,0.945415,0.969835,0.234068
BaselineOnly,0.947668,0.232038,0.212419
SlopeOne,0.948941,0.611808,4.028263
KNNWithMeans,0.955647,0.346831,5.014912
KNNWithZScore,0.956995,0.42321,4.736002
CoClustering,0.97545,1.45114,0.19129
NMF,0.975706,1.984828,0.233831
KNNBasic,0.987967,0.3123,4.343794


# Training

In [8]:
print("Using SVDpp")

algo = SVDpp(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02
)

cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=True)

Using SVDpp
Evaluating RMSE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9284  0.9277  0.9284  0.9282  0.0003  
Fit time          35.83   38.45   34.95   36.41   1.49    
Test time         5.92    6.50    3.68    5.36    1.21    


{'test_rmse': array([0.92839127, 0.92767834, 0.92843054]),
 'fit_time': (35.82781481742859, 38.45448064804077, 34.946890354156494),
 'test_time': (5.9175519943237305, 6.496156215667725, 3.6805288791656494)}

In [9]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVDpp(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.02
)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.9212


0.9211568704924165

In [10]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [11]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
6815,394,22,5.0,5.0,{'was_impossible': False},120,216,0.0
9735,379,127,5.0,5.0,{'was_impossible': False},146,313,0.0
2257,291,50,5.0,5.0,{'was_impossible': False},218,436,0.0
10419,457,172,5.0,5.0,{'was_impossible': False},201,281,0.0
16753,711,408,5.0,5.0,{'was_impossible': False},170,87,0.0
23980,592,98,5.0,5.0,{'was_impossible': False},260,286,0.0
6051,181,1132,1.0,1.0,{'was_impossible': False},328,25,0.0
1766,507,121,5.0,5.0,{'was_impossible': False},43,333,0.0
13388,416,216,5.0,5.0,{'was_impossible': False},366,225,0.0
21333,405,1029,1.0,1.0,{'was_impossible': False},535,12,0.0


In [12]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
21982,405,842,5.0,1.542267,{'was_impossible': False},535,16,3.457733
14339,151,469,1.0,4.523792,{'was_impossible': False},228,49,3.523792
22702,405,238,5.0,1.457775,{'was_impossible': False},535,188,3.542225
6186,148,191,1.0,4.554194,{'was_impossible': False},49,214,3.554194
21112,137,51,1.0,4.572899,{'was_impossible': False},35,55,3.572899
22622,239,318,1.0,4.625995,{'was_impossible': False},119,220,3.625995
12166,648,69,1.0,4.686965,{'was_impossible': False},227,258,3.686965
20658,867,276,1.0,4.703596,{'was_impossible': False},69,208,3.703596
12840,239,190,1.0,4.827079,{'was_impossible': False},119,92,3.827079
20142,887,274,1.0,4.878277,{'was_impossible': False},121,142,3.878277


# Save & Load

In [13]:
import os

def get_current_version(version_file='version.txt'):
    """Read the current version from the version file."""
    if not os.path.exists(version_file):
        with open(version_file, 'w') as f:
            f.write('0')
        return 0
    with open(version_file, 'r') as f:
        return int(f.read().strip())

def increment_version(version_file='version.txt'):
    """Increment the version number and save it back to the file."""
    current_version = get_current_version(version_file)
    new_version = current_version + 1
    with open(version_file, 'w') as f:
        f.write(str(new_version))
    return new_version

# Save

In [14]:
import pickle

version_file = 'ml_model_version.txt'
new_version = increment_version(version_file)

model_name = f"model_v{new_version}"
os.makedirs(f"ml_models/{model_name}", exist_ok=True)
save_path = os.path.join(f"ml_models/{model_name}", model_name)

with open(f"{save_path}.pkl", 'wb') as f:
    pickle.dump(algo, f)

print(f"Model saved to {save_path}")

Model saved to ml_models/model_v1/model_v1


# Load

In [15]:
# import pickle

# version_file = 'ml_model_version.txt'
# last_version = get_current_version(version_file)

# load_path = f"ml_models/model_v{last_version}/model_v{last_version}.pkl"

# with open(load_path, 'rb') as f:
    # algo = pickle.load(f)

# print(f"Model loaded from {load_path}")

# Inference

In [22]:
def check_user_rating(user_id, movie_id):
    if ratings_df[(ratings_df['user_id'] == user_id) & (ratings_df['movie_id'] == movie_id)].empty:
        print("User has not rated the movie")
    else:
        print(f"User has rated the movie with rating: {ratings_df[(ratings_df['user_id'] == user_id) & (ratings_df['movie_id'] == movie_id)]['rating'].values[0]}")

check_user_rating(196, 202)
check_user_rating(197, 64)

User has rated the movie with rating: 3
User has not rated the movie


## Single Prediction

In [17]:
user_id = 196
item_id = 302

pred = algo.predict(user_id, item_id)
print(pred)

user: 196        item: 302        r_ui = None   est = 4.06   {'was_impossible': False}


## Predictions For A User

In [18]:
user_id = 197
all_items = ratings_df['movie_id'].unique()

# Exclude the items that are already rated by the user
items_rated_by_user = ratings_df[ratings_df['user_id'] == user_id]['movie_id'].values
items_to_pred = np.setdiff1d(all_items, items_rated_by_user)

predictions = [algo.predict(user_id, item_id) for item_id in items_to_pred]

# Then sort the predictions by their estimated ratings
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

# Get the top n items
top_n = 10
for pred in sorted_predictions[:top_n]:
    movie_name = items[items['movie_id'] == pred.iid]['movie_title'].values[0]
    print(f'{pred.iid} - {movie_name} - {pred.est}')

# print(len(all_items), len(items_rated_by_user), len(items_to_pred))

64 - Shawshank Redemption, The (1994) - 4.793489669371369
12 - Usual Suspects, The (1995) - 4.5711821186356465
69 - Forrest Gump (1994) - 4.386736741773548
173 - Princess Bride, The (1987) - 4.34786779740122
114 - Wallace & Gromit: The Best of Aardman Animation (1996) - 4.300245727741361
169 - Wrong Trousers, The (1993) - 4.278377960854714
28 - Apollo 13 (1995) - 4.262295337644336
408 - Close Shave, A (1995) - 4.247867033181247
318 - Schindler's List (1993) - 4.20680807056584
484 - Maltese Falcon, The (1941) - 4.140929033114934


## Top N All Time Popular

In [19]:
# Show top 10 popular movies by mean of sum of ratings where the number of ratings is greater than 100
movie_stats = ratings_df.groupby('movie_id').agg({'rating': [np.size, np.mean]})
popular_movies = movie_stats['rating']['size'] >= 200
movie_stats[popular_movies].sort_values([('rating', 'mean')], ascending=False).head(10)


The provided callable <function mean at 0x7f7da402f7e0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.



Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
318,298,4.466443
483,243,4.45679
64,283,4.44523
603,209,4.38756
12,267,4.385768
50,583,4.358491
427,219,4.292237
357,264,4.291667
98,390,4.289744
127,413,4.283293


## Top N For All Users

In [20]:
from collections import defaultdict

def get_top_n_recommendations(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

predictions = algo.test(testset)
top_n_recommendations = get_top_n_recommendations(predictions, n=10)

for uid, user_ratings in top_n_recommendations.items():
    print(uid, [iid for (iid, _) in user_ratings])

500 [168, 182, 100, 285, 242, 179, 611, 135, 30, 116]
534 [150, 237, 1, 273, 475, 274, 331, 149, 93, 24]
805 [269, 175, 603, 190, 86, 527, 425, 223, 185, 432]
749 [174, 96, 483, 98, 100, 378, 568, 153, 79, 511]
13 [64, 511, 199, 69, 435, 705, 182, 97, 193, 154]
429 [483, 357, 480, 12, 124, 178, 166, 89, 482, 45]
618 [64, 174, 318, 172, 190, 12, 127, 132, 23, 181]
24 [318, 127, 357, 100, 223, 153, 475, 275, 276, 55]
498 [474, 179, 9, 56, 100, 124, 190, 191, 137, 475]
541 [50, 511, 196, 79, 151, 143, 83, 654, 8, 82]
201 [134, 192, 98, 100, 179, 480, 190, 513, 511, 137]
244 [135, 86, 357, 169, 183, 180, 528, 64, 1039, 168]
405 [194, 169, 202, 181, 30, 97, 22, 216, 641, 132]
277 [302, 9, 124, 257, 282, 1129, 274, 181, 117, 762]
804 [174, 1, 651, 483, 479, 654, 181, 176, 143, 187]
380 [357, 114, 134, 479, 427, 12, 64, 654, 709, 190]
889 [64, 408, 172, 657, 480, 479, 194, 178, 357, 190]
916 [23, 60, 318, 168, 180, 64, 56, 523, 48, 223]
935 [121, 282, 181, 300, 284, 1048]
141 [300, 121, 1, 11