In [42]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split
import collections
from collections import defaultdict
import math

If using anaconda, use ***conda install -c conda-forge scikit-surprise*** to install surprise package.

Surprise is *Simple Python Recommendation System Engine* it can use KNN as baseline model

In [7]:
ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python'
)
users = pd.read_csv(
    'ml-1m/users.dat',
    sep='::',
    names=['user_id', 'gender', 'age', 'occupation', 'zip'],
    engine='python'
)
movies = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    names=['movie_id', 'title', 'genres'],
    engine='python',
    encoding='latin-1'
)

data = pd.merge(ratings, users, on='user_id', how='left')
data = pd.merge(data, movies, on='movie_id', how='left')

In [10]:
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


### Setting up Recommendation system (Surprise)

In [13]:
df_ratings = ratings[['user_id', 'movie_id', 'rating']].copy()
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df_ratings, reader)

train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)

In [14]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}

model_knn = KNNBasic(sim_options=sim_options)
model_knn.fit(train_set)
pred_knn = model_knn.test(test_set)
rmse_knn = accuracy.rmse(pred_knn)
print('KNN RMSE:', rmse_knn)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9988
KNN RMSE: 0.9988005727431559


In [15]:
model_svd = SVD()
model_svd.fit(train_set)
pred_svd = model_svd.test(test_set)
rmse_svd = accuracy.rmse(pred_svd)
print('SVD RMSE:', rmse_svd)

RMSE: 0.8731
SVD RMSE: 0.8730587435087336


In [47]:
def precision_recall(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    precisions = []
    recalls = []
    
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        n_rec_k = sum((true_r >= threshold) for (_, _, true_r) in user_ratings[:k])
        
        if k > 0:
            precision = n_rec_k / k
        else:
            precision = 1
        if n_rel != 0:
            recall = n_rec_k / n_rel
        else:
            recall = 1
        precisions.append(precision)
        recalls.append(recall)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    return mean_precision, mean_recall

### Data Sparsity 

In [48]:
fractions = [0.1, 0.3, 0.5, 0.7]
results_rmse = {}
results_prec = {}
results_recall = {}

for f in fractions:
    print(f'==========Data fraction: {f}==========')
    df_sample = df_ratings.sample(frac=f, random_state=42)
    dataset_sample = Dataset.load_from_df(df_sample, reader)
    train_set_sample, test_set_sample = train_test_split(dataset_sample, test_size=0.2, random_state=42)
    
    model_knn = KNNBasic(sim_options=sim_options)
    model_knn.fit(train_set_sample)
    pred_knn = model_knn.test(test_set_sample)
    
    rmse_knn = accuracy.rmse(pred_knn)
    results_rmse[f] = rmse_knn
    
    p_val, r_val = precision_recall(pred_knn, k=10, threshold=3.5)
    results_prec[f] = p_val
    results_recall[f] = r_val

print("Sparsity Analysis")
for f in fractions:
    print(f"Fraction: {f}, RMSE: {results_rmse[f]}, Precision: {results_prec[f]}, Recall: {results_recall[f]}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0994
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0479
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0206
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0090
Sparsity Analysis
Fraction: 0.1, RMSE: 1.0994407902790586, Precision: 0.22392598894087623, Recall: 0.9759417774146141
Fraction: 0.3, RMSE: 1.0479165161276993, Precision: 0.38932662270045126, Recall: 0.8699812861426067
Fraction: 0.5, RMSE: 1.020628624860206, Precision: 0.4998320456835741, Recall: 0.7852348081517267
Fraction: 0.7, RMSE: 1.0090022068848437, Precision: 0.5780418882978723, Recall: 0.7164635733766043


### Item Popularity

In [49]:
df_itempop = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python'
)

df_itempop = df_itempop[['user_id', 'movie_id', 'rating']] # drop timestamp for now
data_itempop = Dataset.load_from_df(df_itempop, reader)
train_set_ip, test_set_ip = train_test_split(data_itempop, test_size=0.2, random_state=42)


In [None]:

train_items = collections.defaultdict(int)

for uid, iid, rating in train_set.all_ratings():
    raw_iid = train_set.to_raw_iid(int(iid))
    train_items[raw_iid] += 1

In [None]:

sorted_items = sorted(train_items.items(), key=lambda x: x[1], reverse=True)
num_popular_items = math.ceil(0.2 * len(sorted_items))
popular_item_ids = set([item[0] for item in sorted_items[:num_popular_items]])

print(f"Number of popular items: {num_popular_items}")


Number of popular items: 735


In [56]:
model_knn_pop = KNNBasic(sim_options=sim_options)
model_knn_pop.fit(train_set)
pred_knn_pop = model_knn_pop.test(test_set_ip)

rmse_all = accuracy.rmse(pred_knn_pop)
p_all, r_all = precision_recall(pred_knn_pop, k=10, threshold=3.5)

preds_popular = []
preds_longtail = []

for p in pred_knn_pop:
    if p.iid in popular_item_ids:
        preds_popular.append(p)
    else:
        preds_longtail.append(p)
        
rmse_popular = accuracy.rmse(preds_popular)
p_pop, r_pop = precision_recall(preds_popular, k=10, threshold=3.5)
rmse_longtail = accuracy.rmse(preds_longtail)
p_long, r_long = precision_recall(preds_longtail, k=10, threshold=3.5)

print("Item Popularity Bias Analysis")
print(f"Overall RMSE: {rmse_all}, Precision: {p_all}, Recall: {r_all}")
print(f"Popular Items RMSE: {rmse_popular}, Precision: {p_pop}, Recall: {r_pop}")
print(f"Long-tail Items RMSE: {rmse_longtail}, Precision: {p_long}, Recall: {r_long}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9988
RMSE: 0.9524
RMSE: 1.0795
Item Popularity Bias Analysis
Overall RMSE: 0.9988005727431559, Precision: 0.6575335653903531, Recall: 0.6296417338126692
Popular Items RMSE: 0.9523874334003957, Precision: 0.6224292845257904, Recall: 0.712438557242363
Long-tail Items RMSE: 1.0794934990129461, Precision: 0.3218455402465555, Recall: 0.864917496083326


### Cold start

In [57]:
df_cold = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    engine='python'
)

df_cold = df_cold[['user_id', 'movie_id', 'rating']] # drop timestamp for now

In [58]:

data_cold = Dataset.load_from_df(df_cold, reader)
train_set_cold, test_set_cold = train_test_split(data_cold, test_size=0.2, random_state=42)

In [59]:
test_df_cold = pd.DataFrame(test_set_cold, columns=['user_id', 'movie_id', 'rating'])
new_users = [
    (6041,1,5),
    (6041,50,1),
    (6042,10,1),
    (6042,100,5),
    (6043,33,2),
    (6043,55,5),
    (6044,66,1),
    (6044,77,5)     
    ]

new_users_df = pd.DataFrame(new_users, columns=['user_id', 'movie_id', 'rating'])
test_df_cold = pd.concat([test_df_cold, new_users_df], ignore_index=True)


In [60]:
test_set_new = list(test_df_cold[['user_id', 'movie_id', 'rating']].itertuples(index=False, name=None))

model_knn_cold = KNNBasic(sim_options=sim_options)
model_knn_cold.fit(train_set_cold)
pred_knn_cold = model_knn_cold.test(test_set_new)
rmse_knn_cold = accuracy.rmse(pred_knn_cold)

new_user_ids = {6041, 6042, 6043, 6044}
#filter out predictions for new users
preds_new = [p for p in pred_knn_cold if p.uid in new_user_ids]
preds_old = [p for p in pred_knn_cold if p.uid not in new_user_ids]

rmse_new = accuracy.rmse(preds_new)
rmse_old = accuracy.rmse(preds_old)

p_all_cold, r_all_cold = precision_recall(pred_knn_cold, k=10, threshold=3.5)
p_new, r_new = precision_recall(preds_new, k=10, threshold=3.5)
p_old, r_old = precision_recall(preds_old, k=10, threshold=3.5)
print("Cold Start Analysis")
print(f"Overall RMSE: {rmse_knn_cold}, Precision: {p_all_cold}, Recall: {r_all_cold}")
print(f"New Users RMSE: {rmse_new}, Precision: {p_new}, Recall: {r_new}")
print(f"Old Users RMSE: {rmse_old}, Precision: {p_old}, Recall: {r_old}")


Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9989
RMSE: 1.9539
RMSE: 0.9988
Cold Start Analysis
Overall RMSE: 0.9988570278743717, Precision: 0.6571641543813153, Recall: 0.6298871260711998
New Users RMSE: 1.9539083898500522, Precision: 0.1, Recall: 1.0
Old Users RMSE: 0.9988005727431559, Precision: 0.6575335653903531, Recall: 0.6296417338126692
