In [60]:
import numpy as np
import pandas as pd 

from collections import defaultdict
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNBaseline


In [61]:
train_cols = ['user_id', 'item_id', 'rating', 'timestamp']
item_cols = ['item_id', 'movie', 'release_date', 'v_release_date', 'imdb_url', 'unknown', 'action', 
             'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
             'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi','Thriller', 'War', 'Western']

train_df = pd.read_csv('../data/MovieLens.training', sep='\t', lineterminator='\n')
test_df = pd.read_csv('../data/MovieLens.test', sep='\t', lineterminator='\n')
item_df = pd.read_csv('../data/MovieLens.item', sep='|', lineterminator='\n')

train_df.columns = train_cols
test_df.columns = train_cols
item_df.columns = item_cols

In [62]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
test_data = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)

trainset = train_data.build_full_trainset()

In [63]:
def get_top_n(df, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        test_df(list of Prediction objects): dataframe with predictions.
        n(int): The number of recommendation to output for each user. Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [raw item id, ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for index, row in df.iterrows():
        top_n[row["user_id"]].append((row['item_id'], row['predictions']))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_items = [item_rating_t[0] for item_rating_t in user_ratings[:n]]
        top_n[uid] = top_items

    return top_n

### User based KNN recommender system model

In [69]:
# - https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=KNNBaseline#use-a-custom-dataset
sim_options_user = {'name': 'pearson_baseline', 'user_based': True}

userBasedKNN = KNNBaseline(sim_options=sim_options_user)
userBasedKNN.fit(trainset)

prediction = []
for index, row in test_df.iterrows():
    pred = userBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
test_df['predictions'] = prediction

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [70]:
top10 = get_top_n(test_df)
for user in top10.keys():
    if user < 16:
        print("User ", user, " : ", top10[user])

User  1.0  :  [114.0, 64.0, 272.0, 174.0, 98.0, 134.0, 100.0, 12.0, 190.0, 56.0]
User  2.0  :  [50.0, 313.0, 251.0, 19.0, 315.0, 316.0, 297.0, 303.0, 257.0, 298.0]
User  3.0  :  [318.0, 272.0, 345.0, 307.0, 300.0, 328.0, 327.0, 332.0, 331.0, 343.0]
User  4.0  :  [50.0, 357.0, 303.0, 354.0, 288.0, 361.0, 294.0, 356.0, 264.0, 260.0]
User  5.0  :  [173.0, 89.0, 445.0, 176.0, 100.0, 1.0, 98.0, 42.0, 79.0, 185.0]
User  6.0  :  [480.0, 318.0, 483.0, 515.0, 513.0, 488.0, 528.0, 479.0, 187.0, 134.0]
User  7.0  :  [174.0, 223.0, 127.0, 511.0, 483.0, 661.0, 182.0, 543.0, 185.0, 657.0]
User  8.0  :  [50.0, 172.0, 127.0, 183.0, 79.0, 651.0, 511.0, 210.0, 190.0, 89.0]
User  9.0  :  [487.0, 479.0, 527.0, 521.0, 691.0, 298.0, 507.0, 340.0, 6.0, 286.0]
User  10.0  :  [483.0, 474.0, 127.0, 98.0, 603.0, 488.0, 493.0, 56.0, 100.0, 199.0]
User  11.0  :  [190.0, 100.0, 735.0, 12.0, 524.0, 191.0, 194.0, 22.0, 736.0, 740.0]
User  12.0  :  [318.0, 50.0, 196.0, 172.0, 96.0, 28.0, 191.0, 735.0, 204.0, 82.0]
Use

### Item based KNN recommender system model

In [67]:
sim_options_items = {'name': 'pearson_baseline', 'user_based': False}

itemBasedKNN = KNNBaseline(sim_options=sim_options_items)
itemBasedKNN.fit(trainset)

item_test_df = test_df.copy()
prediction = []

for index, row in item_test_df.iterrows():
    pred = itemBasedKNN.predict(row["user_id"], row["item_id"], row["rating"], verbose=False)
    prediction.append(pred[3])
    
item_test_df['predictions'] = prediction

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [68]:
top10 = get_top_n(item_test_df)
for user in top10.keys():
    if user < 16:
        print("User ", user, " : ", top10[user])

User  1.0  :  [170.0, 174.0, 272.0, 64.0, 134.0, 100.0, 12.0, 60.0, 190.0, 98.0]
User  2.0  :  [50.0, 315.0, 251.0, 19.0, 313.0, 297.0, 316.0, 303.0, 292.0, 298.0]
User  3.0  :  [318.0, 345.0, 272.0, 307.0, 331.0, 354.0, 348.0, 335.0, 327.0, 334.0]
User  4.0  :  [303.0, 357.0, 361.0, 50.0, 354.0, 288.0, 356.0, 260.0, 264.0, 294.0]
User  5.0  :  [89.0, 176.0, 173.0, 100.0, 144.0, 1.0, 79.0, 69.0, 109.0, 429.0]
User  6.0  :  [483.0, 515.0, 318.0, 513.0, 488.0, 357.0, 134.0, 480.0, 478.0, 199.0]
User  7.0  :  [643.0, 483.0, 174.0, 172.0, 223.0, 657.0, 528.0, 8.0, 127.0, 191.0]
User  8.0  :  [50.0, 511.0, 183.0, 172.0, 79.0, 176.0, 127.0, 651.0, 89.0, 190.0]
User  9.0  :  [479.0, 527.0, 487.0, 521.0, 340.0, 298.0, 507.0, 6.0, 691.0, 286.0]
User  10.0  :  [483.0, 603.0, 98.0, 474.0, 64.0, 127.0, 191.0, 488.0, 199.0, 651.0]
User  11.0  :  [190.0, 735.0, 191.0, 216.0, 12.0, 22.0, 100.0, 194.0, 429.0, 736.0]
User  12.0  :  [50.0, 196.0, 318.0, 735.0, 28.0, 282.0, 172.0, 143.0, 96.0, 15.0]
User