In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import math
from multiprocessing import Pool
import warnings
from functools import partial
import random

In [2]:
np.random.seed(42)
tqdm.pandas()
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [4]:
def parallelize(groups, func):
    num_workers = 16
    with Pool(num_workers) as p:
        return pd.concat(p.map(func, [group for name, group in groups])).sort_index()

In [5]:
train = pd.read_pickle('../data/train.pkl.xz')

In [6]:
title_means = train.mean()

In [7]:
def rmse(pred, true):
    return np.sqrt(np.nanmean((pred - true)**2))

## Model : Mean of Ratings of Most Similar Users (Cosine Similarity Matrix)

In [6]:
validate_X = pd.read_pickle('../data/validate_X.pkl.xz')
validate_Y = pd.read_pickle('../data/validate_Y.pkl.xz')

In [13]:
train = train.fillna(0)

sparse = sp.sparse.csr_matrix(train.values, dtype=np.float32)

def similarity_calculator(data):
    return pd.DataFrame(cosine_similarity(sparse, data), index=train.index, columns=data.index).astype(np.float16)

num_workers = 15
chunksize = int(validate_X.shape[0]/num_workers)+1
chunks = chunker(validate_X.fillna(0), chunksize)
with Pool(num_workers) as p:
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)
    
del chunks

similarity_matrix.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')

similarity_matrix = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')
def n_similar_users(column, n=1000):
    return pd.Series(column.sort_values(ascending=False).head(n).index, name=column.name)

def similar_users_ordered(column):
    return pd.Series(column.sort_values(ascending=False).index, name=column.name)

with Pool(15) as p:
    similar_users_ordered = pd.concat(p.map(similar_users_ordered, (tup[1] for tup in similarity_matrix.items())), axis=1)

similar_users_ordered.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

train = train.replace(0, np.NaN)

In [14]:
similar_users_ordered = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

In [15]:
def get_recommendations(username, num_sim_users=2000, avg_method='simple'):    
    sim_users_data = train.loc[similar_users_ordered[username].head(num_sim_users).values]

    watch_data = validate_X.loc[username]

    user_watched = watch_data.loc[watch_data.notnull()].index

    suggestables = set(sim_users_data.columns).difference(user_watched)

    sim_users_data = sim_users_data[suggestables]

    if avg_method == 'simple':
        pred_data = sim_users_data.mean(axis=0).sort_values(ascending=False)

    elif avg_method == 'weighted':
        # user_weights = [incremental_base**x for x in range(sim_users_data.shape[0], 0, -1)]
        user_weights = np.arange(sim_users_data.shape[0], 0, -1)

        weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()

        anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

        pred_data = (weighted_sum / anime_weights).sort_values(ascending=False)

    
    return pred_data

## Hyperparameter Tuning

### Number of Similar Users to Consider

In [17]:
num_sim_users_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 4000]
avg_method_list = ['simple', 'weighted']
# incremental_base_list = [1.0001, 1.001, 1.01, 1.1]

parameter_combs = []
for nsu in num_sim_users_list:
    for avgm in avg_method_list:
        parameter_combs.append((nsu, avgm))
            
random.shuffle(parameter_combs)

In [18]:
data = validate_X.sample(500, random_state=42).index
rmse_dict = {}

for nsu, avgm in tqdm(parameter_combs):
    with Pool(15) as p:
        partial_f = partial(get_recommendations, num_sim_users=nsu, avg_method=avgm)
        validate_pred = pd.DataFrame(p.map(partial_f, np.array(data)), index=data)

    rmse_dict[(nsu, avgm)] = rmse(validate_pred, validate_Y.loc[data])

100%|██████████| 18/18 [07:16<00:00, 24.26s/it]


In [19]:
rmse_dict

{(10, 'simple'): 0.2143944826334401,
 (4000, 'weighted'): 0.17993241160038503,
 (4000, 'simple'): 0.1805542327349845,
 (20, 'weighted'): 0.21065714211621878,
 (2000, 'weighted'): 0.17975068380955647,
 (200, 'simple'): 0.18994990031427866,
 (200, 'weighted'): 0.19230069767148963,
 (1000, 'simple'): 0.18097026013101253,
 (500, 'simple'): 0.18325626407655016,
 (100, 'simple'): 0.1943903428534364,
 (1000, 'weighted'): 0.18128137866286645,
 (50, 'weighted'): 0.202639283168223,
 (500, 'weighted'): 0.18454352045758826,
 (20, 'simple'): 0.20713744953324137,
 (10, 'weighted'): 0.21778560894173007,
 (50, 'simple'): 0.19910013127841067,
 (100, 'weighted'): 0.19688605876167084,
 (2000, 'simple'): 0.17999431105071925}

### The best model

In [20]:
data = validate_X.sample(1000, random_state=42).index
with Pool(15) as p:
    partial_f = partial(get_recommendations, num_sim_users=2000, avg_method='weighted')
    validate_pred = pd.DataFrame(p.map(partial_f, np.array(data)), index=data)

In [21]:
rmse(validate_pred, validate_Y.loc[data])

0.17991354396697642

## Testing

In [8]:
validate = pd.read_pickle('../data/validate.pkl.xz')

In [9]:
train = pd.concat([train, validate], axis=0)

In [10]:
test_X = pd.read_pickle('../data/test_X.pkl.xz')
test_Y = pd.read_pickle('../data/test_Y.pkl.xz')

In [11]:
train = train.fillna(0)

sparse = sp.sparse.csr_matrix(train.values, dtype=np.float32)

def similarity_calculator(data):
    return pd.DataFrame(cosine_similarity(sparse, data), index=train.index, columns=data.index).astype(np.float16)

num_workers = 15
chunksize = int(test_X.shape[0]/num_workers)+1
chunks = chunker(test_X.fillna(0), chunksize)
with Pool(num_workers) as p:
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)
    
del chunks

similarity_matrix.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix_test.pkl')

similarity_matrix = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix_test.pkl')
def n_similar_users(column, n=1000):
    return pd.Series(column.sort_values(ascending=False).head(n).index, name=column.name)

def similar_users_ordered(column):
    return pd.Series(column.sort_values(ascending=False).index, name=column.name)

with Pool(15) as p:
    similar_users_ordered = pd.concat(p.map(similar_users_ordered, (tup[1] for tup in similarity_matrix.items())), axis=1)

similar_users_ordered.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered_test.pkl')

train = train.replace(0, np.NaN)

KeyboardInterrupt: 

In [12]:
similar_users_ordered = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered_test.pkl')

In [13]:
def get_recommendations(username, num_sim_users=2000, avg_method='weighted'):    
    sim_users_data = train.loc[similar_users_ordered[username].head(num_sim_users).values]

    watch_data = test_X.loc[username]

    user_watched = watch_data.loc[watch_data.notnull()].index

    suggestables = set(sim_users_data.columns).difference(user_watched)

    sim_users_data = sim_users_data[suggestables]

    if avg_method == 'simple':
        pred_data = sim_users_data.mean(axis=0).sort_values(ascending=False)

    elif avg_method == 'weighted':
        # user_weights = [incremental_base**x for x in range(sim_users_data.shape[0], 0, -1)]
        user_weights = np.arange(sim_users_data.shape[0], 0, -1)

        weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()

        anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

        pred_data = (weighted_sum / anime_weights).sort_values(ascending=False)

    
    return pred_data

In [14]:
data = test_X.sample(2000, random_state=42).index
with Pool(15) as p:
    partial_f = partial(get_recommendations, num_sim_users=2000, avg_method='weighted')
    test_pred = pd.DataFrame(p.map(partial_f, np.array(data)), index=data)

In [21]:
model_rmse = rmse(test_pred, test_Y.loc[data])
model_rmse

0.1790261474692326

### Benchmark (Mean of the titles)

In [16]:
mean_pred = pd.DataFrame([title_means] * test_X.shape[0], index=test_X.index)

In [17]:
mean_pred

title,.hack//Legend Of The Twilight,.hack//Roots,.hack//SIGN,009-1,07-Ghost,100 Sleeping Princes & the Kingdom of Dreams,100% Pascal-sensei (2017),11eyes,12-Sai: Chiccha na Mune no Tokimeki,12-Sai: Chiccha na Mune no Tokimeki 2,...,gdgd Fairies,gdgd Fairies 2,gdgd men's party,number24,revisions,sola,www.Working!!,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
merlin44,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
Pcnumero1,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
Krigod,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
Luiven801,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
sylphine,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
jenniferm1,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
Burokun,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
camilasantander21,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114
GorillazAlchemist,0.398595,0.437319,0.440021,0.318789,0.458356,0.322079,0.316159,0.343093,0.401711,0.402605,...,0.398049,0.445373,0.273214,0.428703,0.319214,0.468144,0.503494,0.545162,0.608905,0.31114


In [22]:
mean_rmse = rmse(mean_pred, test_Y)
mean_rmse

0.1848834754104757

## Comparison to the Benchmark

In [26]:
((model_rmse - mean_rmse) / mean_rmse)*100

-3.1681186911046266

3.168% lower RMSE than the benchmark