In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import math
from multiprocessing import Pool
import warnings
from functools import partial
import random

In [2]:
# Seed both random number generators this notebook draws from: NumPy's global
# RNG and the stdlib `random` module (the hyperparameter-grid cell calls
# `random.shuffle`, which np.random.seed does not affect).
np.random.seed(42)
random.seed(42)
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()
# Hide RuntimeWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Parameters
    ----------
    seq : any object supporting ``len()`` and slice indexing
    size : int
        Maximum length of each slice; the last slice may be shorter.

    Returns
    -------
    generator
        Yields ``seq[0:size]``, ``seq[size:2*size]``, ... in order.
    """
    # The start offsets are computed up front, so an invalid ``size`` (e.g. 0)
    # fails at call time rather than on first iteration.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [4]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in parallel and stitch the results together.

    Parameters
    ----------
    groups : iterable of ``(name, group)`` pairs
        For example a pandas ``groupby`` object. Only the ``group`` part is
        passed to ``func``.
    func : callable
        Must be picklable (it is sent to worker processes) and return a pandas
        object that ``pd.concat`` accepts.
    num_workers : int, default 16
        Upper bound on the number of worker processes.

    Returns
    -------
    The concatenation of all per-group results, sorted by index. An empty
    ``DataFrame`` is returned when ``groups`` yields nothing.
    """
    frames = [group for _, group in groups]
    if not frames:
        # pd.concat raises on an empty list; there is no work to do, so skip
        # starting a pool altogether.
        return pd.DataFrame()

    # Never start more processes than there are groups to hand out.
    with Pool(min(num_workers, len(frames))) as p:
        return pd.concat(p.map(func, frames)).sort_index()

In [5]:
# Load the training frame. Later cells index its rows by user label and treat
# its columns as the items that can be recommended.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
df = pd.read_pickle('../data/train.pkl.xz')

In [6]:
# Load the validation split: `validate_X` is what the recommender sees for
# each validation user, `validate_Y` is what predictions are scored against.
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
validate_X = pd.read_pickle('../data/validate_X.pkl.xz')
validate_Y = pd.read_pickle('../data/validate_Y.pkl.xz')

In [None]:
# Replace missing entries with 0 so the frame can be stored as a sparse
# matrix. `df` itself is overwritten here; the end of this cell turns the
# zeros back into NaN.
df = df.fillna(0)

# CSR copy of the values in float32; read as a global by similarity_calculator.
sparse = sp.sparse.csr_matrix(df.values, dtype=np.float32)

def similarity_calculator(data):
    """Cosine similarity between every row of ``df`` and every row of ``data``.

    Reads the module-level ``sparse`` matrix (the values of ``df``) and
    ``df.index``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to compare against; its index becomes the result's columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``df.index``, one column per row of ``data``, stored as
        float16.
    """
    scores = cosine_similarity(sparse, data)
    labelled = pd.DataFrame(scores, index=df.index, columns=data.index)
    return labelled.astype(np.float16)

# Split the zero-filled validation users into chunks, one per worker. The +1
# rounds the chunk size up, so at most `num_workers` chunks are produced.
num_workers = 15
chunksize = int(validate_X.shape[0]/num_workers)+1
chunks = chunker(validate_X.fillna(0), chunksize)
# Each worker scores one chunk against `sparse`; the per-chunk frames are
# joined side by side (axis=1), giving one column per validation user.
with Pool(num_workers) as p:
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)
    
# The chunk generator is exhausted at this point; drop the name.
del chunks

# Persist the matrix to disk ...
similarity_matrix.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')

# ... and load it back from the same path.
similarity_matrix = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')
def n_similar_users(column, n=1000):
    """Return the labels of the ``n`` largest entries of ``column``.

    Parameters
    ----------
    column : pandas.Series
        Similarity scores keyed by user label.
    n : int, default 1000
        How many labels to keep.

    Returns
    -------
    pandas.Series
        The labels ordered from highest to lowest score, on a fresh 0..k-1
        index, carrying the same ``name`` as ``column``.
    """
    ranked = column.sort_values(ascending=False)
    top_labels = ranked.index[:n]
    return pd.Series(top_labels, name=column.name)

def rank_users_by_similarity(column):
    """Return every label of ``column`` ordered from highest to lowest score.

    Parameters
    ----------
    column : pandas.Series
        Similarity scores keyed by user label.

    Returns
    -------
    pandas.Series
        The labels in descending score order on a fresh 0..k-1 index,
        carrying the same ``name`` as ``column``.
    """
    return pd.Series(column.sort_values(ascending=False).index, name=column.name)

# The worker has its own name so that the `similar_users_ordered` DataFrame
# built below does not replace the function it was computed with.
with Pool(15) as p:
    similar_users_ordered = pd.concat(
        p.map(rank_users_by_similarity, (column for _, column in similarity_matrix.items())),
        axis=1,
    )

# NOTE(review): a later cell reads this artifact from
# '/mnt/int_drive_0/data/...' (lower-case "data"); on a case-sensitive
# filesystem that is a different directory. Confirm which spelling is right.
similar_users_ordered.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

# Undo the zero-fill from the top of this cell. Any genuine 0 values in `df`
# become NaN as well. `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0.
df = df.replace(0, np.nan)

In [8]:
# Reload the ranked-neighbour table: one column per validation user, each
# listing user labels from most to least similar.
# NOTE(review): this path uses lower-case "data", while the cell that writes
# the file uses '/mnt/int_drive_0/Data/...'. On a case-sensitive filesystem
# these are different locations; confirm which one is intended.
similar_users_ordered = pd.read_pickle('/mnt/int_drive_0/data/anime-recommendation-engine/similar_users_ordered.pkl')

In [10]:
def get_recommendations(username, num_sim_users=2000, avg_method='simple', *,
                        ratings=None, neighbours=None, watched=None):
    """Predict a score for every title ``username`` has not yet rated.

    The prediction for a title is an average of the values given to it by the
    ``num_sim_users`` users most similar to ``username``.

    Parameters
    ----------
    username : label
        A column of ``neighbours`` and a row of ``watched``.
    num_sim_users : int, default 2000
        How many of the most similar users to average over.
    avg_method : {'simple', 'weighted'}, default 'simple'
        ``'simple'`` is the plain mean over the similar users who have a value
        for the title. ``'weighted'`` weights users linearly by rank (the most
        similar user gets the largest weight) and ignores missing values in
        both numerator and denominator.
    ratings, neighbours, watched : pandas.DataFrame, optional
        Data to use instead of the notebook globals ``df``,
        ``similar_users_ordered`` and ``validate_X`` respectively. Passing
        them explicitly makes the function usable outside the notebook.

    Returns
    -------
    pandas.Series
        Predicted score per not-yet-rated title, sorted from highest to
        lowest. Titles no similar user has a value for are NaN.

    Raises
    ------
    ValueError
        If ``avg_method`` is not one of the supported names.
    """
    if avg_method not in ('simple', 'weighted'):
        # Fail early with a clear message instead of an UnboundLocalError at
        # the return statement.
        raise ValueError(
            "avg_method must be 'simple' or 'weighted', got {!r}".format(avg_method)
        )

    ratings = df if ratings is None else ratings
    neighbours = similar_users_ordered if neighbours is None else neighbours
    watched = validate_X if watched is None else watched

    sim_users_data = ratings.loc[neighbours[username].head(num_sim_users).values]

    watch_data = watched.loc[username]
    user_watched = watch_data.loc[watch_data.notnull()].index

    # Select the unseen titles with a boolean mask. Indexing a DataFrame with
    # a `set` is rejected by current pandas and, where it did work, produced
    # an arbitrary column order; the mask keeps the original column order.
    suggestable = ~sim_users_data.columns.isin(user_watched)
    sim_users_data = sim_users_data.loc[:, suggestable]

    if avg_method == 'simple':
        pred_data = sim_users_data.mean(axis=0)
    else:
        # Rank-based weights: n for the most similar user down to 1.
        user_weights = np.arange(sim_users_data.shape[0], 0, -1)

        weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()

        # Only users who actually have a value for a title contribute their
        # weight to that title's denominator.
        anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

        pred_data = weighted_sum / anime_weights

    return pred_data.sort_values(ascending=False)

## Hyperparameter Tuning

In [30]:
def rmse(pred, true):
    """Root-mean-square error between ``pred`` and ``true``, skipping NaNs.

    Parameters
    ----------
    pred, true : array-like or pandas objects
        Combined with ``pred - true``.

    Returns
    -------
    float
        Square root of the mean squared difference over all non-NaN entries.
    """
    squared_error = (pred - true) ** 2
    mean_squared_error = np.nanmean(squared_error)
    return np.sqrt(mean_squared_error)

### Number of Similar Users to Consider

In [11]:
# Hyperparameter grid: every neighbourhood size paired with every averaging
# method.
num_sim_users_list = [10, 20, 50, 100, 200, 500, 1000]
avg_method_list = ['simple', 'weighted']
# incremental_base_list = [1.0001, 1.001, 1.01, 1.1]

parameter_combs = [
    (nsu, avgm)
    for nsu in num_sim_users_list
    for avgm in avg_method_list
]

# Randomise the order in which the combinations are evaluated (in place).
random.shuffle(parameter_combs)

In [23]:
# Score every (num_sim_users, avg_method) combination on the same fixed
# sample of 750 validation users.
data = validate_X.sample(750, random_state=42).index
rmse_dict = {}

# One worker pool serves the whole sweep; starting a fresh 15-process pool for
# each combination only adds start-up cost and does not change the results.
with Pool(15) as p:
    for nsu, avgm in tqdm(parameter_combs):
        partial_f = partial(get_recommendations, num_sim_users=nsu, avg_method=avgm)
        # One prediction Series per sampled user, stacked into a frame with
        # one row per user.
        validate_pred = pd.DataFrame(p.map(partial_f, np.array(data)), index=data)

        rmse_dict[(nsu, avgm)] = rmse(validate_pred, validate_Y.loc[data])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [03:49<00:00, 16.36s/it]


In [24]:
# Display the RMSE recorded for each (num_sim_users, avg_method) combination.
rmse_dict

{(20, 'weighted'): 1.0164731114288996,
 (20, 'simple'): 1.0094720884576431,
 (50, 'simple'): 1.004475373088348,
 (10, 'weighted'): 1.0168702648055246,
 (200, 'weighted'): 0.9570248796560815,
 (500, 'weighted'): 0.9132503671621245,
 (200, 'simple'): 0.9452104899892659,
 (500, 'simple'): 0.9050967724213126,
 (1000, 'simple'): 0.8892681622914886,
 (100, 'simple'): 0.9739179682261925,
 (10, 'simple'): 1.0135619224343615,
 (1000, 'weighted'): 0.8932796171901453,
 (100, 'weighted'): 0.9885698333129669,
 (50, 'weighted'): 1.0159410861985771}

### The best model

In [48]:
# Re-create the predictions of the best configuration from the sweep
# (1000 similar users, simple mean) on the same 750 sampled validation users.
data = validate_X.sample(750, random_state=42).index
partial_f = partial(get_recommendations, num_sim_users=1000, avg_method='simple')
with Pool(15) as p:
    validate_pred = pd.DataFrame(
        p.map(partial_f, np.array(data)),
        index=data,
    )

In [49]:
# RMSE of the predictions in `validate_pred` against the held-out values of
# the sampled users in `data`.
rmse(validate_pred, validate_Y.loc[data])

0.8892681622914886

### Benchmark (Mean of the titles)

In [46]:
# Benchmark: give every validation user the same prediction, namely the
# per-title mean over validate_X. The mean row is repeated once per user
# (shape[0]) so that it lines up with the index; repeating it shape[1] times
# only matches when the frame happens to be square.
mean_pred = pd.DataFrame([validate_X.mean()] * validate_X.shape[0], index=validate_X.index)

In [47]:
# RMSE of the per-title-mean benchmark against validate_Y.
# NOTE(review): this is scored on all of validate_Y, whereas the model RMSE
# above uses the 750-user sample in `data`; confirm the comparison is intended.
rmse(mean_pred, validate_Y)

0.9145362271736047

# Keep the previously saved predictions available under a separate name.
validate_pred_prev = pd.read_pickle('../data/validate_pred.pkl.xz')

# Write the current predictions as an uncompressed pickle.
validate_pred.to_pickle('../data/validate_pred.pkl')

%%bash
# Replace the compressed prediction pickle with a freshly compressed copy of
# validate_pred.pkl.
# Stop at the first failing command so that a failed `cd` can never lead to
# deleting or compressing files in the wrong directory.
set -e

cd ../data

# -f: do not fail when no previous archive exists (e.g. on a first run).
rm -f validate_pred.pkl.xz
xz -vT14 validate_pred.pkl

# Load the compressed prediction pickle back into `validate_pred`.
validate_pred = pd.read_pickle('../data/validate_pred.pkl.xz')