In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import math
from multiprocessing import Pool
import warnings
from functools import partial
import random
from numba import njit, vectorize

In [2]:
# Seed every RNG the notebook draws from: NumPy's global generator and the
# stdlib `random` module (used by random.shuffle on the parameter grid).
np.random.seed(42)
random.seed(42)
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()
# Silence RuntimeWarning messages for the whole session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Parameters
    ----------
    seq : sliceable with ``len()``
        Anything supporting ``len(seq)`` and ``seq[a:b]`` (list, str,
        DataFrame, ...).
    size : int
        Maximum length of each slice; must be positive.

    Returns
    -------
    generator
        Yields ``seq[0:size]``, ``seq[size:2*size]``, ... The last slice may
        be shorter. An empty ``seq`` yields nothing.

    Raises
    ------
    ValueError
        If ``size`` is not positive. A negative step would otherwise produce
        an empty range and silently drop every item.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

In [4]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in a process pool and stitch the results.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``groupby`` object; only the group part is used.
    func : callable
        Called once per group in a worker process, so it must be picklable.
        Each call must return a pandas object accepted by ``pd.concat``.
    num_workers : int, default 16
        Number of worker processes in the pool.

    Returns
    -------
    The concatenation of all per-group results, sorted by index.
    """
    with Pool(num_workers) as p:
        return pd.concat(p.map(func, [group for _, group in groups])).sort_index()

In [5]:
# Training data; later cells look rows up by username (df.loc[...]) and
# average over rows per column (df.mean(axis=0)).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('../data/train.pkl.xz')

In [6]:
# Validation split: validate_X is used as the users' known watch history,
# validate_Y as the ground truth that predictions are scored against.
validate_X = pd.read_pickle('../data/validate_X.pkl.xz')
validate_Y = pd.read_pickle('../data/validate_Y.pkl.xz')

# Missing entries become 0 for the similarity computation. This rebinds the
# notebook-global `df`; the zeros are turned back into NaN further down.
df = df.fillna(0)

# Sparse float32 copy of the training values, used as the left operand of
# cosine_similarity in similarity_calculator.
sparse = sp.sparse.csr_matrix(df.values, dtype=np.float32)

def similarity_calculator(data):
    """Cosine similarity between every training row and every row of ``data``.

    Reads the notebook globals ``sparse`` (training matrix) and ``df``
    (supplies the row labels).

    Parameters
    ----------
    data : DataFrame
        One row per query user; its index becomes the result's columns.

    Returns
    -------
    DataFrame
        Indexed by ``df.index``, with columns ``data.index``, stored as
        float16.
    """
    scores = cosine_similarity(sparse, data)
    labelled = pd.DataFrame(scores, index=df.index, columns=data.index)
    return labelled.astype(np.float16)

# Split the validation users into roughly one chunk per worker and compute
# each chunk's similarity columns in a separate process.
num_workers = 15
# +1 rounds the integer division up so no more than num_workers chunks result.
chunksize = int(validate_X.shape[0]/num_workers)+1
# validate_X gets the same NaN -> 0 treatment as the training frame.
chunks = chunker(validate_X.fillna(0), chunksize)
with Pool(num_workers) as p:
    # Each result has the training users as rows, so chunks are joined column-wise.
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)
    
del chunks

# Cache the matrix on disk, then load it back from that cache.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
similarity_matrix.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')

similarity_matrix = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')
def n_similar_users(column, n=1000):
    """Return the labels of the ``n`` largest entries of ``column``.

    Parameters
    ----------
    column : Series
        Scores indexed by label (here: one column of the similarity matrix).
    n : int, default 1000
        How many labels to keep.

    Returns
    -------
    Series
        The labels in descending score order, on a fresh 0..n-1 index and
        carrying ``column.name``.
    """
    ranked = column.sort_values(ascending=False)
    top_labels = ranked.index[:n]
    return pd.Series(top_labels, name=column.name)

def order_similar_users(column):
    """Return all labels of ``column`` ordered by descending value.

    The result is a Series on a fresh 0..len-1 index carrying
    ``column.name``, so position 0 holds the most similar label.

    Named differently from the ``similar_users_ordered`` DataFrame built
    below: sharing one name made the DataFrame overwrite the function, so the
    Pool statement could not be re-run on its own.
    """
    return pd.Series(column.sort_values(ascending=False).index, name=column.name)

# One task per similarity-matrix column (items() yields (name, column) pairs);
# the ranked label lists are joined side by side, one column per query user.
with Pool(15) as p:
    similar_users_ordered = pd.concat(
        p.map(order_similar_users, (tup[1] for tup in similarity_matrix.items())),
        axis=1,
    )

# Cache the ranked-neighbour table so later sessions can load it directly.
similar_users_ordered.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

# Undo the earlier fillna(0) so averages skip unrated titles again.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): this also blanks any genuine 0 values in df -- confirm 0 is
# never a real rating.
df = df.replace(0, np.nan)

In [7]:
# Load the cached ranked-neighbour table (one column per query user, most
# similar training user first) instead of recomputing it.
similar_users_ordered = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

In [22]:
# Ad-hoc inspection of a single training user's row.
df.loc['Kai777']

title
.hack//Legend Of The Twilight   NaN
.hack//Roots                    NaN
.hack//SIGN                     NaN
009-1                           NaN
07-Ghost                        NaN
                                 ..
sola                            NaN
www.Working!!                   NaN
xxxHOLiC                        NaN
xxxHOLiC Kei                    NaN
ēlDLIVE                         NaN
Name: Kai777, Length: 4489, dtype: float64

In [21]:
def get_recommendations(username, num_sim_users=2000, avg_method='simple',
                        ratings=None, neighbours=None, watch_history=None):
    """Predict scores for the titles ``username`` has not watched yet.

    The prediction for a title is the average rating given to it by the
    ``num_sim_users`` most similar users.

    Parameters
    ----------
    username : label
        Column of ``neighbours`` and row of ``watch_history`` to predict for.
    num_sim_users : int, default 2000
        How many of the most similar users to average over.
    avg_method : {'simple', 'weighted'}, default 'simple'
        'simple' is the plain mean over the similar users who rated a title.
        'weighted' weights each user by rank: N for the most similar user
        down to 1 for the least, where N is the number of users selected.
    ratings : DataFrame, optional
        Users x titles rating matrix with NaN for "not rated". Defaults to
        the notebook global ``df``.
    neighbours : DataFrame, optional
        One column per query user listing other users, most similar first.
        Defaults to the notebook global ``similar_users_ordered``.
    watch_history : DataFrame, optional
        Users x titles matrix; non-null entries mark titles already watched.
        Defaults to the notebook global ``validate_X``.

    Returns
    -------
    Series
        Predicted score per unwatched title, sorted descending. Titles that
        none of the selected users rated are NaN.

    Raises
    ------
    ValueError
        If ``avg_method`` is not one of the supported values.
    """
    if avg_method not in ('simple', 'weighted'):
        raise ValueError(
            f"avg_method must be 'simple' or 'weighted', got {avg_method!r}"
        )

    ratings = df if ratings is None else ratings
    neighbours = similar_users_ordered if neighbours is None else neighbours
    watch_history = validate_X if watch_history is None else watch_history

    sim_users_data = ratings.loc[neighbours[username].head(num_sim_users).values]

    watch_data = watch_history.loc[username]
    user_watched = watch_data.loc[watch_data.notnull()].index

    # Boolean column mask rather than a set: pandas >= 2.0 rejects set
    # indexers, and a mask keeps the column order deterministic.
    suggestable = ~sim_users_data.columns.isin(user_watched)
    sim_users_data = sim_users_data.loc[:, suggestable]

    if avg_method == 'simple':
        pred_data = sim_users_data.mean(axis=0)
    else:
        # Rows are ordered most-similar-first, so row i gets weight N - i.
        user_weights = np.arange(sim_users_data.shape[0], 0, -1)

        # sum() skips NaN, so unrated entries add nothing to the numerator...
        weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()

        # ...and the denominator only counts weights of users who rated the title.
        anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

        pred_data = weighted_sum / anime_weights

    return pred_data.sort_values(ascending=False)

## Hyperparameter Tuning

### Number of Similar Users to Consider

In [9]:
# Candidate values for the two tuned hyperparameters.
num_sim_users_list = [10, 20, 50, 100, 200, 500, 1000]
avg_method_list = ['simple', 'weighted']

# Full grid: every neighbourhood size paired with every averaging method.
# (An exponential weighting base was explored earlier as a third parameter;
# it is not part of this grid.)
parameter_combs = [
    (nsu, avgm)
    for nsu in num_sim_users_list
    for avgm in avg_method_list
]

# Evaluate the grid in random order.
random.shuffle(parameter_combs)

In [10]:
# Score every parameter combination on a fixed random sample of 750
# validation users.
data = validate_X.sample(750, random_state=42).index
rmse_dict = {}

# Ground truth for the sampled users is the same for every combination.
validate_Y_sample = validate_Y.loc[data]

# One pool for the whole sweep instead of starting and tearing down 15
# worker processes per combination.
with Pool(15) as p:
    for nsu, avgm in tqdm(parameter_combs):
        partial_f = partial(get_recommendations, num_sim_users=nsu, avg_method=avgm)
        validate_pred = pd.DataFrame(p.map(partial_f, np.array(data)), index=data)

        # RMSE per user (row-wise over titles), then averaged over users.
        rmse = np.sqrt(((validate_pred - validate_Y_sample)**2).mean(axis=1)).mean()
        rmse_dict[(nsu, avgm)] = rmse

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [03:06<00:00, 13.34s/it]


In [11]:
# Mean per-user RMSE for each (num_sim_users, avg_method) combination.
rmse_dict

{(20, 'weighted'): 1.0060779142597882,
 (100, 'weighted'): 0.9834626163287248,
 (20, 'simple'): 0.9982245518484044,
 (50, 'weighted'): 1.0039704549058774,
 (200, 'simple'): 0.9382487534843342,
 (200, 'weighted'): 0.9510880255188843,
 (10, 'simple'): 0.9965872347996549,
 (500, 'simple'): 0.9051905183658246,
 (50, 'simple'): 0.995706588861658,
 (500, 'weighted'): 0.9109556308465195,
 (1000, 'simple'): 0.8933785916580511,
 (10, 'weighted'): 1.0007960179960607,
 (1000, 'weighted'): 0.8950569789072624,
 (100, 'simple'): 0.969866150656389}

In [14]:
# Baseline: mean training rating of every title (column-wise mean of df).
anime_ratings = df.mean(axis=0)

In [15]:
# Baseline prediction frame: the per-title means repeated for every
# validation user, laid out with validate_Y's index and columns.
validate_avg = pd.DataFrame([anime_ratings]*validate_Y.shape[0], columns=validate_Y.columns, index=validate_Y.index)

In [18]:
# Baseline score: per-user RMSE against validate_Y, averaged over all
# validation users (not only the 750 sampled for tuning).
np.sqrt(((validate_avg - validate_Y)**2).mean(axis=1)).mean()

0.8774203373492494

In [None]:
# Example call with the default parameters for a single user.
get_recommendations('WirlWind')

In [None]:
# Predict for the sampled users in parallel with 2000 neighbours and
# rank-weighted averaging; one row of validate_pred per user in `data`.
with Pool(15) as p:
    partial_f = partial(get_recommendations, num_sim_users=2000, avg_method='weighted')
    validate_pred = pd.DataFrame(p.map(partial_f, data.tolist()), index=data)

In [None]:
# Per-user RMSE of the predictions above against the held-out ratings.
rmse_list = np.sqrt(((validate_pred - validate_Y.loc[data])**2).mean(axis=1))
rmse_list

In [None]:

# Per-title-mean baseline on the same sampled users, for a like-for-like
# comparison with rmse_list.mean().
np.sqrt(((pd.DataFrame([anime_ratings]*len(data), index=data) - validate_Y.loc[data])**2).mean(axis=1)).mean()

In [None]:
# Mean per-user RMSE of the model on the sampled users.
rmse_list.mean()

### Simple vs Weighted Average

### Previous

In [None]:
# Load the previously saved predictions first: the steps below replace the
# .pkl.xz archive with the current validate_pred.
validate_pred_prev = pd.read_pickle('../data/validate_pred.pkl.xz')

# Write the current predictions uncompressed; the archive is produced by xz
# in the shell step that follows.
validate_pred.to_pickle('../data/validate_pred.pkl')

%%bash
# Replace the old prediction archive with a compressed copy of the new pickle.
set -e  # abort if cd fails so rm/xz never run in the wrong directory

cd ../data

# -f: a missing archive (e.g. on the first run) is not an error
rm -f validate_pred.pkl.xz
xz -vT14 validate_pred.pkl

# Reload the predictions from the compressed archive.
validate_pred = pd.read_pickle('../data/validate_pred.pkl.xz')

In [None]:
# Per-user RMSE of the previously saved predictions against validate_Y.
# NOTE(review): this rebinds `rmse`, which the tuning loop used for a scalar.
rmse = np.sqrt(((validate_pred_prev - validate_Y)**2).mean(axis=1))

In [None]:
# Mean per-user RMSE of the previous predictions.
rmse.mean()