In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import math
from multiprocessing import Pool
import warnings
from functools import partial
from itertools import product

In [2]:
# Seed NumPy's global RNG so the random hold-out masks drawn below are repeatable.
np.random.seed(42)
# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()
# Silence RuntimeWarnings for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def chunker(seq, size):
    """Lazily cut ``seq`` into consecutive slices holding at most ``size`` items.

    Returns a generator of slices; the last slice is shorter when ``len(seq)``
    is not a multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

In [4]:
def parallelize(groups, func, num_workers=16):
    """Apply ``func`` to every group in a process pool and concatenate the results.

    Parameters
    ----------
    groups : iterable of (name, group) pairs
        For example a pandas ``groupby`` object. Only the ``group`` half of
        each pair is handed to ``func``; the name is ignored.
    func : callable
        Called once per group inside a worker process, so it must be
        picklable. It must return a pandas object accepted by ``pd.concat``.
    num_workers : int, default 16
        Size of the worker pool. Defaults to the value that used to be
        hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    pandas object
        The per-group results concatenated and sorted by index.
    """
    pieces = [group for _, group in groups]
    with Pool(num_workers) as pool:
        return pd.concat(pool.map(func, pieces)).sort_index()

In [5]:
def _random_holdout_split(ratings, threshold=0.75):
    """Randomly hide part of a ratings frame and return both halves.

    One uniform [0, 1) draw is made per cell from NumPy's global RNG; cells
    whose draw exceeds ``threshold`` are the hidden ones.

    Returns
    -------
    masked : DataFrame
        ``ratings`` with the hidden cells set to NaN.
    actual : DataFrame
        ``ratings`` with every cell *except* the hidden ones set to NaN.
    hidden : DataFrame of bool
        True where a cell was hidden.
    """
    # Vectorised comparison: same result as applymap(lambda x: x > threshold)
    # without a Python call per cell (applymap is also deprecated in recent pandas).
    hidden = pd.DataFrame(np.random.uniform(size=ratings.shape) > threshold,
                          index=ratings.index,
                          columns=ratings.columns)
    return ratings.mask(hidden), ratings.mask(~hidden), hidden


df = pd.read_pickle('../data/train.pkl.xz')

train_masked, train_actual, keeper_mask = _random_holdout_split(df)

train_masked.to_pickle('../data/train_masked.pkl.xz')
train_actual.to_pickle('../data/train_actual.pkl.xz')

validate_full = pd.read_pickle('../data/validate.pkl.xz')

# Drawn after the training mask, so the RNG stream is consumed in the same order as before.
validate_masked, validate_actual, keeper_mask = _random_holdout_split(validate_full)

validate_masked.to_pickle('../data/validate_masked.pkl.xz')
validate_actual.to_pickle('../data/validate_actual.pkl.xz')

In [35]:
# Mean rating of every title across the training users, best-rated first.
title_means = df.mean(axis=0)
anime_ratings = title_means.sort_values(ascending=False)

In [6]:
validate_masked = pd.read_pickle('../data/validate_masked.pkl.xz')
validate_actual = pd.read_pickle('../data/validate_actual.pkl.xz')

# Cosine similarity needs a dense-valued matrix, so missing ratings are
# zero-filled for this computation only. `df` itself is deliberately NOT
# overwritten: the old fillna(0) ... replace(0, NaN) round trip turned genuine
# 0 ratings into NaN, and relied on `np.NaN`, which NumPy 2.0 removed.
sparse = sp.sparse.csr_matrix(df.fillna(0).values, dtype=np.float32)

def similarity_calculator(data):
    """Cosine similarity between every training user and the users in ``data``.

    ``data`` is a zero-filled slice of the validation frame. The result has one
    row per training user (``df.index``) and one column per user in ``data``,
    stored as float16.
    """
    return pd.DataFrame(cosine_similarity(sparse, data), index=df.index, columns=data.index).astype(np.float16)

num_workers = 15
chunksize = int(validate_masked.shape[0]/num_workers)+1
chunks = chunker(validate_masked.fillna(0), chunksize)
with Pool(num_workers) as p:
    # One block of columns per chunk of validation users, glued side by side.
    similarity_matrix = pd.concat(p.map(similarity_calculator, chunks), axis=1)
    
del chunks

similarity_matrix.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')

similarity_matrix = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similarity_matrix.pkl')
def n_similar_users(column, n=1000):
    """Return the index labels of the ``n`` largest values in ``column``, largest first."""
    return pd.Series(column.sort_values(ascending=False).head(n).index, name=column.name)

def order_similar_users(column):
    """Return all index labels of ``column`` ordered by descending value.

    Named differently from the ``similar_users_ordered`` frame it produces, so
    the function is not overwritten by its own result.
    """
    return pd.Series(column.sort_values(ascending=False).index, name=column.name)

with Pool(15) as p:
    # One column per validation user: training users ranked from most to least similar.
    similar_users_ordered = pd.concat(p.map(order_similar_users, (tup[1] for tup in similarity_matrix.items())), axis=1)

similar_users_ordered.to_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

In [7]:
# Reload the ranked-neighbour table written by the similarity cell.
similar_users_ordered = pd.read_pickle('/mnt/int_drive_0/Data/anime-recommendation-engine/similar_users_ordered.pkl')

In [57]:
def get_recommendations(username, num_sim_users=2000, avg_method='simple'):
    """Predict ratings for the titles ``username`` has no visible rating for.

    Parameters
    ----------
    username : label
        A row label of ``validate_masked`` and a column label of
        ``similar_users_ordered``.
    num_sim_users : int, default 2000
        How many of the most similar training users to average over.
    avg_method : {'simple', 'weighted', 'title'}, default 'simple'
        * ``'simple'``   - plain mean of the neighbours' ratings per title.
        * ``'weighted'`` - mean weighted by similarity rank: the most similar
          neighbour gets weight ``n``, the least similar gets weight 1.
        * ``'title'``    - ignore the user and return the global
          ``anime_ratings`` series unchanged.

    Returns
    -------
    pandas.Series
        Predicted rating per title, sorted from highest to lowest. Titles that
        none of the neighbours rated come out as NaN.

    Raises
    ------
    ValueError
        If ``avg_method`` is not one of the supported names.
    """
    if avg_method not in ('simple', 'weighted', 'title'):
        # Previously an unknown method fell through to an UnboundLocalError.
        raise ValueError(
            "avg_method must be 'simple', 'weighted' or 'title', got {!r}".format(avg_method)
        )

    if avg_method == 'title':
        return anime_ratings

    # Rows come back in similarity order, which the rank weights below rely on.
    neighbours = similar_users_ordered[username].head(num_sim_users).values
    sim_users_data = df.loc[neighbours]

    watch_data = validate_masked.loc[username]
    user_watched = watch_data.index[watch_data.notnull()]

    # Keep only titles the user has no visible rating for. A boolean column
    # mask replaces the old set-based indexer: pandas >= 2.0 rejects sets, and
    # a set also made the column order nondeterministic.
    suggestable = ~sim_users_data.columns.isin(user_watched)
    sim_users_data = sim_users_data.loc[:, suggestable]

    if avg_method == 'simple':
        return sim_users_data.mean(axis=0).sort_values(ascending=False)

    # avg_method == 'weighted'
    user_weights = list(range(sim_users_data.shape[0], 0, -1))

    # NaN ratings are skipped by sum(), so each title's numerator only
    # includes neighbours who actually rated it ...
    weighted_sum = sim_users_data.mul(user_weights, axis=0).sum()

    # ... and the denominator is the total weight of those same neighbours.
    anime_weights = sim_users_data.notnull().astype('int').mul(user_weights, axis=0).sum()

    return (weighted_sum / anime_weights).sort_values(ascending=False)

## Hyperparameter Tuning

### Number of Similar Users to Consider

In [22]:
%%time

data = validate_masked.sample(1500).index
avg_method_list = ['simple', 'weighted', 'title']
num_sim_users_list = [10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 4000, 5000]
rmse_dict = {avg_method:{num_sim_users:None for num_sim_users in num_sim_users_list} for avg_method in avg_method_list}


for avg_method, num_sim_users in tqdm(list(product(avg_method_list, num_sim_users_list))):
    with Pool(15) as p:
        partial_f = partial(get_recommendations, num_sim_users=num_sim_users, avg_method=avg_method)
        validate_pred = pd.DataFrame(p.map(partial_f, data.tolist()), index=data)

    rmse = np.sqrt(((validate_pred - validate_actual.loc[data])**2).mean(axis=1)).mean()
    rmse_dict[avg_method][num_sim_users] = rmse

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [38:40<00:00, 105.49s/it]

CPU times: user 1min 52s, sys: 33.9 s, total: 2min 26s
Wall time: 38min 40s





In [23]:
# Display the tuning results: mean per-user RMSE keyed by averaging method, then by neighbourhood size.
rmse_dict

{'simple': {10: 0.20010043857678947,
  20: 0.1950415966258326,
  50: 0.18794863029260073,
  100: 0.18409802871981906,
  200: 0.1799481565217846,
  500: 0.1744847977004152,
  1000: 0.1735082418100596,
  2000: 0.17357124351165187,
  3000: 0.1744043972866788,
  4000: 0.1747082252552745,
  5000: 0.175103277597533},
 'weighted': {10: 0.203415598423892,
  20: 0.1986182015071358,
  50: 0.19053264022762387,
  100: 0.1865186569540035,
  200: 0.18088875179957942,
  500: 0.17533525622359586,
  1000: 0.17334287332710785,
  2000: 0.17311719211616267,
  3000: 0.1734320146771835,
  4000: 0.17390105297189984,
  5000: 0.1742032278802834}}

In [58]:
# Re-run predictions for the sampled users with the weighted average over 2000 neighbours.
partial_f = partial(get_recommendations, num_sim_users=2000, avg_method='weighted')
with Pool(15) as p:
    per_user_predictions = p.map(partial_f, data.tolist())
validate_pred = pd.DataFrame(per_user_predictions, index=data)

In [60]:
# Per-user RMSE between the predictions and the held-out ratings of the sampled users.
squared_error = (validate_pred - validate_actual.loc[data]) ** 2
rmse_list = np.sqrt(squared_error.mean(axis=1))
rmse_list

username
NataliPe            0.265294
kireiko             0.131675
XMDREAMdragon       0.144744
DakyZET             0.170418
thejetblackwings    0.144517
                      ...   
Yuriko0w0           0.190868
ishmael1991         0.080113
Recknext                 NaN
IKindaLoveAnime     0.177064
thegoodgirl         0.339828
Length: 1500, dtype: float64

In [83]:

# Baseline: give every sampled user the same prediction, the global per-title mean rating.
baseline_pred = pd.DataFrame([anime_ratings] * len(data), index=data)
baseline_sq_err = (baseline_pred - validate_actual.loc[data]) ** 2
np.sqrt(baseline_sq_err.mean(axis=1)).mean()

0.17929860906430503

In [84]:
# Mean of the per-user RMSE values (NaN entries are skipped by pandas).
rmse_list.mean()

0.17311719211616267

### Simple vs Weighted Average

### Previous Predictions

In [None]:
# Keep the previously saved predictions for comparison before they are replaced on disk.
validate_pred_prev = pd.read_pickle('../data/validate_pred.pkl.xz')

# Write the current predictions uncompressed; the shell step below compresses them.
validate_pred.to_pickle('../data/validate_pred.pkl')

%%bash
# Replace the old compressed pickle with a freshly compressed one (xz: verbose, 14 threads).

cd ../data

rm validate_pred.pkl.xz
xz -vT14 validate_pred.pkl

# Read the newly compressed predictions back in.
validate_pred = pd.read_pickle('../data/validate_pred.pkl.xz')

In [None]:
# Per-user RMSE of the previously saved predictions against the held-out validation ratings.
rmse = np.sqrt(((validate_pred_prev - validate_actual)**2).mean(axis=1))

In [None]:
# Mean of the per-user RMSE values for the previous predictions.
rmse.mean()