In [None]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including numpy RuntimeWarnings (e.g. divide-by-zero in correlations) and
# pandas deprecation notices raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
# Load the train / test splits used by every section below.
# Presumably user x item purchase matrices indexed by clnt_id - TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
train = pd.read_pickle("../new_data/train_test/online_train.pkl")
test = pd.read_pickle("../new_data/train_test/online_test.pkl")

In [None]:
def user_topk(user_sim,train,topk):
    """Flag each user's top-k items under user-based scoring.

    Scores are ``user_sim @ train``; within every row the ``topk`` highest
    scores are marked 1 and all other entries 0.

    Parameters
    ----------
    user_sim : user-by-user similarity matrix (left operand of the product).
    train : user-by-item matrix (right operand of the product).
    topk : number of items to flag per row.

    Returns
    -------
    Integer 0/1 indicator with the same shape as the score matrix.
    """
    scores = user_sim @ train
    # argsort of the argsort converts scores to ranks; negating first makes
    # rank 0 the largest score in the row.
    descending_order = np.argsort(-scores, axis=1)
    ranks = np.argsort(descending_order, axis=1)
    return (ranks <= (topk - 1)).astype(int)

In [None]:
def item_topk(item_sim,train,topk):
    """Flag each user's top-k items under item-based scoring.

    Scores are ``train @ item_sim``; within every row the ``topk`` highest
    scores are marked 1 and all other entries 0.

    Parameters
    ----------
    item_sim : item-by-item similarity matrix (right operand of the product).
    train : user-by-item matrix (left operand of the product).
    topk : number of items to flag per row.

    Returns
    -------
    Integer 0/1 indicator with the same shape as the score matrix.
    """
    scores = train @ item_sim
    # argsort of the argsort converts scores to ranks; negating first makes
    # rank 0 the largest score in the row.
    descending_order = np.argsort(-scores, axis=1)
    ranks = np.argsort(descending_order, axis=1)
    return (ranks <= (topk - 1)).astype(int)

In [None]:
def test_topk(topk_fit,test,topk):
    test_fit = topk_fit.loc[[i in test.index for i in topk_fit.index]]
    test = test.loc[[i in test_fit.index for i in test.index]]
    test_fit.index = test.index
    test_fit.columns = test.columns
    test_total = pd.DataFrame(test.sum(axis=1))
    diff = test - test_fit
    diff[diff==-1] = 0
    mis = pd.DataFrame(diff.sum(axis=1))
    test_final = pd.merge(test_total,mis,on='clnt_id',how='left')
    test_final.columns = ['buy','mis']
    test_final['correct'] = test_final['buy']-test_final['mis']
    test_final['recall'] = 1-(test_final['mis']/test_final['buy'])
    test_final['precision'] = test_final['correct']/topk
    return(test_final)

# Pearson Correlation

## 1) item based 

In [None]:
# Item-item Pearson correlation between the columns of `train`.
# Undefined correlations (NaN, e.g. for constant columns) are treated as
# "no similarity". A plain ndarray is used instead of np.matrix, which NumPy
# no longer recommends; `train @ item_cor` gives the same scores either way.
item_cor = train.corr().fillna(0).values

In [None]:
# Item-based Pearson recommendations: flag the top-k items per user and
# relabel the rows with the user ids from `train`.
topk = 10
fit = pd.DataFrame(item_topk(item_cor, train, topk))
fit.index = train.index

In [None]:
# Per-user evaluation of the item-based Pearson recommendations;
# the cell displays the mean recall over users.
item_cor_fit = test_topk(fit,test,topk)
item_cor_fit['recall'].mean()

In [None]:
# Mean precision (correct recommendations / topk) over users.
item_cor_fit['precision'].mean()

recall1 : 5.79 / precision1 : 23.51

recall5 : 16.14 / precision5 : 17.37

recall10 : 23.22 / precision10 : 13.50

## 2) user based 

In [None]:
# User-user Pearson correlation between the rows of `train`.
# np.corrcoef returns NaN for any user whose row has zero variance, and one
# NaN in the similarity matrix spreads through `user_sim @ train`. Zero them
# out, mirroring the fillna(0) applied to the item-based correlations.
# (The RuntimeWarning that would flag this is silenced earlier on.)
user_cor_beh = np.nan_to_num(np.corrcoef(train.values))

In [None]:
# User-based Pearson recommendations: flag the top-k items per user and
# relabel the rows with the user ids from `train`.
topk = 10
fit = pd.DataFrame(user_topk(user_cor_beh, train, topk))
fit.index = train.index

In [None]:
# Per-user evaluation of the user-based Pearson recommendations;
# the cell displays the mean recall over users.
user_cor_fit = test_topk(fit,test,topk)
user_cor_fit['recall'].mean()

In [None]:
# Mean precision (correct recommendations / topk) over users.
user_cor_fit['precision'].mean()

recall1 : 6.00 / precision1 : 25.65

recall5 : 15.20 / precision5 : 16.57

recall10 : 21.74 / precision10 : 12.11

# Cosine similarity 

## 1) item based

In [None]:
# Item-item cosine similarity (items are the columns of `train`, hence the
# transpose). cosine_similarity already returns a plain ndarray, so the
# np.matrix wrapper (no longer recommended by NumPy) is dropped; the scores
# from `train @ item_cos` are unchanged. Imports live in the top import cell.
item_cos = cosine_similarity(train.T)

In [None]:
# Item-based cosine recommendations: flag the top-k items per user and
# relabel the rows with the user ids from `train`.
topk = 1
fit = pd.DataFrame(item_topk(item_cos, train, topk))
fit.index = train.index

In [None]:
# Per-user evaluation of the item-based cosine recommendations;
# the cell displays the mean recall over users.
item_cos_fit = test_topk(fit,test,topk)
item_cos_fit['recall'].mean()

In [None]:
# Mean precision (correct recommendations / topk) over users.
item_cos_fit['precision'].mean()

recall1 : 5.73 / precision1 : 23.17

recall5 : 15.97 / precision5 : 17.19

recall10 : 23.00 / precision10 : 13.35

## 2) user behavior based 

In [None]:
# User-user cosine similarity (users are the rows of `train`).
# cosine_similarity already returns a plain ndarray, so the np.matrix wrapper
# (no longer recommended by NumPy) is dropped; user_topk is already fed a
# plain ndarray in the Pearson section. Imports live in the top import cell.
user_cos_beh = cosine_similarity(train)

In [None]:
# User-based cosine recommendations: flag the top-k items per user and
# relabel the rows with the user ids from `train`.
topk = 10
fit = pd.DataFrame(user_topk(user_cos_beh, train, topk))
fit.index = train.index

In [None]:
# Per-user evaluation of the user-based cosine recommendations;
# the cell displays the mean recall over users.
user_cos_fit = test_topk(fit,test,topk)
user_cos_fit['recall'].mean()

In [None]:
# Mean precision (correct recommendations / topk) over users.
user_cos_fit['precision'].mean()

recall1 : 5.82 / precision1 : 25.09

recall5 : 15.03 / precision5 : 16.53

recall10 : 21.20 / precision10 : 12.09

# SVD 

In [None]:
# Reduced SVD of the training matrix. np.linalg.svd returns the right
# singular vectors already transposed, so the rows of `V` are the components.
U, S, V = np.linalg.svd(train, full_matrices=False)
U, V = pd.DataFrame(U), pd.DataFrame(V)

In [None]:
# Scree-style plot of the 100 largest singular values, labelled so the figure
# can be read on its own; the trailing semicolon hides the artist repr.
fig, ax = plt.subplots()
ax.plot(S[0:100])
ax.set(
    title="Largest 100 singular values of train",
    xlabel="Component index",
    ylabel="Singular value",
);

In [None]:
# Share of the summed singular values carried by the first 500 components.
# NOTE(review): this is not the explained-variance ratio, which would use the
# squared singular values (S**2) - confirm which one is intended.
S[0:500].sum()/S.sum()

In [None]:
# Low-rank reconstruction of `train` from the first 500 singular triplets,
# relabelled with the users (rows) and items (columns) of `train`.
svd = U.iloc[:, :500] @ np.diag(S[:500]) @ V.iloc[:500, :]
svd.index, svd.columns = train.index, train.columns

In [None]:
# Double argsort turns each row of reconstructed scores into ranks
# (0 = highest score in the row), the same ranking used in user_topk/item_topk.
svd_sort = np.argsort(np.argsort(-svd,axis=1),axis=1)

In [None]:
# SVD recommendations: flag the entries whose rank falls inside the top k.
topk = 1
fit = (svd_sort <= (topk - 1)).astype(int)

In [None]:
# Per-user evaluation of the SVD recommendations;
# the cell displays the mean recall over users.
svd_fit = test_topk(fit,test,topk)
svd_fit['recall'].mean()

In [None]:
# Mean precision (correct recommendations / topk) over users.
svd_fit['precision'].mean()

recall1 : 6.69 / precision1 : 26.95

recall5 : 16.87 / precision5 : 17.26

recall10 : 23.31 / precision10 : 13.01

In [None]:
# Per-user factor scores; presumably one column per factor (online1..online5)
# indexed by clnt_id - TODO confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
online_score = pd.read_pickle("../new_data/factor_score/online_score.pkl")

In [None]:
# Rank each user's factor scores within the row (0 = lowest) and one-hot the
# column holding rank 4, i.e. the largest score when there are five factor
# columns. The vectorized comparison replaces the element-wise
# applymap(lambda ...), which recent pandas deprecates, with the same 0/1 result.
online_type = (np.argsort(np.argsort(online_score)) == 4).astype(int)

In [None]:
# Attach each user's one-hot factor type to their SVD evaluation row.
# NOTE(review): `online_type` is overwritten here, so running this cell twice
# merges into the already-merged frame; re-run the previous cell first.
online_type = svd_fit.merge(online_type, on='clnt_id', how='left')

In [None]:
# Number of evaluated users flagged for each factor type.
online_type[['online1','online2','online3','online4','online5']].sum()

In [None]:
# Mean recall over users flagged as type online1.
online_type.loc[online_type['online1'] == 1, 'recall'].mean()

In [None]:
# Mean recall over users flagged as type online2.
online_type.loc[online_type['online2'] == 1, 'recall'].mean()

In [None]:
# Mean recall over users flagged as type online3.
online_type.loc[online_type['online3'] == 1, 'recall'].mean()

In [None]:
# Mean recall over users flagged as type online4.
online_type.loc[online_type['online4'] == 1, 'recall'].mean()

In [None]:
# Mean recall over users flagged as type online5.
online_type.loc[online_type['online5'] == 1, 'recall'].mean()