In [None]:
import math
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
import warnings
# Silences every warning category for the rest of the session. Note that this also
# hides numerical RuntimeWarnings (e.g. from NaN-producing computations) in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled train / test frames used by every section below.
# Later cells treat rows as users and columns as items, and assume both frames
# share the same columns in the same order — TODO confirm against the data prep.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
train = pd.read_pickle("../new_data/train_test/train.pkl")
test = pd.read_pickle("../new_data/train_test/test.pkl")

In [None]:
def user_topk(user_sim,train,topk):
    """Flag the ``topk`` best-scoring columns of every row after user-based scoring.

    Scores are computed as ``user_sim @ train``; within each row the columns are
    ranked by descending score (rank 0 is the highest score).

    Parameters
    ----------
    user_sim : similarity matrix that is left-multiplied onto ``train``.
    train : matrix / frame being scored.
    topk : number of columns to flag per row.

    Returns
    -------
    Integer 0/1 array-like with the shape of the score matrix; 1 marks a column
    whose rank is at most ``topk - 1``.
    """
    scores = user_sim @ train
    # Applying argsort to an argsort turns the scores into per-row ranks.
    descending_order = np.argsort(-scores, axis=1)
    ranks = np.argsort(descending_order, axis=1)
    return (ranks <= (topk - 1)).astype(int)

In [None]:
def item_topk(item_sim,train,topk):
    """Flag the ``topk`` best-scoring columns of every row after item-based scoring.

    Scores are computed as ``train @ item_sim``; within each row the columns are
    ranked by descending score (rank 0 is the highest score).

    Parameters
    ----------
    item_sim : similarity matrix that is right-multiplied onto ``train``.
    train : matrix / frame being scored.
    topk : number of columns to flag per row.

    Returns
    -------
    Integer 0/1 array-like with the shape of the score matrix; 1 marks a column
    whose rank is at most ``topk - 1``.
    """
    scores = train @ item_sim
    # First argsort: column order from best to worst; second argsort: rank of each column.
    best_to_worst = np.argsort(-scores, axis=1)
    ranks = np.argsort(best_to_worst, axis=1)
    is_topk = ranks <= (topk - 1)
    return is_topk.astype(int)

In [None]:
def test_topk(topk_fit,test,topk):
    test_fit = topk_fit.loc[[i in test.index for i in topk_fit.index]]
    test = test.loc[[i in test_fit.index for i in test.index]]
    test_fit.index = test.index
    test_fit.columns = test.columns
    test_total = pd.DataFrame(test.sum(axis=1))
    diff = test - test_fit
    diff[diff==-1] = 0
    mis = pd.DataFrame(diff.sum(axis=1))
    test_final = pd.merge(test_total,mis,on='clnt_id',how='left')
    test_final.columns = ['buy','mis']
    test_final['correct'] = test_final['buy']-test_final['mis']
    test_final['recall'] = 1-(test_final['mis']/test_final['buy'])
    test_final['precision'] = test_final['correct']/topk
    return(test_final)

# Pearson Correlation

## 1) item based 

In [None]:
# Item-item Pearson correlation between the columns of train.
# Zero-variance columns produce NaN correlations; treat those as "no similarity" (0).
# Kept as a plain ndarray: numpy no longer recommends np.matrix, and item_topk
# only needs `train @ item_cor`, which accepts an ndarray.
item_cor = train.corr().fillna(0).values

In [None]:
# Item-based Pearson recommendations: flag each user's top-k items.
topk = 1
# Wrap first and relabel afterwards: passing index= to the constructor would
# reindex (not relabel) if item_topk hands back a DataFrame.
fit = pd.DataFrame(item_topk(item_cor, train, topk))
fit.index = train.index

In [None]:
# Per-user hit statistics of the item-based Pearson recommendations on the held-out set.
item_cor_fit = test_topk(topk_fit=fit, test=test, topk=topk)
# Mean recall across users (cell output).
item_cor_fit['recall'].mean()

In [None]:
# Mean precision across users for the same evaluation (correct recommendations / topk).
item_cor_fit['precision'].mean()

recall1 : 4.83 / precision1 : 19.68

recall5 : 15.04 / precision5 : 14.70

recall10 : 21.88 / precision10 : 11.30

## 2) user based 

In [None]:
# User-user Pearson correlation between the rows of train.
# np.corrcoef yields NaN for any zero-variance row; zero those entries (as the
# item-based path does with fillna(0)) so NaN cannot spread through
# `user_cor_beh @ train` and wipe out every user's scores.
user_cor_beh = np.nan_to_num(np.corrcoef(train.values))

In [None]:
# User-based Pearson recommendations: flag each user's top-k items.
topk = 10
# Wrap first and relabel afterwards: passing index= to the constructor would
# reindex (not relabel) if user_topk hands back a DataFrame.
fit = pd.DataFrame(user_topk(user_cor_beh, train, topk))
fit.index = train.index

In [None]:
# Per-user hit statistics of the user-based Pearson recommendations on the held-out set.
user_cor_fit = test_topk(topk_fit=fit, test=test, topk=topk)
# Mean recall across users (cell output).
user_cor_fit['recall'].mean()

In [None]:
# Mean precision across users for the same evaluation (correct recommendations / topk).
user_cor_fit['precision'].mean()

recall1 : 4.04 / precision1 : 16.97

recall5 : 13.61 / precision5 : 13.67

recall10 : 20.25 / precision10 : 10.37

# Cosine similarity 

## 1) item based

In [None]:
# Item-item cosine similarity: transpose so that the items (columns of train)
# become the rows being compared. numpy and cosine_similarity come from the
# import cell at the top of the notebook.
# Kept as the plain array cosine_similarity returns: numpy no longer recommends
# np.matrix, and item_topk only needs `train @ item_cos`.
item_cos = cosine_similarity(train.T)

In [None]:
# Item-based cosine recommendations: flag each user's top-k items.
topk = 10
# Wrap first and relabel afterwards: passing index= to the constructor would
# reindex (not relabel) if item_topk hands back a DataFrame.
fit = pd.DataFrame(item_topk(item_cos, train, topk))
fit.index = train.index

In [None]:
# Per-user hit statistics of the item-based cosine recommendations on the held-out set.
item_cos_fit = test_topk(topk_fit=fit, test=test, topk=topk)
# Mean recall across users (cell output).
item_cos_fit['recall'].mean()

In [None]:
# Mean precision across users for the same evaluation (correct recommendations / topk).
item_cos_fit['precision'].mean()

recall1 : 4.33 / precision1 : 18.06

recall5 : 14.25 / precision5 : 14.16

recall10 : 20.95 / precision10 : 10.94

## 2) user behavior based 

In [None]:
# User-user cosine similarity between the rows of train. numpy and
# cosine_similarity come from the import cell at the top of the notebook.
# Kept as the plain array cosine_similarity returns: numpy no longer recommends
# np.matrix, and user_topk only needs `user_cos_beh @ train` (the Pearson
# user-based section already passes a plain ndarray the same way).
user_cos_beh = cosine_similarity(train)

In [None]:
# User-based cosine recommendations: flag each user's top-k items.
topk = 1
# Wrap first and relabel afterwards: passing index= to the constructor would
# reindex (not relabel) if user_topk hands back a DataFrame.
fit = pd.DataFrame(user_topk(user_cos_beh, train, topk))
fit.index = train.index

In [None]:
# Per-user hit statistics of the user-based cosine recommendations on the held-out set.
user_cos_fit = test_topk(topk_fit=fit, test=test, topk=topk)
# Mean recall across users (cell output).
user_cos_fit['recall'].mean()

In [None]:
# Mean precision across users for the same evaluation (correct recommendations / topk).
user_cos_fit['precision'].mean()

recall1 : 3.77 / precision1 : 16.22

recall5 : 13.07 / precision5 : 13.62

recall10 : 19.64 / precision10 : 10.36

# SVD 

In [None]:
# Reduced SVD of the train matrix. np.linalg.svd returns the right factor already
# transposed, so the rows of V (not its columns) are the right singular vectors.
U,S,V = np.linalg.svd(train,full_matrices=False)
# Wrap the factors as DataFrames with default integer labels; the reconstruction
# cell below slices them with .iloc and multiplies them with @.
U = pd.DataFrame(U)
V = pd.DataFrame(V)

In [None]:
# Scree plot of the 100 largest singular values (np.linalg.svd returns them in
# descending order). Labelled so the figure stands on its own; the trailing
# semicolon suppresses the repr of the last call in the cell output.
fig, ax = plt.subplots()
ax.plot(S[0:100])
ax.set(
    title="Leading singular values of the train matrix",
    xlabel="Component index",
    ylabel="Singular value",
);

In [None]:
# Fraction of the total singular-value sum carried by the first 500 components.
# NOTE(review): this is a ratio of singular values, not of squared singular values,
# so it is not the usual "explained variance" share — confirm which one is intended.
S[0:500].sum()/S.sum()

In [None]:
# Low-rank reconstruction of train from (at most) the first 500 singular triplets;
# the result is used as the score matrix that is ranked in the next cell.
# DataFrame @ DataFrame matches the left operand's column labels against the right
# operand's row labels; both are the default integer labels here.
svd = U.iloc[:,0:500] @ np.diag(S[0:500]) @ V.iloc[0:500,:]
# Put train's row and column labels back on the reconstruction.
svd.index = train.index
svd.columns = train.columns

In [None]:
# Per-row rank of every column's reconstructed score, 0 = highest
# (argsort of the descending argsort).
# NOTE(review): the following cells pass the thresholded result to test_topk without
# re-wrapping or relabelling it, so this is expected to remain a DataFrame carrying
# svd's labels — confirm with the installed numpy / pandas versions.
svd_sort = np.argsort(np.argsort(-svd,axis=1),axis=1)

In [None]:
# SVD recommendations: flag, per row, the columns whose rank is within the top-k
# (rank 0 is the highest reconstructed score).
topk = 1
fit = (svd_sort <= (topk - 1)).astype(int)

In [None]:
# Per-user hit statistics of the SVD recommendations on the held-out set.
svd_fit = test_topk(topk_fit=fit, test=test, topk=topk)
# Mean recall across users (cell output).
svd_fit['recall'].mean()

In [None]:
# Mean precision across users for the same evaluation (correct recommendations / topk).
svd_fit['precision'].mean()

recall1 : 5.88 / precision1 : 22.13

recall5 : 16.16 / precision5 : 14.75

recall10 : 23.00 / precision10 : 11.15