In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster.elbow import kelbow_visualizer
from collections import Counter
import matplotlib
from sklearn.metrics import r2_score
from surprise import Dataset, Reader, accuracy
from surprise import KNNWithMeans, KNNBasic
from surprise.model_selection import train_test_split



## Recommender system

In [2]:
# Load the raw transaction log: the saved 'Unnamed: 0' column becomes the index and
# 'date' is parsed to datetimes.
# NOTE(review): the file name is spelled "transations" -- confirm it matches the file on disk.
data = pd.read_csv(
    "data_5/sample_transations.csv",
    index_col=['Unnamed: 0'],
    parse_dates=['date'],
)

In [3]:
# Build one row per (card, sku) pair:
#   quantity   - total units of that sku bought on that card
#   count      - total units bought on that card across all skus
#   percentage - the sku's share of the card's total units, in percent, rounded to 1 decimal
user_item = (
    data.groupby(['dd_card_number', 'sku'])['quantity']
    .sum()
    .reset_index()
    .assign(
        count=lambda frame: frame.groupby('dd_card_number')['quantity'].transform('sum'),
        percentage=lambda frame: ((frame['quantity'] * 100) / frame['count']).round(1),
    )
)

In [4]:
# Wrap the per-card purchase percentages as a Surprise rating dataset: each row is
# (user, item, rating) with the rating declared on a 0-100 scale.
reader = Reader(rating_scale=(0, 100))
# NOTE(review): this rebinds `data` from the raw transactions DataFrame to a Surprise
# Dataset, so the aggregation cell above cannot be re-run without reloading the CSV.
data = Dataset.load_from_df(user_item[['dd_card_number','sku','percentage']], reader)
# Fix the seed so the 80/20 split -- and every metric derived from it -- is the same on
# each Restart & Run All instead of changing from run to run.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
# Neighbourhood settings for the KNN model: cosine similarity computed between users.
sim_options = {
    "name": "cosine",
    "user_based": True,
}
# The former `KNNWithMeans(k=20, ...)` assignment was dead code: it was overwritten on the
# very next line before ever being fitted, so only the model that is actually trained is kept.
model = KNNBasic(k=30, min_k=2, sim_options=sim_options)
model.fit(train_set)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1a600074c08>

In [6]:
# Score every held-out (user, item, rating) triple with the fitted model.
predictions = model.test(testset=test_set)

In [7]:
def get_Iu(uid):
    """Return the number of items bought by the given user in the training set.

    `uid` is a raw user id; it is translated to the training set's inner id and the
    length of that user's rating list (`train_set.ur`) is returned.  A ValueError
    raised during the lookup (presumably an id absent from the training split)
    yields 0.
    """
    try:
        inner_uid = train_set.to_inner_uid(uid)
        n_items = len(train_set.ur[inner_uid])
    except ValueError:
        n_items = 0
    return n_items
def get_Ui(iid):
    """Return the number of users that have bought the given item in the training set.

    `iid` is a raw item id; it is translated to the training set's inner id and the
    length of that item's rating list (`train_set.ir`) is returned.  A ValueError
    raised during the lookup (presumably an id absent from the training split)
    yields 0.
    """
    try:
        inner_iid = train_set.to_inner_iid(iid)
        n_users = len(train_set.ir[inner_iid])
    except ValueError:
        n_users = 0
    return n_users

In [8]:
# Tabulate the test predictions and enrich each row with:
#   err - absolute difference between the estimate and the true rating
#   Iu  - number of items bought by the row's user (per get_Iu)
#   Ui  - number of users that have bought the row's item (per get_Ui)
df_predictions = pd.DataFrame(
    predictions, columns=['uid', 'iid', 'rui', 'est', 'details']
).assign(
    err=lambda frame: (frame['est'] - frame['rui']).abs(),
    Iu=lambda frame: frame['uid'].apply(get_Iu),
    Ui=lambda frame: frame['iid'].apply(get_Ui),
)

In [9]:
# Rank predictions by absolute error (ascending) once, then take both ends:
# the three smallest errors and the three largest errors.
predictions_by_err = df_predictions.sort_values(by='err')
best_recommendations = predictions_by_err.head(3)
worst_recommendations = predictions_by_err.tail(3)

In [10]:
# Show the three test predictions with the smallest absolute error.
best_recommendations 

Unnamed: 0,uid,iid,rui,est,details,err,Iu,Ui
750,43,71,0.2,0.2,"{'actual_k': 2, 'was_impossible': False}",2.775558e-17,67,2
41,41,158,0.3,0.3,"{'actual_k': 3, 'was_impossible': False}",5.5511150000000004e-17,35,3
474,34,231,0.3,0.299602,"{'actual_k': 5, 'was_impossible': False}",0.0003976209,17,5


In [11]:
# Show the three test predictions with the largest absolute error (ascending order).
worst_recommendations

Unnamed: 0,uid,iid,rui,est,details,err,Iu,Ui
820,77,77,51.3,6.332016,"{'actual_k': 30, 'was_impossible': False}",44.967984,28,66
505,3,77,55.6,5.346668,"{'actual_k': 30, 'was_impossible': False}",50.253332,27,66
440,45,75,63.0,11.337552,"{'actual_k': 30, 'was_impossible': False}",51.662448,23,34


### Performance Metrics

In [12]:
# True ratings of the held-out triples, in test-set order.  The rating field is pulled
# out directly and cast to float: `np.array(test_set)[:, 2]` coerces the mixed
# (uid, iid, rating) tuples into a single dtype, so string ids would turn the ratings
# into strings as well.
y_true = np.array([rating for (_, _, rating) in test_set], dtype=float)
# Model estimates for the same triples (column filled from `model.test(test_set)`).
y_pred = df_predictions['est']

In [13]:
# Summary table of error metrics on the held-out predictions; column 0 (the metric
# name) becomes the index and column 1 holds the value.
pd.DataFrame(
    list(
        {
            'RMSE': accuracy.rmse(predictions, verbose=False),
            'MSE': accuracy.mse(predictions, verbose=False),
            'MAE': accuracy.mae(predictions, verbose=False),
            'R^2': r2_score(y_true, y_pred),
        }.items()
    )
).set_index(0)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
RMSE,5.752816
MSE,33.094888
MAE,2.591657
R^2,0.151041
