In [20]:
import pandas as pd
import numpy as np
import os
import glob as glob
import gc

In [49]:
import joblib
from surprise import SVD
from surprise import Reader, Dataset
from collections import defaultdict

### Loading of Data

##### Trained on subset of training data [ to avoid memory error]

In [22]:
"""for file in glob.glob(os.getcwd()+'\*.csv'):
    print(file)"""

df_tr = pd.read_csv('reco_assignment_training.csv')[0:10000]
df_tr['date'] = pd.to_datetime(df_tr['Tran_dt'])

df_ts = pd.read_csv('reco_assignment_holdout.csv')
df_ts['date'] = pd.to_datetime(df_ts['Tran_dt'])

In [25]:
cust = df_tr['Customer_num'].unique()
prod = df_tr['Product_num'].unique()

cust_ts = df_ts['Customer_num'].unique()
prod_ts = df_ts['Product_num'].unique()

cust_new = [i for i in cust_ts if i not in cust]
prod_new = [i for i in prod_ts if i not in prod]
len(cust_new), len(prod_new)

(5263, 12232)

### Transformation of training data

In [28]:
def trans_form(df):    
    df_agg = df_tr.groupby(['Customer_num','Product_num'])['Tran_qty'].sum().reset_index()
    df_agg_tot = df_tr.groupby(['Customer_num'])['Tran_qty'].agg(["mean", "sum"]).reset_index()
    df_tr_agg = df_agg.merge(df_agg_tot, on = ['Customer_num'], how='inner')
    df_tr_agg['rating'] = (df_tr_agg['Tran_qty']*10/df_tr_agg['sum'])
    ### Normalizing the rating converting them into percentage
    return df_tr_agg

In [29]:
df_tr_agg = trans_form(df_tr)

In [30]:
df_tr_agg

Unnamed: 0,Customer_num,Product_num,Tran_qty,mean,sum,rating
0,C_1001004,P_4251,4.00,1.94,5.82,6.872852
1,C_1001004,P_7966,0.82,1.94,5.82,1.408935
2,C_1001004,P_8822,1.00,1.94,5.82,1.718213
3,C_1001322,P_15473,1.00,0.70,1.40,7.142857
4,C_1001322,P_9168,0.40,0.70,1.40,2.857143
...,...,...,...,...,...,...
9799,C_998908,P_18531,1.00,1.00,2.00,5.000000
9800,C_999487,P_23523,1.00,1.00,4.00,2.500000
9801,C_999487,P_27490,1.00,1.00,4.00,2.500000
9802,C_999487,P_2909,1.00,1.00,4.00,2.500000


In [33]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df_tr_agg[["Customer_num", "Product_num", "rating"]], reader)

### Hyper Parameter Tuning

In [34]:
from surprise.model_selection import GridSearchCV

param_grid = {
    "n_epochs": [25, 30, 40, 50],
    "lr_all": [0.005, .01, .015, .02, .03],
    "reg_all": [0.6, .08, .1, .15]
}
#gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=5)
#grid_result = gs.fit(data)

In [145]:
# summarize results
print("Best: %f using %s" % (gs.best_score['rmse'], gs.best_params))
means = gs.cv_results['mean_test_rmse']
stds = gs.cv_results['std_test_rmse']
params = gs.cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.484893 using {'rmse': {'n_epochs': 50, 'lr_all': 0.03, 'reg_all': 0.1}, 'mae': {'n_epochs': 50, 'lr_all': 0.02, 'reg_all': 0.08}}
0.541930 (0.014928) with: {'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.6}
0.530991 (0.015102) with: {'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.08}
0.531077 (0.014814) with: {'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.1}
0.531702 (0.014912) with: {'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.15}
0.526428 (0.015136) with: {'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.6}
0.513437 (0.015390) with: {'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.08}
0.513802 (0.015146) with: {'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.1}
0.515069 (0.015048) with: {'n_epochs': 25, 'lr_all': 0.01, 'reg_all': 0.15}
0.519041 (0.015281) with: {'n_epochs': 25, 'lr_all': 0.015, 'reg_all': 0.6}
0.505254 (0.015113) with: {'n_epochs': 25, 'lr_all': 0.015, 'reg_all': 0.08}
0.505806 (0.015295) with: {'n_epochs': 25, 'lr_all': 0.015, 'reg_all': 0.1}
0.506768 (0.015249) with:

In [36]:
#joblib.dump(gs, 'rcm_whole.pkl')
gs1 = joblib.load('rcm_whole.pkl')

In [37]:
gs1.best_score

{'rmse': 0.4848925242795291, 'mae': 0.16000200734434286}

In [38]:
param_best = gs1.best_params['rmse']
param_best

{'n_epochs': 50, 'lr_all': 0.03, 'reg_all': 0.1}

### Training and Predicition Using SVD

#### Training 

In [39]:
trainset = data.build_full_trainset()
svd = SVD(verbose=True, n_epochs=50, lr_all=0.03, reg_all=0.1)
svd.fit(trainset)

#### Prediction

In [44]:
gc.collect()
testset = trainset.build_anti_testset()
predictions = svd.test(testset)

282

### Top N product Recommendation

In [238]:
def get_top_n(predictions, n=3):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [239]:
top_n = get_top_n(predictions)

In [240]:
### Top 3 product recommendation for user C_1001004
top_n['C_1001004']

[('P_6767', 6.655728455866976),
 ('P_44313', 6.491369020067038),
 ('P_5718', 6.478739325476905)]

In [241]:
from surprise import Dataset, get_dataset_dir, KNNBaseline

### Top 5 similar customer to a given customer

In [242]:
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x2591c884c10>

In [243]:
def top_n_similar_customer(cust, algo, num_recm = 5):
    sim_options = {'name':'pearson_baseline','user_based':True}
    cus_chosen_inner_id = algo.trainset.to_inner_uid(cust)
    cus_chosen_neighbors = algo.get_neighbors(cus_chosen_inner_id, k =num_recm)
    cus_chosen_neighbors = (algo.trainset.to_raw_uid(inner_uid) for inner_uid in cus_chosen_neighbors)
    for cus in cus_chosen_neighbors:
        print(cus)
    return cus_chosen_neighbors

In [244]:
neighbour = top_n_similar_customer('C_1001004', algo)

C_1001322
C_1001329
C_1001363
C_1001411
C_1001612


### Finding avg recall and precision for users based on holdout set

In [245]:
cust = df_tr['Customer_num'].unique()
prod = df_tr['Product_num'].unique()

cust_ts = df_ts['Customer_num'].unique()
prod_ts = df_ts['Product_num'].unique()

cust_new = [i for i in cust_ts if i not in cust]
prod_new = [i for i in prod_ts if i not in prod]
len(cust_new), len(prod_new)

(5263, 12232)

In [246]:
def metric(top_n = top_n, df_tr = df_tr):
    lis = []
    for cus in df_tr['Customer_num'].unique():
        df = pd.DataFrame(top_n[cus], columns=['Product_num','rating'])
        df['Customer_num'] = cus
        lis.append(df)
        
    df_top = pd.concat(lis)[['Customer_num','Product_num','rating']]
    df_prec = df_top.groupby(['Customer_num'])['Product_num'].count().reset_index().rename(columns={'Product_num':'prec_den'})
    df_top1 = df_top.merge(df_prec, how='inner', on=['Customer_num'])
    #Filtering holdoutset to contain only customers and product that are present in training sample
    df_ts_upd = df_ts.loc[(df_ts['Customer_num'].isin(df_tr['Customer_num'].unique()))&(df_ts['Product_num'].isin(df_tr['Product_num'].unique()))]
    df_ts_upd_agg = df_ts_upd.groupby(['Customer_num','Product_num'])['Tran_qty'].sum().reset_index()
    df_ts_cuscount = df_ts_upd_agg.groupby(['Customer_num'])['Product_num'].count().reset_index().rename(columns={'Product_num':'recall_den'})
    df_ts_upd_agg1 = df_ts_upd_agg.merge(df_ts_cuscount, on=['Customer_num'], how='inner')
    df_rec = df_top1.merge(df_ts_upd_agg1, how='outer', on=['Customer_num','Product_num'])
    df_rec['relevance'] = 0
    df_rec.loc[(df_rec['rating'].isnull()==False)&(df_rec['Tran_qty'].isnull()==False),'relevance']= 1
    df_rec_slice = df_rec.loc[(df_rec.relevance==1)]
    num_cust = df_ts_upd['Customer_num'].nunique()
    df_rec_slice['recall'] = df_rec_slice['relevance']/df_rec['recall_den']
    df_rec_slice['precision'] = df_rec_slice['relevance']/df_rec['prec_den']
    precision = df_rec_slice['precision'].sum(axis=0)/num_cust
    recall = df_rec_slice['recall'].sum(axis=0)/num_cust
    print("Avg precision and Avg recall across relevant users in holdout set")
    print({'precision': precision*100, 'recall':recall*100})

In [247]:
metric(top_n, df_tr)

Avg precision and Avg recall across relevant users in holdout set
{'precision': 0.5947136563876653, 'recall': 0.09317608092407711}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rec_slice['recall'] = df_rec_slice['relevance']/df_rec['recall_den']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rec_slice['precision'] = df_rec_slice['relevance']/df_rec['prec_den']
