In [1]:
import numpy as np
import pandas as pd
# Star-import order is kept as-is: a later module may shadow names from an earlier one.
from lib.data_class import *
from models.main_models import *
from models.baseline_models import *
from lib.process_data import *
# Imported after the star imports so `logging` is guaranteed to be the stdlib module.
import logging

Using TensorFlow backend.


In [2]:
# Name of the Instacart extract to run on; must be one of the keys below.
USE_DATA_SET = 'small'

# Data directory for each available extract of the Instacart dump.
_DATA_DIR_BY_SIZE = {
    'micro': './data/instacart_2017_05_01_micro/',
    'tiny': './data/instacart_2017_05_01_tiny/',
    'small': './data/instacart_2017_05_01_small/',
    'medium': './data/instacart_2017_05_01_medium/',
    'full': './data/instacart_2017_05_01/',
}
IC_DATA_DIR = _DATA_DIR_BY_SIZE[USE_DATA_SET]

# Default root logging configuration plus a DEBUG-level logger named "TR_logger".
logging.basicConfig()
log = logging.getLogger("TR_logger")
log.setLevel(logging.DEBUG)

## load data 
Uses `instacart_process`, a loader function hard-coded for the Instacart dataset.

In [3]:
# Load the order and product data from the directory selected by USE_DATA_SET.
# `instacart_process` arrives via one of the star imports (presumably
# lib.process_data -- TODO confirm); the structure of the two results is not visible here.
order_data, product_data = instacart_process(data_dir=IC_DATA_DIR)

## create datasets

In [4]:
# Wrap the loaded order/product data in the project's DataSet container.
IC_dataset = DataSet(order_data, product_data)
# Split into the three datasets used by every model cell below. Split proportions and
# any shuffling/seed are decided inside the method and are not visible here.
# NOTE(review): the method is named test_train_val_split but the result is unpacked as
# (train, val, test) -- confirm the return order matches this unpacking.
train_dataset, val_dataset, test_dataset = IC_dataset.test_train_val_split()

# Models

## Gradient boost model

In [5]:
# Gradient-boosting baseline: fit on the training split, search a threshold on the
# validation split, then score on the test split.
lg_model = LGBoostModel()
lg_model.fit(train_dataset)
# pts/max presumably control the number of candidate thresholds and the upper end of
# the search range -- TODO confirm in LGBoostModel.find_threshold.
# Note this cell uses pts=50, whereas the network model cells use pts=10.
lg_model.find_threshold(val_dataset, pts=50, max=0.5)
# The metric attributes are read right after this call, so accuracy_test presumably
# populates prec/rec/f1/ndcg on the model -- TODO confirm.
lg_model.accuracy_test(test_dataset)
print("precision = {}, recall = {}, f1 = {}, ndcg = {}".format(lg_model.prec, lg_model.rec, lg_model.f1, lg_model.ndcg))



precision = 0.38341705143868116, recall = 0.859337924555316, f1 = 0.4815422897327067, ndcg = 0.7542152040221292


## Latent+feature-net model

In [6]:
# train latent models
user_encoder = UserAEM()
user_encoder.fit(train_dataset, verbose=0)
product_encoder = HybridProductLatentModel()
product_encoder.fit(train_dataset, verbose=0)

# train main model
lfn_model = LFNetModel(user_latent_model=user_encoder, product_latent_model=product_encoder)
lfn_model.fit(train_dataset, epochs=20)
lfn_model.find_threshold(val_dataset, pts=10, max=0.5)
lfn_model.accuracy_test(test_dataset)
print("precision = {}, recall = {}, f1 = {}, ndcg = {}".format(lfn_model.prec, lfn_model.rec, lfn_model.f1, lfn_model.ndcg))

creating LFNetModel...
fitting network...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
precision = 0.4100245319554408, recall = 0.806692046547119, f1 = 0.48747338814408503, ndcg = 0.7752690948158483


## Topological latent+feature-net model (without bypass)

In [7]:
# # train latent models
# user_encoder = UserAEM()
# user_encoder.fit(train_dataset, verbose=0)
# product_encoder = HybridProductLatentModel()
# product_encoder.fit(train_dataset, verbose=0)

# # train main model
# tlfn_model = TLFNetModel(user_latent_model=user_encoder, product_latent_model=product_encoder, bypass=False)
# tlfn_model.fit(train_dataset, epochs=20)
# tlfn_model.find_threshold(val_dataset, pts=10, max=0.5)
# tlfn_model.accuracy_test(test_dataset)
# print("precision = {}, recall = {}, f1 = {}, ndcg = {}".format(tlfn_model.prec, tlfn_model.rec, tlfn_model.f1, tlfn_model.ndcg))

## Topological latent+feature-net model (with bypass)

In [8]:
# # train latent models
# user_encoder = UserAEM()
# user_encoder.fit(train_dataset, verbose=0)
# product_encoder = HybridProductLatentModel()
# product_encoder.fit(train_dataset, verbose=0)

# # train main model
# tlfnb_model = TLFNetModel(user_latent_model=user_encoder, product_latent_model=product_encoder)
# tlfnb_model.fit(train_dataset, epochs=20)
# tlfnb_model.find_threshold(val_dataset, pts=10, max=0.5)
# tlfnb_model.accuracy_test(test_dataset)
# print("precision = {}, recall = {}, f1 = {}, ndcg = {}".format(tlfnb_model.prec, tlfnb_model.rec, tlfnb_model.f1, tlfnb_model.ndcg))

## User-topological latent+feature-net model (without bypass)

In [None]:
# train latent models
user_encoder = UserAEM()
user_encoder.fit(train_dataset, verbose=0)
product_encoder = HybridProductLatentModel()
product_encoder.fit(train_dataset, verbose=0)

# train main model
tulfnb_model = TULFNetModel(user_latent_model=user_encoder, product_latent_model=product_encoder)
tulfnb_model.fit(train_dataset, epochs=20)
tulfnb_model.find_threshold(val_dataset, pts=10, max=0.5)
tulfnb_model.accuracy_test(test_dataset)
print("precision = {}, recall = {}, f1 = {}, ndcg = {}".format(tulfnb_model.prec, tulfnb_model.rec, tulfnb_model.f1, tulfnb_model.ndcg))

creating TULFNetModel...
reducing from X_train_user.shape=(90219, 36) to X_train_user_red.shape=(1443, 36) for mapping...
fitting mapper...
--->getting latent space rep...
------> fitting PCA to data of shape (1443, 36)...
--->creating mapper graphs...
------> creating projection components...
---------> on component 1/5...
---------> on component 2/5...
---------> on component 3/5...
---------> on component 4/5...
---------> on component 5/5...
------> entering parallelization...
------> exiting parallelization after 0.3906363560000159 seconds
--->assigning train points to graph node bins...
expanded from self.X_map_red.shape=(1443, 84) to self.X_map.shape=(90219, 84)...
combining X_train.shape=(90219, 50) and self.X_map.shape=(90219, 84)...
obtained self.X_map.shape=(90219, 134)
created mapper encoding of size self.X_map.shape[1]
fitting network...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Ep

# Analysis

In [None]:
# Adversarial sweep: for each switch count 1..max_switches, build `num_tries` adversarial
# copies of the test set and average [precision, recall, f1, ndcg] per model.
#
# Result layout: res[k][i] is a length-4 array holding the averaged metrics of models[k]
# for num_switches = i + 1.

# NOTE(review): res_fn / res_lfn are not referenced anywhere else in the visible
# notebook; kept only in case another cell still reads them.
res_fn, res_lfn = {}, {}
num_tries = 3       # adversarial datasets generated (and averaged over) per switch count
max_switches = 10   # sweep covers num_switches = 1 .. max_switches

models = [lg_model,
          lfn_model,
          tulfnb_model]

# One result dict per model, derived from `models` so the two lists cannot drift apart
# (zip below would otherwise silently drop models without a result slot).
res = [{} for _ in models]
for i in range(max_switches):
    for r in res:
        r[i] = np.zeros(4)
    for j in range(num_tries):
        # Every model is scored on the same adversarial dataset within a try.
        test_adv = test_dataset.make_adversarial(num_switches=i+1)
        for r, model in zip(res, models):
            # accuracy_test is re-run here, so after this cell the models' metric
            # attributes presumably describe the last adversarial set rather than the
            # clean test set -- TODO confirm.
            model.accuracy_test(test_adv)
            r[i] += np.array([model.prec, model.rec, model.f1, model.ndcg])/num_tries

In [None]:
import matplotlib.pyplot as plt

# Metric to plot: index into the per-model vector [precision, recall, f1, ndcg]
# accumulated by the adversarial sweep cell.
m=3
metric_names = ['precision', 'recall', 'f1', 'ndcg']

# res[k][i] holds the averaged metrics of model k for num_switches = i + 1.
lg_data=[res[0][i][m] for i in range(10)]
lfn_data=[res[1][i][m] for i in range(10)]
tulfnb_data=[res[2][i][m] for i in range(10)]

# x axis is the actual switch count (1..10), not the zero-based sweep index.
switch_counts = list(range(1,11))

fig, ax = plt.subplots()
ax.plot(switch_counts, lg_data, label='LG model')
ax.plot(switch_counts, lfn_data, label='LFN model')
ax.plot(switch_counts, tulfnb_data, label='TU-LFN model')
ax.set_xlabel('number of adversarial switches (num_switches)')
ax.set_ylabel(metric_names[m])
ax.set_title('{} on adversarial test sets'.format(metric_names[m]))
ax.legend()
plt.show()

## plotting the ROC curve

In [None]:
def getROC(model, test_dataset, num_pts=10):
    """Compute a user-averaged ROC curve for `model` on `test_dataset`.

    `model.predict(test_dataset, getdf=True)` must return a triple
    (scores, labels, frame): `frame` is iterated with `itertuples()` and must expose
    `user_id` and `product_id`; `scores` and `labels` are indexed by row position.

    For each of `num_pts` evenly spaced thresholds in [0, 1], a product counts as
    predicted for a user when its score is strictly greater than the threshold. The
    true-positive rate and false-positive rate are computed per user and then averaged
    over users. A user with no positive labels contributes a TPR of 1; a user with no
    negatives contributes an FPR of 0.

    Returns:
        Array whose row 0 holds the mean TPR and row 1 the mean FPR, one column per
        threshold (thresholds in increasing order).
    """
    cutoffs = np.linspace(0, 1, num_pts)

    scores, labels, candidates = model.predict(test_dataset, getdf=True)

    # Per-user bookkeeping, keyed by user id in order of first appearance.
    positives = {}    # user -> product ids labelled 1
    flagged = {}      # user -> {cutoff: product ids scoring above that cutoff}
    row_counts = {}   # user -> number of candidate rows seen for that user
    for pos, rec in enumerate(candidates.itertuples()):
        user, product = rec.user_id, rec.product_id
        if user not in positives:
            positives[user] = []
            flagged[user] = {cut: [] for cut in cutoffs}
            row_counts[user] = 0
        row_counts[user] += 1
        if labels[pos] == 1:
            positives[user].append(product)
        for cut in cutoffs:
            if scores[pos] > cut:
                flagged[user][cut].append(product)

    # Per-cutoff lists of per-user rates.
    tpr_samples = {cut: [] for cut in cutoffs}
    fpr_samples = {cut: [] for cut in cutoffs}
    for user, bought in positives.items():
        actual = set(bought)
        n_rows = row_counts[user]
        for cut in cutoffs:
            chosen = set(flagged[user][cut])

            # Confusion-matrix counts for this user at this cutoff.
            tp = len(actual & chosen)
            fp = len(chosen) - tp
            fn = len(actual) - tp
            tn = n_rows - fp - fn - tp

            if tp + fn > 0:
                tpr = tp / (tp + fn)
            else:
                tpr = 1
            if tn + fp > 0:
                tnr = tn / (tn + fp)
            else:
                tnr = 1

            tpr_samples[cut].append(tpr)
            # Stored as false-positive rate (1 - TNR), the x axis of a ROC curve.
            fpr_samples[cut].append(1 - tnr)

    per_cutoff = [[np.mean(tpr_samples[cut]), np.mean(fpr_samples[cut])] for cut in cutoffs]
    return np.array(per_cutoff).T

In [None]:
# getROC returns a 2-row array: row 0 is the mean true-positive rate and row 1 the mean
# false-positive rate, one column per threshold.
lg_roc = getROC(lg_model,test_dataset,num_pts=20)
lfn_roc = getROC(lfn_model,test_dataset,num_pts=20)
tulfnb_roc = getROC(tulfnb_model,test_dataset,num_pts=20)

# Same curve labels as the adversarial-metric plot, so the two figures read consistently.
roc_curves = [(lg_roc, 'LG model'),
              (lfn_roc, 'LFN model'),
              (tulfnb_roc, 'TU-LFN model')]

fig, ax = plt.subplots()
for roc, label in roc_curves:
    # x = false-positive rate (row 1), y = true-positive rate (row 0)
    ax.plot(roc[1], roc[0], label=label)
ax.set_xlabel('false positive rate')
ax.set_ylabel('true positive rate')
ax.set_title('ROC curve (averaged over users)')
ax.legend()
plt.show()