In [1]:
import featuretools as ft
from featuretools.primitives import Day, Weekend, Weekday, Percentile
import pandas as pd
import numpy as np
import utils_instacart as utils
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from dldb import DLDB, tdfs, make_temporal_cutoffs
from keras.callbacks import EarlyStopping
import os
ft.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'0.1.18'

In [2]:
# Load the entity set for data partition 0 through the project helper module.
# `es` is what the feature-synthesis calls below receive as `entityset=`.
es = utils.load_entityset('partitioned_data/part_0/')

In [3]:
# Cutoff timestamp handed to make_labels, tdfs (as part of `start`) and ft.dfs below.
cutoff_time = pd.Timestamp('March 1, 2015')
# Length of the window passed as `training_window` to the labeling and feature calls.
training_window = ft.Timedelta("60 days")

In [4]:
# Build the label table for the product "Banana" with a 4-week prediction window.
# presumably the label says whether the user buys the product inside that
# window — confirm against utils_instacart.make_labels
label_times = utils.make_labels(es,
                                product_name="Banana",
                                cutoff_time=cutoff_time,
                                prediction_window=ft.Timedelta("4 weeks"),
                                training_window=training_window)
# Keep only the `label` column as a Series indexed (and sorted) by user_id.
labels = label_times.set_index('user_id').sort_index()['label']

In [15]:
# Temporal feature synthesis on the "users" entity: features are computed in
# 3-day windows starting at the beginning of the training window.
# Note: increasing max_depth from 2 to 3 increases auc from .7 to .9
# NOTE(review): the `fm` / `fl` produced here are reassigned by the ft.dfs cell
# that follows, and the recorded progress bar shows this call took about
# 8 minutes — confirm whether this cell is still needed.
trans_primitives = [Day, Weekend, Weekday, Percentile]
fm, fl = tdfs(entityset=es,
              target_entity="users",
              cutoffs=label_times,
              trans_primitives=trans_primitives,
              training_window=training_window,
              variable_types=[],
              max_depth=3,
              window_size='3d',
              start=cutoff_time - training_window,
              verbose=True)

# Sort rows by index so later index-based selection sees an ordered frame.
fm = fm.sort_index()

Building features: 349it [00:00, 5145.97it/s]
Progress: 100%|██████████| 21/21 [08:12<00:00, 23.43s/cutoff time]


In [59]:
# Deep feature synthesis on the "order_products" entity at a single cutoff
# time. `cutoff_time_in_index=True` is requested so the returned matrix keeps
# the time alongside the instance id in its index.
# `trans_primitives` is the list defined in the tdfs cell above.
fm, fl = ft.dfs(
    entityset=es,
    target_entity="order_products",
    cutoff_time=cutoff_time,
    trans_primitives=trans_primitives,
    training_window=training_window,
    max_depth=2,
    cutoff_time_in_index=True,
    verbose=True,
)

# Order the rows by index before any index-based slicing downstream.
fm = fm.sort_index()

Building features: 77it [00:00, 5297.58it/s]
Progress: 100%|██████████| 1/1 [00:13<00:00, 13.88s/cutoff time]


In [60]:
# Re-index the order_products feature matrix by (user_id, time), keeping only
# the users that have a label.
newfm = (
    # Rows without a user id cannot be attributed to any user; drop them.
    fm.dropna(subset=['orders.user_id'])
      # assign() returns a new frame, so the integer cast never writes into the
      # result of dropna() in place (the pattern that triggers pandas'
      # SettingWithCopyWarning).
      .assign(**{'orders.user_id': lambda df: df['orders.user_id'].astype(int)})
      # Add the user id as a third index level, after the existing two.
      .set_index('orders.user_id', append=True)
      .sort_index(level=['orders.user_id', 'time'])
)
# Select every row whose third index level (user id) is one of the labeled users.
newfm = newfm.loc[(slice(None), slice(None), labels.index), :]
# Drop the first index level, then swap the remaining two so user id comes first.
newfm = newfm.reset_index(level=0, drop=True).swaplevel(i=0, j=1)
newfm.index.names = ['user_id', 'time']

In [61]:
# Persist the partition-0 feature matrix and labels as CSV files.
# NOTE(review): this writes `fm` (the sorted ft.dfs output), not the
# re-indexed `newfm` built in the previous cell — confirm that is intended.
fm.to_csv("fm_part_0.csv")
# The file name says "label_times", but what is written is only the `label`
# column indexed by user_id.
labels.to_frame().to_csv("label_times_part_0.csv")

In [21]:
# Display the list of feature definitions.
# NOTE(review): the saved output below lists user-level features such as
# COUNT(orders) and comes from an earlier execution (In [21]); it does not
# appear to reflect the later ft.dfs run on "order_products" — re-run to refresh.
fl

[<Feature: COUNT(orders)>,
 <Feature: COUNT(order_products)>,
 <Feature: PERCENT_TRUE(order_products.reordered)>,
 <Feature: NUM_UNIQUE(order_products.product_name)>,
 <Feature: NUM_UNIQUE(order_products.department)>,
 <Feature: NUM_UNIQUE(order_products.aisle_id)>,
 <Feature: MODE(order_products.product_name)>,
 <Feature: MODE(order_products.department)>,
 <Feature: MODE(order_products.aisle_id)>,
 <Feature: DAY(first_orders_time)>,
 <Feature: IS_WEEKEND(first_orders_time)>,
 <Feature: WEEKDAY(first_orders_time)>,
 <Feature: SUM(orders.PERCENT_TRUE(order_products.reordered))>,
 <Feature: SUM(orders.NUM_UNIQUE(order_products.product_name))>,
 <Feature: SUM(orders.NUM_UNIQUE(order_products.department))>,
 <Feature: SUM(orders.NUM_UNIQUE(order_products.aisle_id))>,
 <Feature: STD(orders.COUNT(order_products))>,
 <Feature: STD(orders.PERCENT_TRUE(order_products.reordered))>,
 <Feature: STD(orders.NUM_UNIQUE(order_products.product_name))>,
 <Feature: STD(orders.NUM_UNIQUE(order_products.de

In [62]:
# Configure the DLDB model as a two-class classifier (regression=False,
# classes False/True) with two recurrent and two dense layers of size 32.
dl_model = DLDB(
    regression=False,
    classes=[False, True],
    recurrent_layer_sizes=(32, 32),
    dense_layer_sizes=(32, 32),
    dropout_fraction=0.2,
    recurrent_dropout_fraction=0.1,
    categorical_embedding_size=20,
    categorical_max_vocab=12)
# TODO: cheating a bit, put back in CV later
# compile() is given the full `newfm` (all labeled users), i.e. before any
# train/test split is made. The feature named 'orders.user_id' is filtered out
# of the feature list because that column was moved into the index of `newfm`.
dl_model.compile(newfm, fl=[f for f in fl if f.get_name() != 'orders.user_id'])

In [None]:
# This is *slightly* cheating because we give it all the categorical values ahead of time.
# It most likely won't make a difference, and this step takes some time.
# Feel free to move it inside of the cross-validation for loop.

# NOTE(review): this repeats the compile() call made right after the model is
# constructed, and an unfinished comment ("we set '") was left here — keep only
# one of the two calls.
dl_model.compile(newfm, fl=[f for f in fl if f.get_name() != 'orders.user_id'])

## Train the model and test using cross-validation

In [63]:
# Estimate the model's AUC with a truncated stratified cross-validation.
cv_score = []
n_splits = 3
test_frac = 0.1
# Each fold holds out 10% of the users for testing, but only the first
# `n_splits` folds are evaluated (because they take a while).
# random_state pins the shuffle so the fold assignment, and therefore the
# reported score, does not change from run to run.
splitter = StratifiedKFold(n_splits=int(1/test_frac), shuffle=True, random_state=0)

# Putting range(n_splits) first in zip() ends the loop after `n_splits` folds
# without asking the splitter for a further one.
for i, (train_index, test_index) in zip(range(n_splits), splitter.split(labels, labels)):
    train_labels = labels.iloc[train_index]
    test_labels = labels.iloc[test_index]
    # Select all time steps of the users in each split (first index level is user_id).
    train_fm = newfm.loc[(train_labels.index, slice(None)), :]
    test_fm = newfm.loc[(test_labels.index, slice(None)), :]

    # NOTE(review): `dl_model` is compiled once outside the loop and fit() is
    # called on the same object in every fold; if DLDB.fit does not reset the
    # weights, later folds start from a model that has already trained on
    # their test users — confirm.
    # NOTE(review): with EarlyStopping() left at its defaults, every fold in
    # the recorded run stopped after epoch 2 of 100 — confirm that is intended.
    dl_model.fit(
        train_fm, train_labels,
        validation_split=0.1,
        epochs=100,
        batch_size=32,
        callbacks=[EarlyStopping()])

    predictions = dl_model.predict(test_fm)
    cv_score.append(roc_auc_score(test_labels, predictions))

mean_score = np.mean(cv_score)
# Reported spread is twice the standard error of the mean over the evaluated folds.
stderr = 2 * (np.std(cv_score) / np.sqrt(n_splits))

print("AUC %.2f +/- %.2f" % (mean_score, stderr))

Transforming input matrix into numeric sequences
Fitting Keras model
Train on 678 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
AUC 0.83 +/- 0.06


In [None]:
# TODO: also add the denormalized columns to try to boost AUC