In [1]:
import featuretools as ft
from featuretools.primitives import Day, Weekend, Weekday, Percentile
import pandas as pd
import numpy as np
import utils_instacart as utils
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from dldb import DLDB, tdfs
import os
from keras.callbacks import EarlyStopping
# Show the installed featuretools version (last expression, so the notebook displays it).
ft.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'0.1.18'

## Load in the data

In [2]:
# Directory of the single data partition this notebook works on.
partition_dir = 'partitioned_data/part_0/'
# Load that partition with the project helper; `es` is passed to tdfs as the
# entityset further down.
es = utils.load_entityset(partition_dir)

## Construct labels

In [3]:
# Cutoff time handed to make_labels, tdfs (via `start`) and denormalize_entityset
# below; the notebook text states that data is cut off at this time.
cutoff_time = pd.Timestamp('March 1, 2015')
# Amount of history to use; passed as `training_window` to the same helpers and
# subtracted from the cutoff to get the tdfs start time.
training_window = ft.Timedelta("60 days")

In [4]:
# Ask the project helper for labels about the product "Banana", using a
# 4-week prediction window after the cutoff (the exact label definition lives
# in utils.make_labels).
prediction_window = ft.Timedelta("4 weeks")
label_times = utils.make_labels(
    es,
    product_name="Banana",
    cutoff_time=cutoff_time,
    prediction_window=prediction_window,
    training_window=training_window,
)

# Re-index the labels by user id, sort them, and keep only the label column.
labels_by_user = label_times.set_index('user_id').sort_index()
labels = labels_by_user['label']

## Create time-stamped feature matrix using DFS

We make sure to cut off the data at the cutoff time and to use only the 60 days of data before it.

In [5]:
# Note: increasing max_depth from 2 to 3 increases auc from .7 to .9
trans_primitives = [Day, Weekend, Weekday, Percentile]

# Collect the tdfs arguments in one place so they are easy to read and tweak.
tdfs_kwargs = dict(
    entityset=es,
    target_entity="users",
    cutoffs=label_times,
    trans_primitives=trans_primitives,
    training_window=training_window,
    max_depth=3,
    window_size='3d',
    # Start the windows one training window before the cutoff.
    start=cutoff_time - training_window,
    verbose=True,
)
fm, fl = tdfs(**tdfs_kwargs)

# Sort by index so rows line up with the sorted labels.
fm = fm.sort_index()

Building features: 349it [00:00, 6589.15it/s]
Progress:  48%|████▊     | 10/21 [02:49<03:06, 16.95s/cutoff time]

KeyboardInterrupt: 

### Join all entities in the data together into one dataframe
Again, we make sure to cut off the data at the cutoff time and to use only the 60 days of data before it.

In [10]:
# Baseline input: every entity joined into a single table by the project
# helper, restricted by the same cutoff time and training window.
# Sort with a chained, non-mutating call (as is done for `fm`) rather than
# `inplace=True`, so the object returned by the helper is never modified.
fm_denormalized = (
    utils.denormalize_entityset(es, cutoff_time, training_window)
    .sort_index()
)

## Initialize DLDB with desired hyperparameters

In [11]:
# Hyperparameters for the DLDB model, gathered in one dict so they can be
# inspected or tweaked before the model is built.
dldb_hyperparams = dict(
    # Binary classification over the two label values.
    regression=False,
    classes=[False, True],
    # Layer sizes for the recurrent and dense parts of the network.
    recurrent_layer_sizes=(32, 32),
    dense_layer_sizes=(32, 32),
    # Dropout settings.
    dropout_fraction=0.2,
    recurrent_dropout_fraction=0.1,
    # Categorical-feature handling.
    categorical_embedding_size=20,
    categorical_max_vocab=12,
)
dl_model = DLDB(**dldb_hyperparams)

## Compile the network for DFS features

In [12]:
# Compile the network once, up front, on the full DFS feature matrix `fm`
# together with `fl` as returned by tdfs.
# This is *slightly* cheating because we give it all the categorical values ahead of time,
# including those from rows that later end up in a test fold.
# It most likely won't make a difference, and this step takes some time.
# Feel free to move it inside of the cross-validation for loop.
dl_model.compile(fm, fl=fl)

## Train the model and test using cross-validation

In [13]:
cv_score = []
n_splits = 3
test_frac = 0.1
# Fixed seed for the fold assignment. With an integer random_state every call
# to splitter.split() yields the same folds, so results are reproducible and
# any later evaluation that reuses `splitter` is scored on identical folds.
cv_seed = 0
# Use 10% of data as testing set, but only run 3 rounds of cross-validation
# (because they take a while).
# round() protects against float error in 1 / test_frac truncating the fold
# count (e.g. 9.999... becoming 9).
splitter = StratifiedKFold(n_splits=int(round(1 / test_frac)),
                           shuffle=True,
                           random_state=cv_seed)

# NOTE(review): dl_model is compiled once, outside this loop. If DLDB.fit does
# not re-initialise the network weights, later folds start from a model that
# has already been trained on rows now in their test set -- confirm.
for i, (train_index, test_index) in enumerate(splitter.split(labels, labels)):
    train_labels = labels.iloc[train_index]
    test_labels = labels.iloc[test_index]
    # `labels` is indexed by user_id; take every feature-matrix row whose
    # first index level is one of the fold's users.
    train_fm = fm.loc[(train_labels.index, slice(None)), :]
    test_fm = fm.loc[(test_labels.index, slice(None)), :]

    dl_model.fit(
        train_fm, train_labels,
        # Provide 32 samples to the network at a time
        batch_size=32,
        # Train on at most 100 passes of the dataset (epochs)
        epochs=100,
        # After each epoch, test on a held out 10% validation set
        validation_split=0.1,
        # If no improvement, stop training
        callbacks=[EarlyStopping()])

    predictions = dl_model.predict(test_fm)
    cv_score.append(roc_auc_score(test_labels, predictions))
    # Only the first n_splits folds are evaluated.
    if i == n_splits - 1:
        break

mean_score = np.mean(cv_score)
# Despite the name, this is twice the standard error of the fold scores
# (population std / sqrt(number of folds run)).
stderr = 2 * (np.std(cv_score) / np.sqrt(n_splits))

print("AUC %.2f +/- %.2f" % (mean_score, stderr))

Transforming input matrix into numeric sequences
Fitting Keras model
Train on 678 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
AUC 0.84 +/- 0.10


## Compile the network for the denormalized table

In [15]:
# all columns are categorical except the Boolean "reordered"
categorical_feature_names=[c for c in fm_denormalized.columns if c != 'reordered']
dl_model.compile(fm_denormalized,
                 categorical_feature_names=categorical_feature_names)

## Train the denormalized baseline model and test using cross-validation

In [17]:
# Cross-validate the denormalized baseline, reusing `splitter`, `n_splits`
# and `labels` defined in earlier cells.
cv_score = []

for i, (train_index, test_index) in enumerate(splitter.split(labels, labels)):
    train_labels, test_labels = labels.iloc[train_index], labels.iloc[test_index]

    # `labels` is indexed by user_id; pick every row of the denormalized
    # table whose first index level is one of the fold's users.
    train_fm = fm_denormalized.loc[(train_labels.index, slice(None)), :]
    test_fm = fm_denormalized.loc[(test_labels.index, slice(None)), :]

    # If no improvement, stop training
    early_stopping = EarlyStopping()
    dl_model.fit(
        train_fm,
        train_labels,
        # Provide 32 samples to the network at a time
        batch_size=32,
        # Train on at most 100 passes of the dataset (epochs)
        epochs=100,
        # After each epoch, test on a held out 10% validation set
        validation_split=0.1,
        callbacks=[early_stopping],
    )

    predictions = dl_model.predict(test_fm)
    fold_auc = roc_auc_score(test_labels, predictions)
    cv_score.append(fold_auc)

    # Stop once n_splits folds have been evaluated.
    if i + 1 == n_splits:
        break

mean_score = np.mean(cv_score)
# Twice the standard error of the fold scores.
stderr = 2 * (np.std(cv_score) / np.sqrt(n_splits))

print("AUC %.2f +/- %.2f" % (mean_score, stderr))

Transforming input matrix into numeric sequences
Fitting Keras model
Train on 678 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
AUC 0.71 +/- 0.07
