In [1]:
import featuretools as ft
from featuretools.primitives import Day, Weekend, Weekday, Percentile
import pandas as pd
import numpy as np
import utils_instacart as utils
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from dldb import DLDB, tdfs, make_temporal_cutoffs
from keras.callbacks import EarlyStopping
import os
# Display the installed featuretools version as this cell's output.
ft.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'0.1.18'

In [3]:
# Load the entity set from the part_0 partition directory using the project helper.
es = utils.load_entityset('partitioned_data/part_0/')

In [None]:
# Reference timestamp: passed to make_labels as cutoff_time and used, minus the
# training window, as the start of the temporal feature windows.
cutoff_time = pd.Timestamp('March 1, 2015')
# Span handed to both make_labels and tdfs as training_window.
training_window = ft.Timedelta("60 days")

In [None]:
# Ask the project helper for labels on the "Banana" product, using the cutoff
# time and training window defined earlier and a 4-week prediction window.
# (presumably one labelled row per user -- confirm against utils.make_labels)
label_times = utils.make_labels(
    es,
    product_name="Banana",
    cutoff_time=cutoff_time,
    prediction_window=ft.Timedelta("4 weeks"),
    training_window=training_window,
)

# Label column as a Series indexed by user_id, ordered by that index.
labels = (
    label_times
    .set_index('user_id')
    .sort_index()['label']
)

In [6]:
# Author's note: increasing max_depth from 2 to 3 increases auc from .7 to .9
trans_primitives = [Day, Weekend, Weekday, Percentile]

# Run temporal deep feature synthesis for the "users" entity. Windows are 3 days
# wide and begin one training window before the cutoff time; the call yields the
# feature matrix together with the list of feature definitions.
fm, fl = tdfs(
    entityset=es,
    target_entity="users",
    cutoffs=label_times,
    start=cutoff_time - training_window,
    window_size='3d',
    training_window=training_window,
    trans_primitives=trans_primitives,
    max_depth=3,
    verbose=True,
)

# Keep the feature matrix ordered by its index for the slicing done later on.
fm = fm.sort_index()

Building features: 349it [00:00, 5482.85it/s]
Progress: 100%|██████████| 21/21 [07:15<00:00, 20.75s/cutoff time]


In [7]:
# Rebuild only the feature definitions, with the same settings used to build the
# feature matrix. Called with these arguments, tdfs returns a
# (feature_matrix, feature_list) pair, so take the second element: binding the
# whole pair to `fl` would pass a tuple to ft.save_features and dl_model.compile.
# NOTE(review): this call also recomputes the feature matrix and discards it. If
# tdfs forwards `features_only=True` to featuretools' dfs, passing it here would
# avoid that work -- confirm before changing.
trans_primitives = [Day, Weekend, Weekday, Percentile]
fl = tdfs(entityset=es,
          target_entity="users",
          cutoffs=label_times,
          trans_primitives=trans_primitives,
          training_window=training_window,
          max_depth=3,
          window_size='3d',
          start=cutoff_time - training_window,
          verbose=True)[1]

Building features: 349it [00:00, 6151.74it/s]


In [8]:
# Write the feature definitions in `fl` to the file "fl.p" via featuretools.
ft.save_features(fl, "fl.p")

In [7]:
# Write the feature matrix for this partition to CSV.
fm.to_csv("fm_part_0.csv")
# Write the labels to CSV. Despite the file name, this is the `labels` Series
# (converted to a one-column frame), not the full `label_times` frame.
labels.to_frame().to_csv("label_times_part_0.csv")

In [8]:
# Configure the DLDB model as a binary classifier over the classes False/True.
# NOTE(review): the comments below read the hyperparameters from their names
# only -- confirm exact semantics against the DLDB documentation.
dl_model = DLDB(
    # Task definition
    regression=False,
    classes=[False, True],
    # Network shape: two recurrent layers followed by two dense layers, 32 units each
    recurrent_layer_sizes=(32, 32),
    dense_layer_sizes=(32, 32),
    # Regularisation
    dropout_fraction=0.2,
    recurrent_dropout_fraction=0.1,
    # Handling of categorical inputs
    categorical_embedding_size=20,
    categorical_max_vocab=12,
)

In [10]:
# Caveat: compile() is given the full feature matrix, so it sees the categorical
# values of rows that are later used as test folds. This is *slightly* cheating.
# It most likely won't make a difference, and this step takes some time.
# Feel free to move it inside of the cross-validation for loop.
dl_model.compile(fm, fl=fl)

## Train the model and test using cross-validation

In [12]:
cv_score = []
n_splits = 3
test_frac = 0.1
# Use 10% of data as testing set (a 10-fold stratified split), but only run the
# first `n_splits` rounds of cross-validation because they take a while.
# random_state pins the shuffle so the same folds are scored on every run; without
# it each execution of this cell evaluates a different random subset of folds.
# (Network training itself may still be non-deterministic.)
splitter = StratifiedKFold(n_splits=int(1/test_frac), shuffle=True, random_state=0)

# NOTE(review): the same dl_model instance is fitted in every fold without being
# re-compiled. If DLDB.fit continues from the existing weights rather than
# re-initialising them, later folds start from a model that has already trained on
# rows now held out for testing -- confirm, and re-compile per fold if so.
for i, (train_idx, test_idx) in enumerate(splitter.split(labels, labels)):
    train_labels = labels.iloc[train_idx]
    test_labels = labels.iloc[test_idx]
    # Match the first index level of fm against the fold's label index and keep
    # every entry of the second level for those rows.
    train_fm = fm.loc[(train_labels.index, slice(None)), :]
    test_fm = fm.loc[(test_labels.index, slice(None)), :]

    dl_model.fit(
        train_fm, train_labels,
        # Provide 32 samples to the network at a time
        batch_size=32,
        # Train on at most 100 passes of the dataset (epochs)
        epochs=100,
        # After each epoch, test on a held out 10% validation set
        validation_split=0.1,
        # If no improvement, stop training
        callbacks=[EarlyStopping()])

    predictions = dl_model.predict(test_fm)
    cv_score.append(roc_auc_score(test_labels, predictions))
    # Stop once the requested number of rounds has been evaluated
    if i == n_splits - 1:
        break

mean_score = np.mean(cv_score)
# Twice the standard error of the mean AUC, computed over the folds that actually
# ran (len(cv_score)) rather than the configured n_splits.
stderr = 2 * (np.std(cv_score) / np.sqrt(len(cv_score)))

print("AUC %.2f +/- %.2f" % (mean_score, stderr))

Transforming input matrix into numeric sequences
Fitting Keras model
Train on 678 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
Transforming input matrix into numeric sequences
Fitting Keras model
Train on 679 samples, validate on 76 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Transforming input matrix into numeric sequences
Predicting using Keras model
Transforming outputs
AUC 0.80 +/- 0.06
