In [1]:
import featuretools as ft
from featuretools.primitives import Day, Weekend, Weekday, Percentile
import pandas as pd
import numpy as np
import utils_instacart as utils
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from dldb import DLDB
import os
ft.__version__

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


'0.1.20'

## 1. Load in the data

The data is partitioned into chunks based on `user_id`, and loaded into the Featuretools Entityset format. See [the original demo](https://github.com/Featuretools/predict_next_purchase) for more explanation about how the data is partitioned and the Entityset is formed.

In [2]:
# Load one user_id partition of the data as a Featuretools EntitySet.
es = utils.load_entityset("partitioned_data/part_0/")

## 2. Construct labels

This utility function picks out a window of time, and finds which users bought bananas. Again, more explanation in [the original demo](https://github.com/Featuretools/predict_next_purchase).

In [3]:
# Kept as a plain string: it is wrapped in pd.Timedelta where pandas needs it
# and handed to ft.dfs unchanged.
training_window = "60 days"
# Reference date passed to make_labels and used to derive the temporal cutoffs.
cutoff_time = pd.Timestamp('March 1, 2015')

In [4]:
# Ask the project helper for per-user "Banana" labels around the cutoff:
# a 28-day prediction window and a training window of `training_window`.
label_times = utils.make_labels(
    es,
    product_name="Banana",
    cutoff_time=cutoff_time,
    training_window=pd.Timedelta(training_window),
    prediction_window=pd.Timedelta("28 days"),
)

# Label column keyed by user_id, in ascending user_id order.
labels = (
    label_times
    .set_index('user_id')
    .sort_index()['label']
)

## Create time-stamped feature matrix using DFS

Here is where things start to get interesting. We use the `make_temporal_cutoffs` function in Featuretools to produce a feature matrix with several rows per user. It works by adding additional cutoff times in the past to each `(user_id, cutoff_time)` provided in `label_times`.

This function has a few different ways of selecting these additional cutoff times. Here, we provide `window_size='3d'` and `start=cutoff_time - training_window`, which will go back in time in increments of 3 days until 60 days before the cutoff time of March 1st. This produces a sequence of 20 cutoff times per user.

We could have also specified `num_windows=20` and `window_size='3d'` to produce the same result.

The rest of the arguments are standard DFS arguments. For an overview of DFS, check out the [Featuretools documentation](https://docs.featuretools.com/automated_feature_engineering/afe.html).

In [6]:
trans_primitives = [Day, Weekend, Weekday, Percentile]

# Expand each (user_id, cutoff) pair into a series of earlier cutoffs spaced
# 3 days apart; every user gets the same start: one training window before
# cutoff_time.
temporal_cutoffs = ft.make_temporal_cutoffs(
    instance_ids=label_times['user_id'],
    cutoffs=label_times['time'],
    window_size='3d',
    start=[cutoff_time - pd.Timedelta(training_window)] * len(label_times),
)

# Note that ft.dfs expects either a Featuretools Timedelta object or a string,
# not a Pandas Timedelta object, so the raw `training_window` string is passed.
ftens, fl = ft.dfs(
    entityset=es,
    target_entity="users",
    cutoff_time=temporal_cutoffs,
    trans_primitives=trans_primitives,
    training_window=training_window,
    max_depth=2,
    verbose=True,
)

# Sort by index so later index-based lookups see rows in a stable order.
ftens = ftens.sort_index()

Built 78 features
Elapsed: 05:13 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks


In [7]:
# Can save/restore our work without having to recompute feature matrix
#ftens.to_csv('ftens_part_0.csv')
#ftens = pd.read_csv('ftens_part_0.csv', parse_dates=['time'], index_col=['user_id', 'time'])

In [8]:
#ft.save_features(fl, 'fl_part_0.p')
#fl = ft.load_features('fl_part_0.p', es)

## Initialize DLDB with desired hyperparameters

In this example, we use 2 fairly small [LSTM](https://keras.io/layers/recurrent/) layers and 2 feed-forward layers (called "Dense layers" in Keras/TensorFlow terminology). DLDB has an extremely simple API and exposes a large number of hyperparameters, so it is amenable to hyperparameter optimization algorithms.

Each categorical feature will be mapped to a 12-dimensional embedding, with a maximum of 20 unique categorical values (the top 20 most frequent values will be chosen, and the rest will be converted to a single token).

In [9]:
# Binary classifier (labels False/True) with two recurrent and two dense layers.
dl_model = DLDB(
    # Task definition
    classes=[False, True],
    regression=False,
    # Layer widths
    dense_layer_sizes=(32, 16),
    recurrent_layer_sizes=(32, 32),
    # Dropout
    recurrent_dropout_fraction=0.2,
    dropout_fraction=0.2,
    # Categorical features: embedding width and vocabulary cap
    categorical_max_vocab=20,
    categorical_embedding_size=12,
)

## Train the model and test using cross-validation

We use a `batch_size` of 128 (for each gradient update step) and train over 3 passes of the dataset (epochs).

In [10]:
# Number of cross-validation folds; also reused when computing the error bar.
n_splits = 20
# Stratified, shuffled folds with a fixed seed.
splitter = StratifiedKFold(
    random_state=0,
    shuffle=True,
    n_splits=n_splits,
)

In [11]:
# One ROC AUC value per cross-validation fold.
cv_score = []

# NOTE(review): the same dl_model instance is re-fit on every fold -- confirm
# that DLDB.fit starts from fresh weights each time, otherwise folds leak.
for train_pos, test_pos in splitter.split(labels, labels):
    # Positional fold indices -> label slices; the index of each slice is then
    # used to pull the matching rows out of the feature tensor.
    train_labels, test_labels = labels.iloc[train_pos], labels.iloc[test_pos]
    train_ftens = ftens.loc[train_labels.index, :]
    test_ftens = ftens.loc[test_labels.index, :]

    dl_model.fit(
        train_ftens,
        train_labels,
        fl=fl,
        batch_size=128,  # provide this many samples to the network at a time
        epochs=3,
        workers=8,  # set this to number of cores
        use_multiprocessing=True,
        shuffle=False,
    )

    predictions = dl_model.predict(test_ftens)
    score = roc_auc_score(test_labels, predictions)
    print("cv score: ", score)
    cv_score.append(score)

mean_score = np.mean(cv_score)
# Despite the name, this is twice the standard error of the mean over folds.
stderr = 2 * (np.std(cv_score) / np.sqrt(n_splits))

print("AUC %.2f +/- %.2f" % (mean_score, stderr))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.49642857142857144
Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.375
Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.4285714285714286
Epoch 1/3
Epoch 2/3

Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.1875
Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.4125
Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Predicting using Keras model
Transforming outputs
cv score:  0.3803571428571429
Epoch 1/3
Epoch 2/3
Epoch 3/3
Transforming input tensor into numeric sequences
Pr