# Predict Next Purchase

In this example, you'll learn how to create a machine learning application that predicts whether a customer will purchase a specific grocery product within the next week.

In [None]:
from demo.predict_next_purchase import load_sample
from evalml import AutoMLSearch
from evalml.preprocessing import split_data
import composeml as cp
import featuretools as ft
import matplotlib as mpl

To start, we load a sample of historical online grocery orders.

In [None]:
# Load the sample order records and preview the first five rows.
df = load_sample()

df.head(n=5)

We want to predict whether a customer will purchase a specific product. So, we define a labeling function where the product is a parameter of the function. This way we can reuse the labeling function to generate labels for different products.

In [None]:
def bought_product(ds, product_name):
    """Return whether any row of ``ds`` names the given product.

    Args:
        ds: Data slice with a ``product_name`` column of product names.
        product_name: Text to look for. It is matched as a literal,
            case-sensitive substring of each product name.

    Returns:
        A truthy boolean if at least one product name contains
        ``product_name``, otherwise a falsy boolean.
    """
    # regex=False: product names may contain characters such as "(" or "."
    # that must be matched literally rather than interpreted as a pattern.
    # na=False: a missing product name counts as "no match" instead of
    # propagating NaN into the reduction below.
    matches = ds['product_name'].str.contains(
        product_name,
        regex=False,
        na=False,
    )
    return matches.any()

In [None]:
# Label maker settings: labels are computed per 'user_id', ordered by
# 'order_time', by applying `bought_product` to each 7-day window of data.
label_maker_settings = {
    'target_entity': 'user_id',
    'time_index': 'order_time',
    'labeling_function': bought_product,
    'window_size': '7d',
}
lm = cp.LabelMaker(**label_maker_settings)

In [None]:
# The search runs over the orders sorted chronologically.
orders_by_time = df.sort_values('order_time')

# `product_name` is not a search option: it is forwarded to the labeling
# function, so these labels answer "did the customer buy a Banana?".
# num_examples_per_instance=-1 presumably means "no limit per customer" --
# confirm against the composeml documentation.
lt = lm.search(
    orders_by_time,
    num_examples_per_instance=-1,
    minimum_data='3d',
    gap='3d',
    product_name='Banana',
    verbose=False,
)

lt.head()

In [None]:
# Print the summary reported by the label times object.
lt.describe()

In [None]:
%matplotlib inline
fig = mpl.pyplot.figure(figsize=(5, 8))
ax0 = fig.add_subplot(211)
ax1 = mpl.pyplot.subplot(212)
fig.tight_layout()

lt.plot.distribution(ax=ax0)
lt.plot.count_by_time(ax=ax1);

In [None]:
es = ft.EntitySet('instacart')

# Base entity: the raw order records, indexed by 'id' and ordered in time by
# 'order_time'.
es.entity_from_dataframe(
    entity_id='order_products',
    dataframe=df.reset_index(),
    index='id',
    time_index='order_time',
)

# Each spec splits a new entity out of an existing one. The order matters:
# a base entity must exist before another entity is normalized out of it.
normalizations = [
    {
        'base_entity_id': 'order_products',
        'new_entity_id': 'orders',
        'index': 'order_id',
        'additional_variables': ['user_id'],
    },
    {
        'base_entity_id': 'orders',
        'new_entity_id': 'users',
        'index': 'user_id',
    },
    {
        'base_entity_id': 'order_products',
        'new_entity_id': 'products',
        'index': 'product_id',
        'additional_variables': ['aisle_id', 'department_id'],
    },
    {
        'base_entity_id': 'products',
        'new_entity_id': 'aisles',
        'index': 'aisle_id',
        'additional_variables': ['department_id'],
    },
    {
        'base_entity_id': 'aisles',
        'new_entity_id': 'departments',
        'index': 'department_id',
    },
]
for spec in normalizations:
    es.normalize_entity(make_time_index=False, **spec)

# Mark the values of interest on the base entity's variables.
order_products = es["order_products"]
order_products["department"].interesting_values = ['produce']
order_products["product_name"].interesting_values = ['Banana']

es.plot()

In [None]:
# Deep feature synthesis for the 'users' entity, using the label times as
# cutoff times; the cutoff time is kept in the feature matrix index.
dfs_options = {
    'entityset': es,
    'target_entity': 'users',
    'cutoff_time': lt,
    'cutoff_time_in_index': True,
    'include_cutoff_time': False,
    'verbose': False,
}
fm, fd = ft.dfs(**dfs_options)

fm.head()

In [None]:
# Remove the label column from the feature matrix (this mutates `fm`), then
# split features and labels into training and holdout sets.
label_column = 'bought_product'
y = fm.pop(label_column)

splits = split_data(fm, y, random_state=0, test_size=0.2)
(X_train, X_holdout, y_train, y_holdout) = splits

In [None]:
# Configure a binary-classification search that optimizes F1, then run it on
# the training data.
search_settings = {
    'problem_type': 'binary',
    'objective': 'f1',
    'random_state': 0,
}
automl = AutoMLSearch(**search_settings)
automl.search(
    X_train,
    y_train,
    show_iteration_plot=False,
    data_checks=None,
)

In [None]:
# Describe the best pipeline found by the search, then render its graph.
top_pipeline = automl.best_pipeline
top_pipeline.describe()
top_pipeline.graph()

In [None]:
# Fit the best pipeline on the training data and score it on the holdout set
# with the same objective the search optimized.
holdout_objectives = ['f1']
best_pipeline = automl.best_pipeline.fit(X_train, y_train)
score = best_pipeline.score(
    X_holdout,
    y_holdout,
    objectives=holdout_objectives,
)
dict(score)