## UK Retail Dataset Case Study

In [113]:
import featuretools as ft
from utils import make_label_times, load_uk_retail_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
ft.__version__
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [133]:
item_purchases, invoices, items,customers = load_uk_retail_data()

The following relations exist
* A customer may have multiple invoices 
* An item may have been purchased multiple times 
* An invoice may have multiple item purchases 

In [134]:
entities = {
        "item_purchases": (item_purchases, "item_purchase_id", "InvoiceDate" ),
        "items": (items, "StockCode"),
        "customers": (customers,"CustomerID"),
        "invoices":(invoices,"InvoiceNo","first_item_purchases_time")
        }

relationships = [("customers", "CustomerID","invoices", "CustomerID"), 
                ("invoices", "InvoiceNo","item_purchases", "InvoiceNo"),
                ("items", "StockCode","item_purchases", "StockCode")]

In [135]:
label_times = make_label_times(item_purchases, invoices,
                 cutoff_time=pd.Timestamp("2011-06-01"),
                 prediction_window=pd.Timedelta("14d"),
                 training_window=pd.Timedelta("21d"),
                 lead=pd.Timedelta("7d"),
                 threshold=2)

In [136]:
from featuretools.primitives import (Day, Hour, Minute, Month, Weekday, Week, Weekend, Mean, Max, Min, Std, Skew)


trans_primitives = [Minute, Hour, Day, Week, Month, Weekday, Weekend]

feature_matrix,features = ft.dfs(entities=entities,
                                 relationships=relationships,
                                 target_entity="customers",
                                 trans_primitives=trans_primitives,
                                 agg_primitives=[Mean,Max,Std],
                                 cutoff_time=label_times,
                                 training_window="21d")
feature_matrix.drop("Country", axis=1, inplace=True)

In [126]:
feature_matrix

Unnamed: 0_level_0,WEEK(first_invoices_time),HOUR(first_invoices_time),MAX(item_purchases.Quantity),STD(item_purchases.UnitPrice),DAY(first_invoices_time),IS_WEEKEND(first_invoices_time),MINUTE(first_invoices_time),MONTH(first_invoices_time),MAX(item_purchases.UnitPrice),MEAN(item_purchases.Quantity),...,MAX(invoices.STD(item_purchases.UnitPrice)),STD(invoices.MAX(item_purchases.Quantity)),MEAN(invoices.STD(item_purchases.UnitPrice)),MAX(invoices.MEAN(item_purchases.Quantity)),MAX(invoices.STD(item_purchases.Quantity)),MEAN(invoices.MAX(item_purchases.UnitPrice)),MEAN(invoices.MAX(item_purchases.Quantity)),MEAN(invoices.MEAN(item_purchases.Quantity)),STD(invoices.MAX(item_purchases.UnitPrice)),STD(invoices.MEAN(item_purchases.UnitPrice))
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12353.0,20,17,8,3.911122,19,False,47,5,9.95,5.000000,...,3.911122,0.000000,3.911122,5.000000,2.236068,9.950000,8.000000,5.000000,0.000000,0.000000
12355.0,19,13,96,3.328575,9,False,49,5,12.75,18.461538,...,3.328575,0.000000,3.328575,18.461538,26.880406,12.750000,96.000000,18.461538,0.000000,0.000000
12360.0,21,9,36,6.455818,23,False,43,5,40.00,9.644444,...,6.455818,0.000000,6.455818,9.644444,5.453462,40.000000,36.000000,9.644444,0.000000,0.000000
12372.0,7,12,25,4.064873,16,False,46,2,18.00,19.368421,...,4.064873,0.000000,4.064873,19.368421,8.079933,18.000000,25.000000,19.368421,0.000000,0.000000
12394.0,18,14,16,3.312645,6,False,1,5,15.00,8.666667,...,3.312645,0.000000,3.312645,8.666667,3.616540,15.000000,16.000000,8.666667,0.000000,0.000000
12395.0,48,16,12,4.303943,3,False,35,12,15.00,6.458333,...,4.662612,0.000000,4.310620,6.545455,4.335134,15.000000,12.000000,6.465035,0.000000,0.268252
12399.0,12,9,48,2.999547,23,False,42,3,15.00,22.454545,...,2.999547,0.000000,2.999547,22.454545,17.752662,15.000000,48.000000,22.454545,0.000000,0.000000
12414.0,5,14,12,13.598706,2,False,51,2,40.00,7.000000,...,13.598706,0.000000,13.598706,7.000000,4.163332,40.000000,12.000000,7.000000,0.000000,0.000000
12415.0,1,11,600,2.733503,6,False,12,1,14.95,112.519481,...,6.059573,192.000000,4.218107,113.260274,131.485301,13.725000,408.000000,106.130137,1.225000,1.031473
12423.0,51,10,12,3.933422,21,False,54,12,15.00,7.318182,...,3.933422,0.000000,3.933422,7.318182,3.758462,15.000000,12.000000,7.318182,0.000000,0.000000


In [137]:
y=label_times['purchases>threshold']
X_train, X_test, y_train, y_test = train_test_split(feature_matrix, y, test_size=0.35)

In [128]:
clf = RandomForestClassifier(random_state=0,n_estimators=500,class_weight="balanced",verbose=True)
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.9s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=0,
            verbose=True, warm_start=False)

In [129]:
predicted_labels = clf.predict(X_test)


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.1s finished


In [130]:
precision, recall, fscore, support = precision_recall_fscore_support(y_test, predicted_labels)

In [131]:
precision,recall

(array([ 0.79559748,  0.        ]), array([ 0.99215686,  0.        ]))