# About
Welcome to the functionality examples notebook. This notebook is intended for local use only: it's a place to try out and explore the `henchman` API without worrying about what will render as HTML on GitHub or in the docs.

In [1]:
import pandas as pd
import featuretools as ft

# Load the featuretools retail demo EntitySet.
es = ft.demo.load_retail()

# Location of the label file. This is a machine-specific relative path into a
# local Downloads folder, so it lives in one named place: edit it here when
# running the notebook on another machine.
CUTOFF_TIMES_PATH = '../../../../Downloads/predict_may_sales.csv'
# Columns kept from the file, in this order (instance id, time, label).
# NOTE(review): featuretools presumably reads the cutoff_time frame
# positionally, so the explicit reselection below keeps the order — confirm.
CUTOFF_TIMES_COLUMNS = ['customer_id', 'cutoff_time', 'total']

cutoff_times = pd.read_csv(CUTOFF_TIMES_PATH)[CUTOFF_TIMES_COLUMNS]
# read_csv leaves the timestamps as strings; convert them to datetimes.
cutoff_times['cutoff_time'] = pd.to_datetime(cutoff_times['cutoff_time'])

# Build the customer feature matrix as of each cutoff time. The `total` label
# column is popped out of the resulting matrix later in the notebook.
fm, features = ft.dfs(entityset=es, target_entity='customers', cutoff_time=cutoff_times, verbose=True)


Built 143 features
Elapsed: 00:18 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


In [2]:
# Display the EntitySet summary: its entities (with row/column counts) and the
# relationships between them.
es

Entityset: demo_retail_data
  Entities:
    order_products [Rows: 401604, Columns: 7]
    products [Rows: 3684, Columns: 3]
    orders [Rows: 22190, Columns: 5]
    customers [Rows: 4372, Columns: 2]
  Relationships:
    order_products.product_id -> products.product_id
    order_products.order_id -> orders.order_id
    orders.customer_id -> customers.customer_id

# Diagnostics

In [3]:
from henchman.diagnostics import overview, warnings, column_report, profile

In [4]:
# Print the overview report (shape, missing values, memory usage, dtypes) for
# the order_products table.
order_products_df = es['order_products'].df
overview(order_products_df)


+--------------+
|  Data Shape  |
+--------------+
Number of columns: 7
Number of rows: 401604

+------------------+
|  Missing Values  |
+------------------+
Most values missing from column: 0
Average missing values by column: 0.00

+----------------+
|  Memory Usage  |
+----------------+
Total memory used: 80.01 MB
Average memory by column: 10.00 MB

+--------------+
|  Data Types  |
+--------------+
                index
0                    
int64               2
datetime64[ns]      1
float64             2
object              2


In [5]:
# Print the per-column report (object, time and numeric column summaries) for
# the order_products table.
order_products_df = es['order_products'].df
column_report(order_products_df)


+-------------------------+
|  Object Column Summary  |
+-------------------------+

## order_id ##
Unique: 22190
Mode: 576339, (matches 0.1% of rows)

## product_id ##
Unique: 3684
Mode: 85123A, (matches 0.5% of rows)

+-----------------------+
|  Time Column Summary  |
+-----------------------+

## order_date ##
Last Time: 2011-12-09 12:50:00
First Time: 2010-12-01 08:26:00

+--------------------------+
|  Numeric Column Summary  |
+--------------------------+

## order_product_id ##
Maximum: 401603, Minimum: 0, Mean: 200801.50
Quartile 3: 301202.25 | Median: 200801.50| Quartile 1: 100400.75

## quantity ##
Maximum: 80995, Minimum: -80995, Mean: 12.18
Quartile 3: 12.00 | Median: 5.00| Quartile 1: 2.00

## unit_price ##
Maximum: 64300.5, Minimum: 0.0, Mean: 5.73
Quartile 3: 6.19 | Median: 3.22| Quartile 1: 2.06

## total ##
Maximum: 277974.84, Minimum: -277974.84, Mean: 34.01
Quartile 3: 32.67 | Median: 19.30| Quartile 1: 7.01


In [6]:
# Run the diagnostics warnings on the full feature matrix; the recorded output
# lists pairs of linearly correlated features.
# Note: `warnings` here is henchman.diagnostics.warnings (imported above), which
# shadows the standard-library module of the same name in this notebook.
warnings(fm)


+------------+
+------------+
COUNT(orders) and NUM_UNIQUE(orders.MODE(order_products.product_id)) are linearly correlated: 0.968
COUNT(orders) and NUM_UNIQUE(orders.DAY(first_order_products_time)) are linearly correlated: 0.938
SUM(order_products.quantity) and SUM(order_products.total) are linearly correlated: 0.923
STD(order_products.quantity) and STD(orders.MAX(order_products.quantity)) are linearly correlated: 0.931
STD(order_products.unit_price) and MEAN(order_products.unit_price) are linearly correlated: 0.980
STD(order_products.unit_price) and STD(orders.MAX(order_products.unit_price)) are linearly correlated: 0.923
STD(order_products.unit_price) and STD(orders.MIN(order_products.unit_price)) are linearly correlated: 0.922
STD(order_products.unit_price) and STD(orders.MEAN(order_products.unit_price)) are linearly correlated: 0.928
STD(order_products.unit_price) and MEAN(orders.MAX(order_products.unit_price)) are linearly correlated: 0.941
STD(order_products.unit_price) and MEAN

MEAN(orders.MAX(order_products.quantity)) and MEAN(orders.MEAN(order_products.quantity)) are linearly correlated: 0.919
MEAN(orders.MAX(order_products.unit_price)) and MEAN(orders.MIN(order_products.unit_price)) are linearly correlated: 0.909
MEAN(orders.MAX(order_products.unit_price)) and MEAN(orders.MEAN(order_products.unit_price)) are linearly correlated: 0.929
MEAN(orders.MAX(order_products.total)) and MEAN(orders.MEAN(order_products.total)) are linearly correlated: 0.910
MEAN(orders.MIN(order_products.quantity)) and MEAN(orders.MEAN(order_products.quantity)) are linearly correlated: 0.958
MEAN(orders.MIN(order_products.unit_price)) and MEAN(orders.MEAN(order_products.unit_price)) are linearly correlated: 0.997
MEAN(orders.MIN(order_products.total)) and MEAN(orders.MEAN(order_products.total)) are linearly correlated: 0.938
MEAN(orders.COUNT(order_products)) and MEAN(orders.NUM_UNIQUE(order_products.product_id)) are linearly correlated: 0.999
NUM_UNIQUE(orders.MODE(order_products.pr

# Plotting

In [7]:
from henchman.plotting import show
from henchman.plotting import (feature_importances, histogram, piechart, scatter, timeseries)

In [8]:
# Pie chart of the `cancelled` column of the orders table.
cancelled_pie = piechart(es['orders'].df['cancelled'])
show(cancelled_pie, title='Cancelled Orders')

In [9]:
# Pie chart of the `country` column of the orders table.
# `mergepast=10` is forwarded unchanged (presumably it lumps the less frequent
# countries into a single slice — confirm against henchman.plotting.piechart).
country_pie = piechart(es['orders'].df['country'], mergepast=10)
show(country_pie, height=400, width=500)

In [10]:
# Time series over customers' first order time, counting customer ids in
# 20 bins.
customers_df = es['customers'].df
first_orders_plot = timeseries(
    customers_df['first_orders_time'],
    customers_df['customer_id'],
    n_bins=20,
    aggregate='count',
)
show(first_orders_plot, width=900, height=300)

In [11]:
# Time series over order date, summing the `total` column in 12 bins.
order_products_df = es['order_products'].df
order_totals_plot = timeseries(
    order_products_df['order_date'],
    order_products_df['total'],
    aggregate='sum',
    n_bins=12,
)
show(order_totals_plot, width=900, height=300)

In [12]:
# Scatter of the `cancelled` column, aggregated (mean) by country with hover
# tooltips enabled.
# NOTE(review): x and y are the same `cancelled` column, so both axes show the
# same quantity — confirm this is the intended demo.
orders_df = es['orders'].df
cancelled_scatter = scatter(
    orders_df['cancelled'],
    orders_df['cancelled'],
    agg=orders_df['country'],
    hover=True,
    aggregate='mean',
)
show(
    cancelled_scatter,
    title='Cancelled by country',
    x_axis='Cancelled',
    y_axis='Cancelled',
    height=300,
    width=300,
)

# Selection

In [13]:
from henchman.selection import RandomSelect, Dendrogram

In [14]:
from henchman.learning import inplace_encoder
# Encode a copy of the feature matrix (the copy is what gets passed in, so `fm`
# itself is not handed to the encoder), then split off the `total` column and
# turn it into a boolean label: True where total exceeds 1000.
X = inplace_encoder(fm.copy())
y = X.pop('total') > 1000

  y = column_or_1d(y, warn=True)


In [15]:
# Fit a RandomSelect selector configured for 10 features and preview the
# first rows of the selected columns.
selector_1 = RandomSelect(n_feats=10)
selector_1.fit(X)
X_random = selector_1.transform(X)
X_random.head()

Unnamed: 0_level_0,MAX(orders.STD(order_products.quantity)),SUM(orders.MAX(order_products.total)),SKEW(orders.SUM(order_products.quantity)),MAX(order_products.unit_price),STD(orders.SKEW(order_products.quantity)),MAX(orders.SKEW(order_products.quantity)),NUM_UNIQUE(orders.MODE(order_products.product_id)),SKEW(orders.COUNT(order_products)),SUM(orders.MIN(order_products.unit_price)),STD(order_products.total)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12372.0,7.194442,78.705,0.0,29.7,0.0,0.777519,1,0.0,0.693,15.926582
12395.0,46.231303,329.01,0.189633,29.7,0.708551,2.059801,5,-0.152923,15.9225,28.204842
12399.0,7.527284,33.66,0.0,24.75,0.0,-0.796031,1,0.0,0.693,5.28418
12414.0,15.226293,196.68,0.0,66.0,0.122809,0.587105,2,0.0,1.254,34.122594
12415.0,94.897294,4521.33,0.638199,577.5,1.128444,3.15939,6,0.522861,589.5615,270.167331


In [16]:
# Build a Dendrogram selector over the encoded feature matrix. The recorded run
# shows two progress bars of 482 steps each (about 18 seconds in total).
selector_2 = Dendrogram(X, max_threshes=500)

100%|██████████| 482/482 [00:11<00:00, 42.93it/s]
100%|██████████| 482/482 [00:06<00:00, 76.76it/s] 


In [17]:
from henchman.plotting import dendrogram
# Render the dendrogram plot for the selector built above.
dendrogram_plot = dendrogram(selector_2)
show(dendrogram_plot)

In [18]:
# NOTE(review): this calls a leading-underscore (private) method of the
# selector. Presumably it re-picks which feature represents each cluster before
# the transform in the next cell — confirm against henchman.selection.Dendrogram.
selector_2._shuffle_all_representatives()

In [19]:
# Reduce X to 80 features with the dendrogram selector and preview the result.
# X_p is the reduced matrix used by the diagnostics, modelling and plotting
# cells that follow.
X_p = selector_2.transform(X, n_feats=80)
X_p.head()

There are 80 distinct connected componentsat thresh step 8 in the Dendrogram
You might also be interested in 84 components at step 7


Unnamed: 0_level_0,NUM_UNIQUE(orders.MODE(order_products.product_id)),PERCENT_TRUE(orders.cancelled),NUM_UNIQUE(orders.country),MODE(orders.country),SUM(order_products.total),SUM(order_products.unit_price),MIN(orders.MEAN(order_products.quantity)),MEAN(orders.MEAN(order_products.unit_price)),STD(orders.MEAN(order_products.total)),MAX(orders.STD(order_products.quantity)),...,MEAN(orders.SKEW(order_products.unit_price)),MEAN(orders.SKEW(order_products.total)),MEAN(orders.NUM_UNIQUE(order_products.product_id)),NUM_UNIQUE(orders.MONTH(first_order_products_time)),NUM_UNIQUE(orders.WEEKDAY(first_order_products_time)),MODE(orders.MODE(order_products.product_id)),MODE(orders.DAY(first_order_products_time)),MODE(orders.YEAR(first_order_products_time)),MODE(orders.MONTH(first_order_products_time)),MODE(orders.WEEKDAY(first_order_products_time))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12372.0,1,0.0,1,4,658.713,105.4845,11.2,5.274225,0.0,7.194442,...,2.94918,1.367421,20.0,1,1,80,16,2011,2,2
12395.0,5,0.333333,1,2,2013.2145,288.42,-3.0,5.432045,27.777604,46.231303,...,1.513774,0.722533,9.5,4,2,70,3,2011,2,4
12399.0,1,0.0,1,2,475.2825,54.615,18.2,2.73075,0.0,7.527284,...,3.877687,0.172201,20.0,1,1,73,23,2011,3,2
12414.0,2,0.0,1,1,654.324,157.2615,20.714286,13.535893,21.940286,15.226293,...,1.749444,0.117086,6.0,2,2,32,2,2011,2,1
12415.0,6,0.285714,1,0,62038.416,1475.892,-100.0,85.987048,389.398995,94.897294,...,0.760533,1.043787,28.857143,4,4,55,3,2011,1,3


In [20]:
# Re-run the diagnostics warnings on the reduced matrix; the recorded output
# lists no correlated pairs.
warnings(X_p)


+------------+
+------------+


In [21]:
from henchman.learning import inplace_encoder, create_holdout, create_model

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
# Baseline: score a random forest on the full feature set with roc_auc_score
# over `splits` splits. `splits` is reused by the later modelling cells.
splits = 5
# Seed the forest so the reported score is repeatable across kernel restarts;
# an unseeded RandomForestClassifier gives a different score on every run.
# NOTE(review): create_model may add its own randomness when splitting the
# data — confirm whether it also needs a seed.
scores, fit_model = create_model(X, y, RandomForestClassifier(random_state=0),
                                 roc_auc_score, n_splits=splits)
print('Average score of {:.2f} over {} splits (stdev {:.3f})'.format(
    np.mean(scores), splits, np.std(scores)))

Average score of 0.81 over 5 splits (stdev 0.036)


In [23]:
# Same evaluation on the reduced feature set X_p, for comparison with the
# full-feature score. `fit_model2` is used by the feature-importance plot below.
# Seed the forest so the reported score is repeatable across kernel restarts;
# an unseeded RandomForestClassifier gives a different score on every run.
# NOTE(review): create_model may add its own randomness when splitting the
# data — confirm whether it also needs a seed.
scores, fit_model2 = create_model(X_p, y, RandomForestClassifier(random_state=0),
                                  roc_auc_score, n_splits=splits)
print('Average score of {:.2f} over {} splits (stdev {:.3f})'.format(
    np.mean(scores), splits, np.std(scores)))

Average score of 0.79 over 5 splits (stdev 0.051)


In [24]:
# Plot feature importances (n_feats=10) of the model fitted on the reduced
# matrix.
importances_plot = feature_importances(X_p, fit_model2, n_feats=10)
show(importances_plot, height=300)

In [25]:
# Histogram of one feature, passing the boolean label `y` and col_max=5000.
# NOTE(review): the recorded run of this cell raised
# "TypeError: nan_to_num() takes 1 positional argument but 2 were given" under
# numpy 1.11.3 (the version printed at the end of this notebook).
# numpy.nan_to_num only accepts a second argument from numpy 1.13 on, so this
# presumably needs a newer numpy — confirm against henchman's requirements.
show(histogram(X['MAX(orders.SUM(order_products.total))'], y, col_max=5000))

TypeError: nan_to_num() takes 1 positional argument but 2 were given

In [None]:
from henchman.plotting import roc_auc
# ROC AUC plot for a random forest on the reduced matrix, using the same number
# of splits as the scoring cells. This cell has no recorded execution.
roc_plot = roc_auc(X_p, y, RandomForestClassifier(), n_splits=splits)
show(roc_plot, height=400, width=400)

In [None]:
from henchman.plotting import f1
# F1 plot for a random forest on the reduced matrix, using the same number of
# splits as the scoring cells. This cell has no recorded execution.
f1_plot = f1(X_p, y, RandomForestClassifier(), n_splits=splits)
show(f1_plot, height=400, width=400)

In [27]:
import numpy as np
# Record the numpy version this notebook was run with.
np.__version__

'1.11.3'

In [26]:
# Record the pandas version this notebook was run with.
pd.__version__


'0.23.1'