In [11]:
from IPython.display import Image
from IPython.core.display import HTML 

# Featuretools

* A Python library/framework for automated feature engineering
* Based on the "Deep Feature Synthesis" paper/research
* By Feature Labs: https://www.featurelabs.com/
* Website: https://www.featuretools.com/
* Documentation: https://docs.featuretools.com/
* Source code: https://github.com/Featuretools/featuretools
* Examples: https://github.com/Featuretools/

### Deep Feature Synthesis

* Paper: http://www.jmaxkanter.com/static/papers/DSAA_DSM_2015.pdf
* Article: https://www.featurelabs.com/blog/deep-feature-synthesis/
* DFS works with structured transactional and relational datasets
* Features are derived across datasets by applying primitive mathematical operations
* New features are composed from previously derived features (hence "Deep")

### DFS example

In [15]:
# Render the DFS example diagram, referenced by relative URL, at 600x600.
Image(url='../img/max-order-size.svg', width=600, height=600)

## Working with data

In [17]:
import numpy as np
import pandas as pd
import featuretools as ft

from featuretools.primitives import *

In [20]:
# Data taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
# Preview the raw file. Reuse `input_file` rather than repeating the path, and
# parse only the five rows that are displayed instead of the whole CSV.
pd.read_csv(input_file, nrows=5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


### Load typed data

In [21]:
# Columns to load, and the one column to parse as a datetime.
to_parse = ['click_time']
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed'] + to_parse

# Explicit unsigned-integer dtypes for the integer-coded columns.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Read only the selected columns with the dtypes above, then append a
# sequential `id` column (0..n-1) to give every click row a unique key.
df_train = pd.read_csv(
    input_file,
    usecols=to_read,
    dtype=dtypes,
    parse_dates=to_parse,
).assign(id=lambda frame: range(len(frame)))

### Create an EntitySet

In [24]:
# Create the EntitySet (id 'clicks') that the following cells add entities to.
es = ft.EntitySet(id='clicks')

### Create and add an entity

In [26]:
# Register df_train as the "clicks" entity: `id` is the index, `click_time`
# the time index, the integer-coded columns are declared categorical and
# `is_attributed` boolean.
es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df_train,
    index='id',
    time_index='click_time',
    variable_types={
        **{
            column: ft.variable_types.Categorical
            for column in ('ip', 'app', 'device', 'os', 'channel')
        },
        'is_attributed': ft.variable_types.Boolean,
    },
)

In [28]:
# Show the EntitySet summary (its entities and relationships).
es

Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
  Relationships:
    No relationships

In [27]:
# List the variables of the "clicks" entity together with their assigned types.
es['clicks'].variables

[<Variable: click_time (dtype: datetime_time_index, format: None)>,
 <Variable: ip (dtype = categorical, count = 100000)>,
 <Variable: app (dtype = categorical, count = 100000)>,
 <Variable: device (dtype = categorical, count = 100000)>,
 <Variable: os (dtype = categorical, count = 100000)>,
 <Variable: channel (dtype = categorical, count = 100000)>,
 <Variable: is_attributed (dtype = boolean, count = 100000)>,
 <Variable: id (dtype = index, count = 100000)>]

### Create another entity with relationship to the "clicks" table

In [35]:
# Illustrative template only -- intentionally not executed: no `df_users`
# dataframe is defined in the cells above, and the "clicks" variables listed
# above contain no `user_id` column. It is kept as `#` comments rather than a
# bare string literal so the cell does not echo the text back as its output.
#
# es = es.entity_from_dataframe(
#     entity_id='users',
#     dataframe=df_users,
#     index='id')
#
# new_relationship = ft.Relationship(
#     es["users"]["id"],
#     es["clicks"]["user_id"])
#
# es = es.add_relationship(new_relationship)

### Create an entity from the existing table

In [29]:
# Derive a new "apps" entity, indexed by `app`, from the "clicks" entity;
# no time index is created for it.
es = es.normalize_entity(
    base_entity_id='clicks',
    new_entity_id='apps',
    index='app',
    make_time_index=False,
)

* Creates a new "apps" entity based on data from "clicks" entity
* Creates a new relationship between "apps" and "clicks" and adds it to the EntitySet

In [30]:
# Inspect the "clicks" variables again after normalize_entity.
es['clicks'].variables

[<Variable: click_time (dtype: datetime_time_index, format: None)>,
 <Variable: ip (dtype = categorical, count = 100000)>,
 <Variable: app (dtype = id, count = 100000)>,
 <Variable: device (dtype = categorical, count = 100000)>,
 <Variable: os (dtype = categorical, count = 100000)>,
 <Variable: channel (dtype = categorical, count = 100000)>,
 <Variable: is_attributed (dtype = boolean, count = 100000)>,
 <Variable: id (dtype = index, count = 100000)>]

In [31]:
# Inspect the variables of the new "apps" entity.
es['apps'].variables

[<Variable: app (dtype = index, count = 161)>]

In [32]:
# Show the EntitySet summary again, now with both entities and their relationship.
es

Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
    apps (shape = [161, 1])
  Relationships:
    clicks.app -> apps.app

### Create features

In [37]:
# Run Deep Feature Synthesis with "apps" as the target entity; the call
# returns the feature matrix and the list of feature definitions.
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='apps')

In [38]:
# List the feature definitions returned by ft.dfs.
feature_defs

[<Feature: COUNT(clicks)>,
 <Feature: PERCENT_TRUE(clicks.is_attributed)>,
 <Feature: NUM_UNIQUE(clicks.ip)>,
 <Feature: NUM_UNIQUE(clicks.device)>,
 <Feature: NUM_UNIQUE(clicks.os)>,
 <Feature: NUM_UNIQUE(clicks.channel)>,
 <Feature: MODE(clicks.ip)>,
 <Feature: MODE(clicks.device)>,
 <Feature: MODE(clicks.os)>,
 <Feature: MODE(clicks.channel)>,
 <Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
 <Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
 <Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
 <Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
 <Feature: MODE(clicks.DAY(click_time))>,
 <Feature: MODE(clicks.YEAR(click_time))>,
 <Feature: MODE(clicks.MONTH(click_time))>,
 <Feature: MODE(clicks.WEEKDAY(click_time))>]

In [40]:
# Preview the first rows of the feature matrix (indexed by `app`).
feature_matrix.head()

Unnamed: 0_level_0,COUNT(clicks),PERCENT_TRUE(clicks.is_attributed),NUM_UNIQUE(clicks.ip),NUM_UNIQUE(clicks.device),NUM_UNIQUE(clicks.os),NUM_UNIQUE(clicks.channel),MODE(clicks.ip),MODE(clicks.device),MODE(clicks.os),MODE(clicks.channel),NUM_UNIQUE(clicks.DAY(click_time)),NUM_UNIQUE(clicks.YEAR(click_time)),NUM_UNIQUE(clicks.MONTH(click_time)),NUM_UNIQUE(clicks.WEEKDAY(click_time)),MODE(clicks.DAY(click_time)),MODE(clicks.YEAR(click_time)),MODE(clicks.MONTH(click_time)),MODE(clicks.WEEKDAY(click_time))
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,3135,0.0,2723,4,56,27,5348,1,19,134,4,1,1,4,7,2017,11,1
2,11737,0.0,7759,3,75,21,5314,1,19,477,4,1,1,4,8,2017,11,2
3,18279,0.000219,12040,4,78,32,5348,1,19,280,4,1,1,4,8,2017,11,2
4,58,0.0,56,2,20,1,79881,1,19,101,3,1,1,3,9,2017,11,3
5,188,0.074468,187,2,36,4,26995,1,19,377,4,1,1,4,7,2017,11,1
