# Featuretools by example

* Kaggle: "TalkingData AdTracking Fraud Detection Challenge"
* https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection
* Not about fraud :( but attribution prediction

In [5]:
import numpy as np
import pandas as pd
import featuretools as ft

from featuretools.primitives import *

In [6]:
# data is taken from kaggle.com/c/talkingdata-adtracking-fraud-detection
input_file = '../data/train_sample.csv'
pd.read_csv(input_file).head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [7]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}
to_read = ['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'click_time']
to_parse = ['click_time']

In [33]:
df_train = pd.read_csv(input_file, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
df_train.sort_values(by='click_time')
df_train['id'] = range(len(df_train))
df_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,id
0,87540,12,1,13,497,2017-11-07 09:30:38,0,0
1,105560,25,1,17,259,2017-11-07 13:40:27,0,1
2,101424,12,1,19,212,2017-11-07 18:05:24,0,2
3,94584,13,1,13,477,2017-11-07 04:58:08,0,3
4,68413,12,1,1,178,2017-11-09 09:00:09,0,4


In [34]:
es = ft.EntitySet(id='clicks')

es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df_train,
    index='id',
    time_index='click_time',
    variable_types={
        'ip': ft.variable_types.Categorical,
        'app': ft.variable_types.Categorical,
        'device': ft.variable_types.Categorical,
        'os': ft.variable_types.Categorical,
        'channel': ft.variable_types.Categorical,
        'is_attributed': ft.variable_types.Boolean,
    }
)

es = es.normalize_entity(base_entity_id='clicks', new_entity_id='apps', index='app', make_time_index=False)

In [35]:
es

Entityset: clicks
  Entities:
    clicks (shape = [100000, 8])
    apps (shape = [161, 1])
  Relationships:
    clicks.app -> apps.app

In [36]:
# Hint Featuretools: creates features for the particular value
es['clicks']['is_attributed'].interesting_values = [True]

In [45]:
# Manually defined domain specific feature: "Seed Features"
google_apps = ft.Feature(es['apps']['app']).isin([1,2,3,4,5])

In [56]:
X = df_train.copy()
X_features = X[X['id'] < 70000]
X_train = X[X['id'] >= 70000]
cutoff_time = X_features['click_time'].max()
cutoff_time

Timestamp('2017-11-09 15:59:51')

In [63]:
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='apps',
    seed_features=[google_apps],
    cutoff_time=cutoff_time,
    training_window=ft.Timedelta("3 days"),
    max_depth=5
)



In [64]:
feature_defs

[<Feature: app.isin([1, 2, 3, 4, 5])>,
 <Feature: COUNT(clicks)>,
 <Feature: PERCENT_TRUE(clicks.is_attributed)>,
 <Feature: NUM_UNIQUE(clicks.ip)>,
 <Feature: NUM_UNIQUE(clicks.device)>,
 <Feature: NUM_UNIQUE(clicks.os)>,
 <Feature: NUM_UNIQUE(clicks.channel)>,
 <Feature: MODE(clicks.ip)>,
 <Feature: MODE(clicks.device)>,
 <Feature: MODE(clicks.os)>,
 <Feature: MODE(clicks.channel)>,
 <Feature: COUNT(clicks WHERE is_attributed = True)>,
 <Feature: NUM_UNIQUE(clicks.DAY(click_time))>,
 <Feature: NUM_UNIQUE(clicks.YEAR(click_time))>,
 <Feature: NUM_UNIQUE(clicks.MONTH(click_time))>,
 <Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time))>,
 <Feature: MODE(clicks.DAY(click_time))>,
 <Feature: MODE(clicks.YEAR(click_time))>,
 <Feature: MODE(clicks.MONTH(click_time))>,
 <Feature: MODE(clicks.WEEKDAY(click_time))>]

In [62]:
feature_matrix

Unnamed: 0_level_0,"app.isin([1, 2, 3, 4, 5])",COUNT(clicks),PERCENT_TRUE(clicks.is_attributed),NUM_UNIQUE(clicks.ip),NUM_UNIQUE(clicks.device),NUM_UNIQUE(clicks.os),NUM_UNIQUE(clicks.channel),MODE(clicks.ip),MODE(clicks.device),MODE(clicks.os),MODE(clicks.channel),COUNT(clicks WHERE is_attributed = True),NUM_UNIQUE(clicks.DAY(click_time)),NUM_UNIQUE(clicks.YEAR(click_time)),NUM_UNIQUE(clicks.MONTH(click_time)),NUM_UNIQUE(clicks.WEEKDAY(click_time)),MODE(clicks.DAY(click_time)),MODE(clicks.YEAR(click_time)),MODE(clicks.MONTH(click_time)),MODE(clicks.WEEKDAY(click_time))
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,True,3135,0.000000,2723,4,56,27,5348,1,19,134,0.0,4,1,1,4,7,2017,11,1
2,True,11737,0.000000,7759,3,75,21,5314,1,19,477,0.0,4,1,1,4,8,2017,11,2
3,True,18279,0.000219,12040,4,78,32,5348,1,19,280,4.0,4,1,1,4,8,2017,11,2
4,True,58,0.000000,56,2,20,1,79881,1,19,101,0.0,3,1,1,3,9,2017,11,3
5,True,188,0.074468,187,2,36,4,26995,1,19,377,14.0,4,1,1,4,7,2017,11,1
6,False,1303,0.000000,1209,2,47,4,5314,1,19,459,0.0,4,1,1,4,7,2017,11,1
7,False,981,0.000000,790,4,56,1,48240,1,13,101,0.0,3,1,1,3,9,2017,11,3
8,False,2004,0.001996,1778,3,51,3,5348,1,19,145,4.0,4,1,1,4,9,2017,11,3
9,False,8992,0.000890,6721,5,73,29,5348,1,19,466,8.0,4,1,1,4,9,2017,11,3
10,False,388,0.046392,373,1,42,5,5348,1,19,377,18.0,4,1,1,4,7,2017,11,1
