In [2]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob

from featuretools.primitives import *

In [11]:
# Unsigned integer dtypes for the numeric columns of the raw click log;
# handed to pandas when the CSV is read.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Columns parsed as datetimes, and the full list of columns to load.
to_parse = ['click_time']
to_read = ['ip', 'app', 'device', 'os', 'channel'] + to_parse + ['is_attributed']

In [31]:
# Read only the selected columns of the raw training clicks, with the
# configured dtypes and with the timestamp column parsed as datetimes.
read_options = dict(usecols=to_read, dtype=dtypes, parse_dates=to_parse)
df = pd.read_csv('../data/raw/train.csv', **read_options)

# Expose the row index as an explicit 'id' column.
df['id'] = df.index

In [33]:
# Build the 'clicks' entity set: a base entity with one row per click, plus
# one normalized entity for each identifier column.
es = ft.EntitySet(id='clicks')

# Explicit variable types for the base entity: four categorical identifiers
# and the boolean attribution flag.
click_variable_types = {
    column: ft.variable_types.Categorical
    for column in ('app', 'device', 'os', 'channel')
}
click_variable_types['is_attributed'] = ft.variable_types.Boolean

es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df,
    index='id',
    time_index='click_time',
    variable_types=click_variable_types,
)

# Split each identifier column out into its own entity (named after the
# column), without creating a time index on the new entity.
for column in ('ip', 'app', 'device', 'channel', 'os'):
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=column,
        index=column,
        make_time_index=False,
    )

# Disabled: separate 'download' entity related back to 'clicks'.
# es = es.entity_from_dataframe(
#     entity_id='download',
#     dataframe=df_downloads,
#     index='id',
#     time_index='click_time'
# )

# new_relationship = ft.Relationship(es['download']['id'], es['clicks']['id'])
# es = es.add_relationship(new_relationship)

es.add_last_time_indexes()

# Mark True as the interesting value of the attribution flag.
# NOTE(review): presumably this is what the where_primitives passed to ft.dfs
# condition on -- confirm against the featuretools documentation.
es['clicks']['is_attributed'].interesting_values = [True]

In [34]:
def create_features(entityset, target_entity, cutoff_time, training_window):
    """Run ``ft.dfs`` for one target entity and return its feature matrix.

    Args:
        entityset: Entity set forwarded to ``ft.dfs`` as ``entityset``.
        target_entity: Id of the entity to build features for.
        cutoff_time: Cutoff forwarded unchanged to ``ft.dfs``.
        training_window: Window forwarded unchanged to ``ft.dfs``.

    Returns:
        The first element of the pair returned by ``ft.dfs`` (the feature
        matrix); the second element (the feature definitions) is discarded.
    """
    # The where-primitives are the aggregation primitives minus PercentTrue.
    where_prims = [Trend, TimeSinceLast, AvgTimeBetween, Mode]
    agg_prims = [PercentTrue] + where_prims

    matrix, _definitions = ft.dfs(
        entityset=entityset,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
        agg_primitives=agg_prims,
        where_primitives=where_prims,
        max_depth=3,
        # features_only=True  (left disabled)
    )
    return matrix

In [None]:
# Entities to build features for, the look-back windows to use, and the single
# cutoff time passed to create_features for every entity/window pair.
target_entities = ['ip', 'app', 'device', 'os', 'channel']
# target_entities = ['channel']
training_windows = ['1 day']
# `datetime` is already the class here (the notebook does
# `from datetime import datetime`), so it must be called directly;
# `datetime.datetime(...)` raises AttributeError.
cutoff_time = datetime(2017, 11, 7, 17, 0)

for target_entity in target_entities:
    features_dir = f"../data/interim/features/{target_entity}"
    # exist_ok avoids the separate existence check and the race it implies.
    os.makedirs(features_dir, exist_ok=True)
    for training_window in training_windows:
        # Convert the textual window (e.g. '1 day') into a featuretools Timedelta.
        training_window = ft.Timedelta(training_window)
        feature_matrix = create_features(
            es,
            target_entity=target_entity,
            cutoff_time=cutoff_time,
            training_window=training_window,
        )
#         feature_matrix = feature_matrix[to_select]
        # Compact, lower-cased window label used in column names and the file name.
        tw_suffix = training_window.get_name().replace(' ', '').lower()
        # Suffix every feature column with the entity and window it was built for.
        feature_matrix.columns = [str(col) + f"_{target_entity}_{tw_suffix}" for col in feature_matrix.columns]
        # NOTE(review): the file name hard-codes "1600" while cutoff_time is 17:00.
        # Left unchanged because a later cell reads this exact path -- confirm
        # which value is intended.
        output_file = f"{features_dir}/train_2017-11-07_1600_{target_entity}_{tw_suffix}_attributed_features.csv"
        print(f"Writing {output_file}")
        feature_matrix.to_csv(output_file)
        # Release the matrix before the next iteration to keep memory down.
        del feature_matrix
        gc.collect()

In [23]:
# Preview the first rows of the channel feature file written for the 1-day window.
channel_features_csv = '../data/interim/features/channel/train_2017-11-07_1600_channel_1day_attributed_features.csv'
pd.read_csv(channel_features_csv).head()

Unnamed: 0,channel,PERCENT_TRUE(clicks.is_attributed)_channel_1day,TIME_SINCE_LAST(clicks.click_time)_channel_1day,TIME_SINCE_LAST(clicks.click_time WHERE is_attributed = True)_channel_1day,"TREND(clicks.app.PERCENT_TRUE(clicks.is_attributed), click_time)_channel_1day","TREND(clicks.app.PERCENT_TRUE(clicks.is_attributed), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.ip.PERCENT_TRUE(clicks.is_attributed), click_time)_channel_1day","TREND(clicks.ip.PERCENT_TRUE(clicks.is_attributed), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.app.TIME_SINCE_LAST(clicks.click_time), click_time)_channel_1day","TREND(clicks.app.TIME_SINCE_LAST(clicks.click_time), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.ip.TIME_SINCE_LAST(clicks.click_time), click_time)_channel_1day","TREND(clicks.ip.TIME_SINCE_LAST(clicks.click_time), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.os.TIME_SINCE_LAST(clicks.click_time), click_time)_channel_1day","TREND(clicks.os.TIME_SINCE_LAST(clicks.click_time), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.device.PERCENT_TRUE(clicks.is_attributed), click_time)_channel_1day","TREND(clicks.device.PERCENT_TRUE(clicks.is_attributed), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.device.TIME_SINCE_LAST(clicks.click_time), click_time)_channel_1day","TREND(clicks.device.TIME_SINCE_LAST(clicks.click_time), click_time WHERE is_attributed = True)_channel_1day","TREND(clicks.os.PERCENT_TRUE(clicks.is_attributed), click_time)_channel_1day","TREND(clicks.os.PERCENT_TRUE(clicks.is_attributed), click_time WHERE is_attributed = True)_channel_1day"
0,3,0.000000,85919.0,,0.000000e+00,,-3.055539e-09,,0.000000e+00,,-0.368798,,-0.000345,,5.584846e-10,,0.000032,,2.587679e-10,
1,4,,,,,,,,,,,,,,,,,,,
2,5,1.000000,27485.0,27485.0,,,,,,,,,,,,,,,,
3,13,0.000000,83182.0,,0.000000e+00,,-6.860502e-07,,0.000000e+00,,-1.021006,,0.000269,,0.000000e+00,,0.000000,,-2.502986e-08,
4,15,,,,,,,,,,,,,,,,,,,
5,17,0.000000,78253.0,,0.000000e+00,,0.000000e+00,,0.000000e+00,,-0.336470,,-0.049375,,-6.240152e-41,,0.000000,,6.163848e-09,
6,18,0.000000,24883.0,,0.000000e+00,,5.016724e-08,,0.000000e+00,,-1.489131,,0.016071,,-6.206866e-09,,-0.000359,,4.554389e-08,
7,19,0.000000,85091.0,,-4.378576e-11,,2.855329e-09,,3.576447e-06,,-0.547376,,0.000788,,-9.682645e-39,,0.000000,,-7.523912e-11,
8,21,0.076923,83922.0,55277.0,-2.930069e-07,-0.000010,-1.350081e-06,-0.000015,-1.503887e-01,0.588043,-0.499453,0.161451,0.006941,0.000107,9.341158e-10,1.181262e-39,0.000054,0.0,-3.550677e-10,4.002488e-09
9,22,0.000000,6596.0,,,,,,,,,,,,,,,,,
