In [7]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [3]:
# List the interim training CSVs for 2017-11-08 whose names end in "00.csv"
# (the hourly slices), sorted by file name.
sorted(glob('../data/interim/train_2017-11-08*00.csv'))

['../data/interim/train_2017-11-08_0000.csv',
 '../data/interim/train_2017-11-08_0100.csv',
 '../data/interim/train_2017-11-08_0200.csv',
 '../data/interim/train_2017-11-08_0300.csv',
 '../data/interim/train_2017-11-08_0400.csv',
 '../data/interim/train_2017-11-08_0500.csv',
 '../data/interim/train_2017-11-08_0600.csv',
 '../data/interim/train_2017-11-08_0700.csv',
 '../data/interim/train_2017-11-08_0800.csv',
 '../data/interim/train_2017-11-08_0900.csv',
 '../data/interim/train_2017-11-08_1000.csv',
 '../data/interim/train_2017-11-08_1100.csv',
 '../data/interim/train_2017-11-08_1200.csv',
 '../data/interim/train_2017-11-08_1300.csv',
 '../data/interim/train_2017-11-08_1400.csv',
 '../data/interim/train_2017-11-08_1500.csv',
 '../data/interim/train_2017-11-08_1600.csv',
 '../data/interim/train_2017-11-08_1700.csv',
 '../data/interim/train_2017-11-08_1800.csv',
 '../data/interim/train_2017-11-08_1900.csv',
 '../data/interim/train_2017-11-08_2000.csv',
 '../data/interim/train_2017-11-08

In [2]:
# Columns that pandas should parse as timestamps.
to_parse = ['click_time']

# Columns loaded from the CSV, in file order; everything else is skipped.
to_read = ['ip', 'app', 'device', 'os', 'channel', *to_parse, 'is_attributed']

# Explicit unsigned-integer dtypes for the numeric columns
# ('click_time' is handled via `to_parse` instead).
dtypes = {
    'ip': 'uint32',
    **{column: 'uint16' for column in ('app', 'device', 'os', 'channel')},
    'is_attributed': 'uint8',
}

In [3]:
# Load the sample click log with the configured columns/dtypes, then expose the
# row index as an explicit 'id' column.
df = pd.read_csv(
    '../data/raw/train_sample.csv',
    usecols=to_read,
    dtype=dtypes,
    parse_dates=to_parse,
).assign(id=lambda frame: frame.index)

In [17]:
# Keep only the attributed clicks (is_attributed == 1), reduced to their id and
# click time; .copy() detaches the result from `df`.
df_downloads = df.loc[df['is_attributed'] == 1, ['id', 'click_time']].copy()

In [18]:
# Preview the first rows of the attributed-clicks frame.
df_downloads.head()

Unnamed: 0,id,click_time
284,284,2017-11-08 02:22:13
481,481,2017-11-08 06:10:05
1208,1208,2017-11-07 09:54:22
1341,1341,2017-11-09 10:58:46
1412,1412,2017-11-07 22:19:03


In [4]:
# Build the 'clicks' entity set: one 'clicks' entity backed by `df`, indexed by
# 'id' and time-indexed by 'click_time'.
es = ft.EntitySet(id='clicks')

clicks_variable_types = {
    **dict.fromkeys(['app', 'device', 'os', 'channel'], ft.variable_types.Categorical),
    'is_attributed': ft.variable_types.Boolean,
}
es = es.entity_from_dataframe(
    entity_id='clicks',
    dataframe=df,
    index='id',
    time_index='click_time',
    variable_types=clicks_variable_types,
)

# Split each identifier column out of 'clicks' into its own parent entity
# (no time index on the new entities). Order matches the original calls.
for dimension in ('ip', 'app', 'device', 'channel', 'os'):
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=dimension,
        index=dimension,
        make_time_index=False,
    )

# Disabled experiment: register the attributed clicks (`df_downloads`) as a
# separate 'download' entity related to 'clicks' by 'id'.
# es = es.entity_from_dataframe(
#     entity_id='download',
#     dataframe=df_downloads,
#     index='id',
#     time_index='click_time'
# )

# new_relationship = ft.Relationship(es['download']['id'], es['clicks']['id'])
# es = es.add_relationship(new_relationship)

es.add_last_time_indexes()

# Flag True as the interesting value of 'is_attributed'
# (presumably so DFS can condition features on it -- TODO confirm).
es['clicks']['is_attributed'].interesting_values = [True]

In [13]:
def create_features(entityset, target_entity, cutoff_time, training_window):
    """Run ``ft.dfs`` for one target entity and return only the feature matrix.

    Args:
        entityset: Entity set handed to ``ft.dfs`` as ``entityset``.
        target_entity: Id of the entity to build features for.
        cutoff_time: Cutoff passed through to ``ft.dfs`` unchanged.
        training_window: Training window passed through to ``ft.dfs`` unchanged.

    Returns:
        The first element of the ``ft.dfs`` result (the feature matrix); the
        second element (the feature definitions) is discarded.
    """
    dfs_options = dict(
        entityset=entityset,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
        agg_primitives=[PercentTrue],  # the only aggregation used
        max_depth=3,
        # features_only=True is intentionally left disabled
    )
    matrix, _definitions = ft.dfs(**dfs_options)
    return matrix

In [16]:
# Entities to build attributed-rate features for. Only 'channel' is enabled;
# the full set of normalized entities is ['ip', 'app', 'device', 'os', 'channel'].
target_entities = ['channel']
training_windows = ['1 day']

# Use pd.Timestamp rather than `datetime.datetime(...)`: the import cell does
# `from datetime import datetime`, so `datetime.datetime` is an AttributeError
# unless some later import happens to rebind the name to the module.
cutoff_time = pd.Timestamp('2017-11-07 16:00')
# File-name stamp derived from the cutoff, e.g. '2017-11-07_1600', so the two
# cannot drift apart.
cutoff_label = cutoff_time.strftime('%Y-%m-%d_%H%M')

for target_entity in target_entities:
    features_dir = f"../data/interim/features/{target_entity}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(features_dir, exist_ok=True)
    for window_spec in training_windows:
        training_window = ft.Timedelta(window_spec)
        feature_matrix = create_features(
            es,
            target_entity=target_entity,
            cutoff_time=cutoff_time,
            training_window=training_window,
        )
        # e.g. '1 day' -> '1day'; used in both column names and the file name.
        tw_suffix = training_window.get_name().replace(' ', '').lower()
        # Suffix every feature with its entity and window so matrices from
        # different runs can be joined without column clashes.
        feature_matrix.columns = [
            str(col) + f"_{target_entity}_{tw_suffix}" for col in feature_matrix.columns
        ]
        output_file = f"{features_dir}/train_{cutoff_label}_{target_entity}_{tw_suffix}_attributed_features.csv"
        print(f"Writing {output_file}")
        feature_matrix.to_csv(output_file)
        # Release the matrix before the next iteration to cap kernel memory.
        del feature_matrix
        gc.collect()

Writing ../data/interim/features/channel/train_2017-11-07_1600_channel_1day_attributed_features.csv
