In [2]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob
from itertools import combinations

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [3]:
def create_entityset(filename, target_entity):
    """Build a ``clicks`` EntitySet from a single HDF partition file.

    The frame stored in ``filename`` is loaded, re-indexed from zero and given
    an ``id`` column mirroring that new index. It is registered as the
    ``clicks`` entity (index ``id``, time index ``click_time``), after which
    two entities are normalized out of it: one keyed on ``target_entity`` and
    one keyed on ``hour``.

    Args:
        filename: Path passed to ``pd.read_hdf``.
        target_entity: Column name used both as the id and as the index of
            the first normalized entity.

    Returns:
        The EntitySet, with last time indexes added and ``True`` registered
        as the interesting value of ``clicks.is_attributed``.
    """
    clicks = pd.read_hdf(filename).reset_index(drop=True)
    clicks['id'] = clicks.index

    # Every raw click attribute is categorical; only the label is boolean.
    categorical_columns = ('ip', 'app', 'device', 'os', 'channel')
    column_types = {name: ft.variable_types.Categorical for name in categorical_columns}
    column_types['is_attributed'] = ft.variable_types.Boolean

    entityset = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=column_types,
    )

    # Target entity first, then 'hour' -- same order as the two explicit calls.
    for key_column in (target_entity, 'hour'):
        entityset = entityset.normalize_entity(
            base_entity_id='clicks',
            new_entity_id=key_column,
            index=key_column,
            make_time_index=False,
        )

    entityset.add_last_time_indexes()
    # Feeds the WHERE-clause primitives requested in calc_feature_matrix.
    entityset['clicks']['is_attributed'].interesting_values = [True]
    return entityset

def calc_feature_matrix(es, target_entity, cutoff_time, training_window):
    """Run Deep Feature Synthesis on one EntitySet and return its feature matrix.

    Args:
        es: EntitySet handed to ``ft.dfs`` as ``entityset``.
        target_entity: Id of the entity the features are computed for.
        cutoff_time: Forwarded unchanged to ``ft.dfs``.
        training_window: Forwarded unchanged to ``ft.dfs``.

    Returns:
        The first element of the ``ft.dfs`` result (the feature matrix); the
        second element is discarded.
    """
    aggregations = [Count, PercentTrue, AvgTimeBetween, Skew, Std, Median]
    conditional_aggregations = [Count, PercentTrue, AvgTimeBetween]

    dfs_options = dict(
        entityset=es,
        target_entity=target_entity,
        trans_primitives=[],  # no transform primitives are requested
        agg_primitives=aggregations,
        where_primitives=conditional_aggregations,
        cutoff_time=cutoff_time,
        training_window=training_window,
        max_features=-1,  # presumably "no cap" -- confirm against featuretools docs
        max_depth=3,
    )
    matrix, _feature_defs = ft.dfs(**dfs_options)

    return matrix


def create_features(entity_sets, target_entity, cutoff_time, training_window):
    """Compute and stack the feature matrices of every EntitySet in a bag.

    ``calc_feature_matrix`` is mapped over ``entity_sets``, the bag is
    computed, and the resulting frames are concatenated row-wise. Every column
    of the combined frame is then renamed to
    ``<original>_<target_entity>_<window>``, where ``<window>`` is
    ``training_window.get_name()`` lower-cased with spaces removed.

    Args:
        entity_sets: Object exposing ``map(...)`` whose result exposes
            ``compute()`` (a dask bag in the driver cell).
        target_entity: Forwarded to ``calc_feature_matrix`` and used in the
            column suffix.
        cutoff_time: Forwarded to ``calc_feature_matrix``.
        training_window: Object with ``get_name()``; forwarded to
            ``calc_feature_matrix``.

    Returns:
        The concatenated, column-suffixed DataFrame.
    """
    window_tag = training_window.get_name().replace(' ', '').lower()

    pending = entity_sets.map(
        calc_feature_matrix,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    partial_matrices = pending.compute()
    combined = pd.concat(partial_matrices)

    column_suffix = f"_{target_entity}_{window_tag}"
    combined.columns = [str(column) + column_suffix for column in combined.columns]

    # Drop the per-partition frames before returning to keep peak memory down.
    del partial_matrices, pending
    gc.collect()
    return combined

In [16]:
# Base click attributes from which the target entities are built.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Single attributes are kept as plain strings; pairs and triples are stored
# as lists of attribute names, in itertools.combinations order.
target_entities = list(target_entities_init)
for group_size in (2, 3):
    target_entities.extend(
        list(combo) for combo in combinations(target_entities_init, group_size)
    )

# Entities dropped from the run: 'device' alone, most pairs involving 'ip',
# and every triple involving 'ip'.
excluded_entities = [
    'device',
    ['app', 'ip'],
    ['os', 'ip'],
    ['channel', 'ip'],
    ['app', 'os', 'ip'],
    ['app', 'device', 'ip'],
    ['app', 'channel', 'ip'],
    ['app', 'ip', 'hour'],
    ['device', 'os', 'ip'],
    ['device', 'channel', 'ip'],
    ['device', 'ip', 'hour'],
    ['os', 'channel', 'ip'],
    ['os', 'ip', 'hour'],
    ['channel', 'ip', 'hour'],
]
# list.remove raises ValueError if an exclusion is not present.
for excluded in excluded_entities:
    target_entities.remove(excluded)

In [17]:
# Display the final list of target entities after the exclusions were applied.
target_entities

['app',
 'os',
 'channel',
 'ip',
 'hour',
 ['app', 'device'],
 ['app', 'os'],
 ['app', 'channel'],
 ['app', 'hour'],
 ['device', 'os'],
 ['device', 'channel'],
 ['device', 'ip'],
 ['device', 'hour'],
 ['os', 'channel'],
 ['os', 'hour'],
 ['channel', 'hour'],
 ['ip', 'hour'],
 ['app', 'device', 'os'],
 ['app', 'device', 'channel'],
 ['app', 'device', 'hour'],
 ['app', 'os', 'channel'],
 ['app', 'os', 'hour'],
 ['app', 'channel', 'hour'],
 ['device', 'os', 'channel'],
 ['device', 'os', 'hour'],
 ['device', 'channel', 'hour'],
 ['os', 'channel', 'hour']]

In [18]:
# pbar = ProgressBar()
# pbar.register()

# Local module import: the notebook's import cell binds the bare name
# `datetime` to the class (`from datetime import datetime`), so writing
# `datetime.datetime(...)` only works if some other import happens to rebind
# that name. Using an explicit module alias removes that hidden dependency.
import datetime as dt

# Checkpoint for resuming an interrupted run: every entry of
# `target_entities` up to AND including this one is skipped.
# Set to None to process all target entities from the start.
resume_after = ['app', 'os', 'hour']

# target_entities = [['channel', 'hour']]
feature_name_suffix = 'attributed6'
training_windows = ['7 day']
cutoff_times = [
    dt.datetime(2017, 11, 7, 17, 0),
    dt.datetime(2017, 11, 8, 17, 0),
    dt.datetime(2017, 11, 9, 17, 0),
]

if resume_after is None:
    pending_entities = list(target_entities)
elif resume_after in target_entities:
    pending_entities = target_entities[target_entities.index(resume_after) + 1:]
else:
    # Same outcome as the former flag-based skip (nothing runs), but visible.
    print(f"Resume marker {resume_after!r} not in target_entities; nothing to process")
    pending_entities = []

for target_entity in pending_entities:
    # Single attributes are strings; combinations are lists joined with '_'.
    target_entity_name = target_entity if isinstance(target_entity, str) else "_".join(target_entity)
    print(f"Processing {target_entity_name}")
    features_dir = f"../data/interim/features/{target_entity_name}"
    os.makedirs(features_dir, exist_ok=True)
    # Sorted so the partition order does not depend on directory listing
    # order (presumably this also fixes the row order of the stacked matrix).
    filenames = sorted(glob(f"../data/interim/partitioned3/{target_entity_name}/train_*.hdf.compress"))
    b = bag.from_sequence(filenames)
    # One EntitySet per partition file, built lazily by dask.
    entity_sets = b.map(create_entityset, target_entity_name)

    for cutoff_time in cutoff_times:
        for training_window in training_windows:
            tw_suffix = training_window.replace(' ', '').lower()
            feature_matrix = create_features(
                entity_sets,
                target_entity=target_entity_name,
                cutoff_time=cutoff_time,
                training_window=ft.Timedelta(training_window),
            )
            output_file = f"{features_dir}/features_{cutoff_time.strftime('%Y-%m-%d_%H%M')}_{tw_suffix}_{feature_name_suffix}.hdf.compress"
            print(f"Writing {output_file}")
            feature_matrix.to_hdf(output_file, 'features', mode='w', complib='blosc', fletcher32=True, complevel=9)
            # Release the matrix before computing the next one.
            del feature_matrix
            gc.collect()

    del b, entity_sets
    gc.collect()

Processing app_channel_hour
Writing ../data/interim/features/app_channel_hour/features_2017-11-08_1700_7day_attributed6.hdf.compress
Writing ../data/interim/features/app_channel_hour/features_2017-11-09_1700_7day_attributed6.hdf.compress
Processing device_os_channel
Writing ../data/interim/features/device_os_channel/features_2017-11-07_1700_7day_attributed6.hdf.compress
Writing ../data/interim/features/device_os_channel/features_2017-11-08_1700_7day_attributed6.hdf.compress
Writing ../data/interim/features/device_os_channel/features_2017-11-09_1700_7day_attributed6.hdf.compress
Processing device_os_hour
Writing ../data/interim/features/device_os_hour/features_2017-11-07_1700_7day_attributed6.hdf.compress
Writing ../data/interim/features/device_os_hour/features_2017-11-08_1700_7day_attributed6.hdf.compress
Writing ../data/interim/features/device_os_hour/features_2017-11-09_1700_7day_attributed6.hdf.compress
Processing device_channel_hour
Writing ../data/interim/features/device_channel_h

In [6]:
# Load one of the feature files written by the generation loop
# (channel_hour, cutoff 2017-11-09 17:00, 7-day window) for inspection.
df = pd.read_hdf('../data/interim/features/channel_hour/features_2017-11-09_1700_7day_attributed6.hdf.compress')

In [7]:
# List the generated feature names; each carries the
# `_<target_entity>_<window>` suffix appended in create_features.
df.columns

Index(['COUNT(clicks)_channel_hour_7days',
       'PERCENT_TRUE(clicks.is_attributed)_channel_hour_7days',
       'AVG_TIME_BETWEEN(clicks.click_time)_channel_hour_7days',
       'COUNT(clicks WHERE is_attributed = True)_channel_hour_7days',
       'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_channel_hour_7days',
       'SKEW(clicks.hour.COUNT(clicks))_channel_hour_7days',
       'SKEW(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_channel_hour_7days',
       'SKEW(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_channel_hour_7days',
       'STD(clicks.hour.COUNT(clicks))_channel_hour_7days',
       'STD(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_channel_hour_7days',
       'STD(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_channel_hour_7days',
       'MEDIAN(clicks.hour.COUNT(clicks))_channel_hour_7days',
       'MEDIAN(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_channel_hour_7days',
       'MEDIAN(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_channe