In [17]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob
from itertools import combinations

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [18]:
def create_entityset(filename, target_entity):
    """Load one partition file and wrap it in a featuretools EntitySet.

    The frame read from ``filename`` gets a fresh 0..n-1 index, and that index
    is copied into an ``id`` column which serves as the index of the ``clicks``
    entity (``click_time`` is its time index).  A second entity is then split
    off ``clicks`` on the column named ``target_entity``, and last-time indexes
    are added to the entity set.

    Parameters
    ----------
    filename : str
        Path of an HDF file readable by ``pd.read_hdf``.
    target_entity : str
        Name of the column to normalize into its own entity; it is also used
        as the new entity's id.
        NOTE(review): for composite keys the file presumably already contains
        a column with that joined name -- nothing is derived here; confirm.

    Returns
    -------
    The populated EntitySet.
    """
    clicks = pd.read_hdf(filename).reset_index(drop=True)
    # Expose the positional index as an explicit column for use as entity index.
    clicks['id'] = clicks.index

    # The five key columns are categorical; the label column is boolean.
    column_types = {
        name: ft.variable_types.Categorical
        for name in ('ip', 'app', 'device', 'os', 'channel')
    }
    column_types['is_attributed'] = ft.variable_types.Boolean

    es = (
        ft.EntitySet(id='clicks')
        .entity_from_dataframe(
            entity_id='clicks',
            dataframe=clicks,
            index='id',
            time_index='click_time',
            variable_types=column_types,
        )
        .normalize_entity(
            base_entity_id='clicks',
            new_entity_id=target_entity,
            index=target_entity,
            make_time_index=False,
        )
    )
    es.add_last_time_indexes()
    return es

def calc_feature_matrix(es, target_entity, cutoff_time, training_window):
    """Run Deep Feature Synthesis on one entity set and return its feature matrix.

    Only aggregation primitives are used (``Count`` and ``AvgTimeBetween``),
    with no transform primitives, at depth 1 and with ``max_features=-1``.
    The feature definitions that ``ft.dfs`` returns alongside the matrix are
    discarded.

    Parameters
    ----------
    es : EntitySet to synthesize features from.
    target_entity : id of the entity the features are computed for.
    cutoff_time : forwarded to ``ft.dfs`` as ``cutoff_time``.
    training_window : forwarded to ``ft.dfs`` as ``training_window``.

    Returns
    -------
    The feature matrix (first element of the ``ft.dfs`` result).
    """
    # Wider primitive set kept for reference:
    #   [Count, AvgTimeBetween, Trend, NUnique, Skew, Std, Median]
    dfs_options = dict(
        entityset=es,
        target_entity=target_entity,
        trans_primitives=[],
        agg_primitives=[Count, AvgTimeBetween],
        cutoff_time=cutoff_time,
        training_window=training_window,
        max_features=-1,
        max_depth=1,
    )
    matrix, _feature_defs = ft.dfs(**dfs_options)
    return matrix


def create_features(entity_sets, target_entity, cutoff_time, training_window):
    """Compute feature matrices for every entity set and stack them into one frame.

    ``calc_feature_matrix`` is mapped over ``entity_sets``, the mapped
    collection is computed, and the resulting frames are concatenated.  Every
    column is then renamed to ``<original>_<target_entity>_<window>``, where
    ``<window>`` is ``training_window.get_name()`` lower-cased with spaces
    removed.

    Parameters
    ----------
    entity_sets : collection exposing ``.map(func, **kwargs)`` whose result
        exposes ``.compute()`` (the notebook passes a dask bag).
    target_entity : str, forwarded to ``calc_feature_matrix`` and used in the
        column suffix.
    cutoff_time : forwarded to ``calc_feature_matrix``.
    training_window : object with ``get_name()``; forwarded to
        ``calc_feature_matrix``.

    Returns
    -------
    pandas.DataFrame with the concatenated, suffixed features.
    """
    window_tag = training_window.get_name().replace(' ', '').lower()
    column_suffix = f"_{target_entity}_{window_tag}"

    lazy_matrices = entity_sets.map(
        calc_feature_matrix,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    partial_matrices = lazy_matrices.compute()

    feature_matrix = pd.concat(partial_matrices)
    feature_matrix.columns = [
        str(name) + column_suffix for name in feature_matrix.columns
    ]

    # Release the per-partition results before handing back the combined frame.
    del partial_matrices, lazy_matrices
    gc.collect()
    return feature_matrix

In [19]:
# Base key columns from which the grouping keys are built.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Single keys are stored as plain strings; 'device' on its own is skipped.
target_entities = [key for key in target_entities_init if key != 'device']

# Pairs and triples are stored as lists; any combination containing 'ip' is
# skipped.  4-way combinations are not generated.
for group_size in (2, 3):
    target_entities.extend(
        list(combo)
        for combo in combinations(target_entities_init, group_size)
        if 'ip' not in combo
    )

In [20]:
# Driver cell: for every target entity, build lazy entity sets from its
# partition files and write one feature file per (cutoff time, training window).

# Debug override: uncomment to process a single entity.
# target_entities = ['channel']
feature_name_suffix = 'count'
# training_windows = ['1 hour', '1 day']
training_windows = ['1 hour']

# The top-of-notebook import binds `datetime` to the *class*
# (`from datetime import datetime`), so `datetime.datetime(...)` only resolves
# if something else rebinds the name to the module.  Use an explicit module
# alias so this cell does not depend on that.
import datetime as _dt

train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
test_summary = pd.read_csv('../data/interim/day_hour_test.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
train_summary = pd.concat([train_summary, test_summary], ignore_index=True)

# First summary row for day 7, hour 17; every row from there on yields a cutoff.
# Raises IndexError if no such row exists.
start_row = train_summary[(train_summary['day'] == 7) & (train_summary['hour'] == 17)].index[0]
# int() guards against numpy/float scalars, which datetime() may reject.
cutoff_times = [
    _dt.datetime(2017, 11, int(day), int(hour), 0)
    for day, hour in zip(
        train_summary['day'].iloc[start_row:],
        train_summary['hour'].iloc[start_row:],
    )
]
# Debug override: uncomment to process a single cutoff.
# cutoff_times = [_dt.datetime(2017, 11, 10, 10)]


for target_entity in target_entities:
    # Composite keys are lists of column names; join them into one name.
    target_entity_name = target_entity if isinstance(target_entity, str) else "_".join(target_entity)
    print(f"Processing {target_entity_name}")
    features_dir = f"../data/interim/features/{target_entity_name}"
    os.makedirs(features_dir, exist_ok=True)
    # Sorted so the row order of the concatenated output does not depend on
    # filesystem listing order.
    filenames = sorted(glob(f"../data/interim/partitioned_all/{target_entity_name}/train_*.hdf.compress"))
    b = bag.from_sequence(filenames)
    # NOTE(review): the bag is lazy, so every compute() inside create_features
    # re-runs create_entityset (re-reading each partition) for each cutoff
    # time -- consider persisting the entity sets if memory allows.
    entity_sets = b.map(create_entityset, target_entity_name)

    for cutoff_time in cutoff_times:
        for training_window in training_windows:
            tw_suffix = training_window.replace(' ', '').lower()
            feature_matrix = create_features(
                entity_sets,
                target_entity=target_entity_name,
                cutoff_time=cutoff_time,
                training_window=ft.Timedelta(training_window),
            )

            output_file = f"{features_dir}/features_{cutoff_time.strftime('%Y-%m-%d_%H%M')}_{tw_suffix}_{feature_name_suffix}.hdf.compress"
            print(f"Writing {output_file}")
            # `key` is passed by keyword: positional use is deprecated in pandas 2.x.
            feature_matrix.to_hdf(output_file, key='features', mode='w', complib='blosc', fletcher32=True, complevel=9)
            # Free the frame before the next iteration to keep peak memory down.
            del feature_matrix
            gc.collect()

    del b, entity_sets
    gc.collect()

Processing app
Writing ../data/interim/features/app/features_2017-11-07_1800_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-07_1900_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-07_2000_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-07_2100_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-07_2200_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-07_2300_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-08_0000_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-08_0100_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-08_0200_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-08_0300_1hour_count.hdf.compress
Writing ../data/interim/features/app/features_2017-11-08_0400_1hour_count.hdf.compress
Writing ../data/interim/feat

Process ForkPoolWorker-2669:
Process ForkPoolWorker-2687:
Process ForkPoolWorker-2682:
Process ForkPoolWorker-2675:
Process ForkPoolWorker-2688:
Process ForkPoolWorker-2684:
Process ForkPoolWorker-2674:
Process ForkPoolWorker-2680:
Process ForkPoolWorker-2686:
Process ForkPoolWorker-2681:
Process ForkPoolWorker-2678:
Process ForkPoolWorker-2661:
Process ForkPoolWorker-2683:
Process ForkPoolWorker-2685:
Process ForkPoolWorker-2664:
Process ForkPoolWorker-2657:
Process ForkPoolWorker-2679:
Process ForkPoolWorker-2660:
Process ForkPoolWorker-2671:
Process ForkPoolWorker-2659:
Process ForkPoolWorker-2667:
Process ForkPoolWorker-2662:
Process ForkPoolWorker-2673:
Process ForkPoolWorker-2677:
Process ForkPoolWorker-2666:
Process ForkPoolWorker-2672:
Process ForkPoolWorker-2658:
Process ForkPoolWorker-2670:
Process ForkPoolWorker-2665:
Process ForkPoolWorker-2676:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent ca

KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
 

  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "

KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrup

In [12]:
# Spot-check: load one written feature file (channel entity, cutoff
# 2017-11-10 10:00, 1-hour window) to inspect its contents.
# NOTE(review): this cell's execution count is lower than the cells above,
# so it was run out of order -- re-run it after the generation loop.
df = pd.read_hdf('../data/interim/features/channel/features_2017-11-10_1000_1hour_count.hdf.compress')

In [13]:
# Show the feature names; each carries the "_<entity>_<window>" suffix that
# create_features appends to every column.
df.columns

Index(['COUNT(clicks)_channel_1hour', 'AVG_TIME_BETWEEN(clicks.click_time)_channel_1hour'], dtype='object')