In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft
import dask
import dask.dataframe as dd

from datetime import datetime
from glob import glob
from itertools import combinations

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [2]:
def create_entityset(df, target_entity, target_entity_name):
    """Build a 'clicks' EntitySet from ``df`` and normalize out the target entity.

    If ``target_entity_name`` is not already a column of ``df``, a composite
    key column with that name is added to ``df`` IN PLACE by joining the
    string form of each column in ``target_entity`` with ``"_"``.

    Args:
        df: DataFrame of click records. The code reads the columns 'id',
            'click_time', 'ip', 'app', 'device', 'os', 'channel' and
            'is_attributed', plus every column named in ``target_entity``.
        target_entity: A column name, or a sequence of column names, that
            together identify the target entity. Any number of columns is
            supported (the previous implementation handled only 2-4 and
            silently produced no key column otherwise).
        target_entity_name: Name of the key column and id of the new entity
            created by ``normalize_entity``.

    Returns:
        The EntitySet with a 'clicks' entity, a normalized
        ``target_entity_name`` entity, last-time indexes added, and
        ``[True]`` set as the interesting values of 'is_attributed'.

    Raises:
        ValueError: If the key column must be built but ``target_entity``
            is empty.
    """
    if target_entity_name not in df.columns:
        # Accept a bare column name as well as a sequence of names.
        if isinstance(target_entity, str):
            key_columns = [target_entity]
        else:
            key_columns = list(target_entity)
        if not key_columns:
            raise ValueError(
                f"Cannot build key column {target_entity_name!r}: target_entity is empty"
            )

        # Left-to-right concatenation, identical to the former hand-written
        # 2/3/4-column expressions but valid for any column count.
        composite_key = df[key_columns[0]].astype(str)
        for column in key_columns[1:]:
            composite_key = composite_key + "_" + df[column].astype(str)
        df[target_entity_name] = composite_key

    es = ft.EntitySet(id='clicks')
    es = es.entity_from_dataframe(
        entity_id='clicks',
        dataframe=df,
        index='id',
        time_index='click_time',
        variable_types={
            'ip': ft.variable_types.Categorical,
            'app': ft.variable_types.Categorical,
            'device': ft.variable_types.Categorical,
            'os': ft.variable_types.Categorical,
            'channel': ft.variable_types.Categorical,
            'is_attributed': ft.variable_types.Boolean,
        }
    )
    # Split the key column out into its own entity; no time index is created for it.
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=target_entity_name,
        index=target_entity_name,
        make_time_index=False,
    )
    es.add_last_time_indexes()
    es['clicks']['is_attributed'].interesting_values = [True]
    return es

def calc_feature_matrix(es, target_entity, cutoff_time, training_window):
    """Run deep feature synthesis on ``es`` and return only the feature matrix.

    Args:
        es: EntitySet passed to ``ft.dfs`` as ``entityset``.
        target_entity: Id of the entity to build features for.
        cutoff_time: Accepted for interface compatibility; NOT forwarded to
            ``ft.dfs`` at present.
        training_window: Accepted for interface compatibility; NOT forwarded
            to ``ft.dfs`` at present.

    Returns:
        The first element of the ``ft.dfs`` result (the feature matrix); the
        second element is discarded.
    """
    # Options currently switched off (kept here for reference):
    #   agg_primitives=[PercentTrue, Trend, TimeSinceLast, AvgTimeBetween, Mode, Count]
    #   where_primitives=[Trend, TimeSinceLast, AvgTimeBetween, Mode, Count]
    #   cutoff_time=cutoff_time
    #   training_window=training_window
    dfs_options = {
        'entityset': es,
        'target_entity': target_entity,
        'trans_primitives': [Hour],
        'max_features': -1,
        'max_depth': 3,
    }
    matrix, _feature_defs = ft.dfs(**dfs_options)
    return matrix


def create_features(entity_sets, target_entity, cutoff_time, training_window):
    """Compute one feature matrix per entity set and stack them into one frame.

    Args:
        entity_sets: Object exposing ``.map(func, **kwargs)`` whose result has
            ``.compute()`` returning an iterable of DataFrames (the caller in
            this notebook passes a dask bag of EntitySets).
        target_entity: Entity id forwarded to ``calc_feature_matrix``; also
            embedded in every output column name.
        cutoff_time: Forwarded unchanged to ``calc_feature_matrix``.
        training_window: Object with ``get_name()``; forwarded to
            ``calc_feature_matrix`` and used to derive the column suffix.

    Returns:
        The concatenated feature matrix whose columns are renamed to
        ``"<col>_<target_entity>_<window>"``, where ``<window>`` is
        ``training_window.get_name()`` lower-cased with spaces removed.
    """
    window_tag = training_window.get_name().replace(' ', '').lower()

    lazy_matrices = entity_sets.map(
        calc_feature_matrix,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    computed_parts = lazy_matrices.compute()
    combined = pd.concat(computed_parts)

    # Tag every column with the entity and window it was computed for.
    column_suffix = f"_{target_entity}_{window_tag}"
    combined.columns = [f"{name!s}{column_suffix}" for name in combined.columns]

    # Drop the intermediate per-partition results before returning.
    del computed_parts, lazy_matrices
    gc.collect()
    return combined

In [3]:
# Create a dask ProgressBar and register it so that later dask computations
# in this notebook report their progress.
pbar = ProgressBar()
pbar.register()

In [4]:
# Load the training data from the HDF file under ../data/raw.
df = pd.read_hdf('../data/raw/train.hdf.compress')
# Expose the frame's index as an explicit 'id' column; create_entityset uses
# 'id' as the index of the 'clicks' entity.
df['id'] = df.index

In [5]:
# Base columns from which grouping keys are built.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip']

# Every single column (as a plain string) followed by every unordered pair
# (as a two-element list). 3- and 4-column combinations are not generated.
target_entities = [
    *(single for (single,) in combinations(target_entities_init, 1)),
    *(list(pair) for pair in combinations(target_entities_init, 2)),
]

In [6]:
# --- Run configuration -------------------------------------------------------
feature_name_suffix = 'attributed'
training_windows = ['1 day']
# Built with pd.Timestamp rather than `datetime.datetime(...)`: the notebook
# imports `from datetime import datetime`, so `datetime.datetime` only resolves
# if a later star import happens to rebind `datetime` to the module.
cutoff_time = pd.Timestamp('2017-11-08 17:00')
train_filename = 'features_2017-11-08_1700'
# Overrides any previously built list: only the (os, channel) pair is processed.
target_entities = [['os', 'channel']]

for target_entity in target_entities:
    # A single column is given as a string; a multi-column key as a list of names.
    target_entity_name = target_entity if isinstance(target_entity, str) else "_".join(target_entity)
    print(f"Processing {target_entity_name}")
    features_dir = f"../data/interim/features/{target_entity_name}"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(features_dir, exist_ok=True)

    # One bag partition (and therefore one EntitySet) per group of the key.
    dfg = df.groupby(target_entity)
    b = bag.from_sequence([g for _, g in dfg], npartitions=len(dfg))
    entity_sets = b.map(create_entityset, target_entity, target_entity_name)

    for training_window in training_windows:
        tw_suffix = training_window.replace(' ', '').lower()
        feature_matrix = create_features(
            entity_sets,
            target_entity=target_entity_name,
            cutoff_time=cutoff_time,
            training_window=ft.Timedelta(training_window),
        )
        output_file = f"{features_dir}/{train_filename}_{tw_suffix}_{feature_name_suffix}.hdf.compress"
        print(f"Writing {output_file}")
        feature_matrix.to_hdf(output_file, 'train', mode='w', complib='blosc', fletcher32=True, complevel=9)
        # Free the matrix before the next window is computed.
        del feature_matrix
        gc.collect()

    del dfg, b, entity_sets
    gc.collect()

Processing os_channel
[##################                      ] | 45% Completed |  6min 50.9s

Process ForkPoolWorker-26:
Process ForkPoolWorker-25:


[##################                      ] | 45% Completed |  6min 51.1s

Process ForkPoolWorker-3:





Process ForkPoolWorker-30:
Process ForkPoolWorker-21:
Process ForkPoolWorker-4:
Process ForkPoolWorker-24:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Process ForkPoolWorker-29:
Process ForkPoolWorker-11:
Process ForkPoolWorker-31:
Process ForkPoolWorker-8:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-15:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process ForkPoolWorker-16:
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 2

KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 335, in get
    res = self._reader.recv_bytes()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/sy

KeyboardInterrupt: 

Process ForkPoolWorker-19:
Process ForkPoolWorker-12:
Process ForkPoolWorker-9:
Process ForkPoolWorker-18:
Process ForkPoolWorker-1:
Process ForkPoolWorker-13:
Process ForkPoolWorker-10:
Process ForkPoolWorker-22:
Process ForkPoolWorker-23:
Process ForkPoolWorker-20:
Process ForkPoolWorker-28:
Process ForkPoolWorker-2:
Process ForkPoolWorker-32:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-17:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent cal

  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/opt/conda/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", 