In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob
from itertools import combinations

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [2]:
def create_entityset(filename, target_entity):
    """Load one HDF partition of clicks and wrap it in a featuretools EntitySet.

    Args:
        filename: Path of the HDF file handed to ``pd.read_hdf``.
        target_entity: Name of the column that is split out of ``clicks`` into
            its own entity (used both as the new entity id and as its index).

    Returns:
        The EntitySet holding the ``clicks`` entity, the normalized
        ``target_entity`` entity, and last-time indexes.

    Note:
        Assumes the stored frame has a datetime ``click_time`` column (``.dt``
        is used on it) and a column named ``target_entity``.
    """
    # Discard the stored index so that `id` is a plain 0..n-1 row number.
    clicks = pd.read_hdf(filename).reset_index(drop=True)
    clicks['id'] = clicks.index
    clicks['hour'] = clicks['click_time'].dt.hour

    # Explicit variable types: the key columns are categorical, the label is boolean.
    column_types = {
        name: ft.variable_types.Categorical
        for name in ('ip', 'app', 'device', 'os', 'channel')
    }
    column_types['is_attributed'] = ft.variable_types.Boolean

    entity_set = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=column_types,
    )
    # Only the requested target column is normalized into its own entity;
    # no time index is created for it.
    entity_set = entity_set.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=target_entity,
        index=target_entity,
        make_time_index=False,
    )
    entity_set.add_last_time_indexes()
    return entity_set

def calc_feature_matrix(es, target_entity, cutoff_time, training_window):
    """Run deep feature synthesis on one EntitySet and return its feature matrix.

    Args:
        es: EntitySet passed to ``ft.dfs`` as ``entityset``.
        target_entity: Id of the entity the features are computed for.
        cutoff_time: Accepted for interface compatibility; currently NOT
            forwarded to ``ft.dfs``.
        training_window: Accepted for interface compatibility; currently NOT
            forwarded to ``ft.dfs``.

    Returns:
        The feature matrix produced by ``ft.dfs``; the accompanying feature
        definitions are discarded.
    """
    dfs_options = dict(
        entityset=es,
        target_entity=target_entity,
        # No transform primitives: only aggregations are generated.
        trans_primitives=[],
        # A wider set (Trend, Skew, Std, Median) was tried and is disabled here.
        agg_primitives=[Count, AvgTimeBetween, NUnique],
        max_features=-1,
        max_depth=4,
    )
    matrix, _feature_defs = ft.dfs(**dfs_options)
    return matrix


def create_features(entity_sets, target_entity, cutoff_time, training_window):
    """Compute feature matrices for every entity set and stack them into one frame.

    Args:
        entity_sets: Collection exposing ``.map(func, **kwargs)`` whose result
            exposes ``.compute()`` returning a list of DataFrames.
        target_entity: Entity id forwarded to ``calc_feature_matrix`` and
            embedded in every output column name.
        cutoff_time: Forwarded unchanged to ``calc_feature_matrix``.
        training_window: Forwarded unchanged to ``calc_feature_matrix``; its
            ``get_name()`` (spaces removed, lower-cased) is embedded in every
            output column name.

    Returns:
        A single DataFrame: the per-partition matrices concatenated row-wise,
        with ``_<target_entity>_<window>`` appended to each column label.
    """
    window_tag = training_window.get_name().replace(' ', '').lower()
    column_suffix = f"_{target_entity}_{window_tag}"

    lazy_matrices = entity_sets.map(
        calc_feature_matrix,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    partial_matrices = lazy_matrices.compute()
    combined = pd.concat(partial_matrices)
    combined.columns = [str(label) + column_suffix for label in combined.columns]

    # Drop the per-partition results eagerly to release memory before returning.
    del partial_matrices, lazy_matrices
    gc.collect()
    return combined

In [3]:
# Columns that can act as grouping keys for the aggregated features.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Single-column targets are stored as plain strings; 'device' alone is excluded.
single_targets = [name for name in target_entities_init if name != 'device']

# Two-column targets are stored as lists; every pair involving 'ip' is excluded.
pair_targets = [
    list(pair)
    for pair in combinations(target_entities_init, 2)
    if 'ip' not in pair
]

# Combinations of three or more columns are not generated: the previous
# three-column loop never appended anything, so it has been removed.
target_entities = single_targets + pair_targets

In [4]:
# target_entities = ['app']
# Tag appended to every output file name.
feature_name_suffix = 'countall'
# training_windows = ['1 hour', '1 day']
training_windows = ['7 days']

# Built with pd.Timestamp instead of `datetime.datetime(...)`: the notebook does
# `from datetime import datetime`, so `datetime.datetime` only resolves when a
# later star import happens to rebind `datetime` to the module. pd.Timestamp is
# a datetime subclass and supports the same `strftime` call used below.
cutoff_time = pd.Timestamp(2017, 11, 10, 17, 0)

for target_entity in target_entities:
    # Entries are either a single column name (str) or a list of column names,
    # which is joined with '_' to form the directory / entity name.
    target_entity_name = target_entity if isinstance(target_entity, str) else "_".join(target_entity)
    print(f"Processing {target_entity_name}")
    features_dir = f"../data/interim/features/{target_entity_name}"
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(features_dir, exist_ok=True)
    filenames = glob(f"../data/interim/partitioned_all/{target_entity_name}/train_*.hdf.compress")
    filename_bag = bag.from_sequence(filenames)
    # One entity set per partition file.
    entity_sets = filename_bag.map(create_entityset, target_entity_name)

    for training_window in training_windows:
        tw_suffix = training_window.replace(' ', '').lower()
        feature_matrix = create_features(
            entity_sets,
            target_entity=target_entity_name,
            cutoff_time=cutoff_time,
            training_window=ft.Timedelta(training_window),
        )

        output_file = f"{features_dir}/features_{cutoff_time.strftime('%Y-%m-%d_%H%M')}_{tw_suffix}_{feature_name_suffix}.hdf.compress"
        print(f"Writing {output_file}")
        # `key` is passed by keyword; newer pandas deprecates passing it positionally.
        feature_matrix.to_hdf(output_file, key='features', mode='w', complib='blosc', fletcher32=True, complevel=9)
        # Release the written matrix before building the next one.
        del feature_matrix
        gc.collect()

    del filename_bag, entity_sets
    gc.collect()

Processing app
Writing ../data/interim/features/app/features_2017-11-10_1700_7days_countall.hdf.compress
Processing os
Writing ../data/interim/features/os/features_2017-11-10_1700_7days_countall.hdf.compress
Processing channel
Writing ../data/interim/features/channel/features_2017-11-10_1700_7days_countall.hdf.compress
Processing ip
Writing ../data/interim/features/ip/features_2017-11-10_1700_7days_countall.hdf.compress
Processing hour
Writing ../data/interim/features/hour/features_2017-11-10_1700_7days_countall.hdf.compress
Processing app_device
Writing ../data/interim/features/app_device/features_2017-11-10_1700_7days_countall.hdf.compress
Processing app_os
Writing ../data/interim/features/app_os/features_2017-11-10_1700_7days_countall.hdf.compress
Processing app_channel
Writing ../data/interim/features/app_channel/features_2017-11-10_1700_7days_countall.hdf.compress
Processing app_hour
Writing ../data/interim/features/app_hour/features_2017-11-10_1700_7days_countall.hdf.compress
Pro

In [12]:
# Load a previously written feature file for the 'app' entity to inspect it.
# NOTE(review): this path is tagged `1500`, whereas the generation cell in this
# notebook writes files tagged `1700` — presumably left over from an earlier
# run with a different cutoff; confirm the intended file.
df = pd.read_hdf("../data/interim/features/app/features_2017-11-10_1500_7days_countall.hdf.compress")

In [13]:
# Display the column labels of the loaded feature matrix.
df.columns

Index(['COUNT(clicks)_app_7days',
       'AVG_TIME_BETWEEN(clicks.click_time)_app_7days',
       'NUM_UNIQUE(clicks.ip)_app_7days',
       'NUM_UNIQUE(clicks.device)_app_7days',
       'NUM_UNIQUE(clicks.os)_app_7days',
       'NUM_UNIQUE(clicks.channel)_app_7days'],
      dtype='object')