In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob

from featuretools.primitives import *

In [2]:
# Load the raw training clicks from the compressed HDF store into `df`.
raw_train_path = '../data/raw/train.hdf.compress'
df = pd.read_hdf(raw_train_path)

In [3]:
# Copy the frame's index into an explicit 'id' column (mutates `df` in place).
# create_entityset() passes index='id' when registering the 'clicks' entity,
# so this column must exist before that function is called.
# NOTE(review): presumably the index is unique per click -- confirm.
df['id'] = df.index

In [4]:
def create_entityset(df, target_entity):
    """Assemble the 'clicks' EntitySet and split out ``target_entity`` as its own entity.

    Args:
        df: DataFrame registered as the ``clicks`` entity. ``id`` is used as
            its index and ``click_time`` as its time index; variable types are
            declared for ``ip``, ``app``, ``device``, ``os``, ``channel``
            (categorical) and ``is_attributed`` (boolean).
        target_entity: Column name normalized out of ``clicks``; the same
            string is used as both the new entity's id and its index.

    Returns:
        The EntitySet after last-time indexes have been added and ``True`` has
        been set as the interesting value of ``clicks.is_attributed``.
    """
    es = ft.EntitySet(id='clicks')

    # Every id-like column is categorical; the label column is boolean.
    categorical_columns = ('ip', 'app', 'device', 'os', 'channel')
    variable_types = {
        column: ft.variable_types.Categorical for column in categorical_columns
    }
    variable_types['is_attributed'] = ft.variable_types.Boolean

    es = es.entity_from_dataframe(
        entity_id='clicks',
        dataframe=df,
        index='id',
        time_index='click_time',
        variable_types=variable_types,
    )

    # Derive the per-target entity from the clicks table, without a time index.
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=target_entity,
        index=target_entity,
        make_time_index=False,
    )

    es.add_last_time_indexes()
    es['clicks']['is_attributed'].interesting_values = [True]
    return es

In [5]:
def create_features(entityset, target_entity, cutoff_time, training_window):
    """Run ``ft.dfs`` for ``target_entity`` and return only the feature matrix.

    Args:
        entityset: EntitySet passed through to ``ft.dfs``.
        target_entity: Id of the entity to build features for.
        cutoff_time: Cutoff time passed through to ``ft.dfs``.
        training_window: Training window passed through to ``ft.dfs``.

    Returns:
        The first element of the pair returned by ``ft.dfs`` (the feature
        matrix); the second element (feature definitions) is discarded.
    """
    # Only PercentTrue is aggregated. Trend, TimeSinceLast, AvgTimeBetween and
    # Mode were previously listed as alternative agg/where primitives and are
    # disabled, as is features_only=True.
    dfs_options = dict(
        entityset=entityset,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
        agg_primitives=[PercentTrue],
        max_depth=3,
    )
    feature_matrix, _feature_defs = ft.dfs(**dfs_options)
    return feature_matrix

In [8]:
%%time
# For each target entity: build an EntitySet, compute features for every
# training window as of `cutoff_time`, and write one HDF file per
# (entity, window) pair under ../data/interim/features/<entity>/.
target_entities = ['app', 'device', 'os', 'channel', 'ip']
# target_entities = ['channel']
training_windows = ['1 day']

# Use pd.Timestamp instead of `datetime.datetime(...)`: the import cell binds
# the bare name `datetime` via `from datetime import datetime` (and the
# wildcard featuretools import may rebind it), so the attribute form is not
# reliable on a fresh kernel.
cutoff_time = pd.Timestamp('2017-11-07 17:00')
# Derive the filename tag from the cutoff so the two cannot drift apart;
# for the value above this is '2017-11-07_1700'.
cutoff_label = cutoff_time.strftime('%Y-%m-%d_%H%M')

for target_entity in target_entities:
    print(f"Processing {target_entity}")
    print("Creating entity set")
    es = create_entityset(df, target_entity)
    features_dir = f"../data/interim/features/{target_entity}"
    # exist_ok avoids the check-then-create race and makes re-runs safe.
    os.makedirs(features_dir, exist_ok=True)

    for window_spec in training_windows:
        # Keep the raw string and the ft.Timedelta under separate names
        # rather than rebinding the loop variable to a different type.
        training_window = ft.Timedelta(window_spec)
        print("Creating features")
        feature_matrix = create_features(
            es,
            target_entity=target_entity,
            cutoff_time=cutoff_time,
            training_window=training_window,
        )
        # e.g. '1day'; appended to column names and to the output filename.
        tw_suffix = training_window.get_name().replace(' ', '').lower()
        feature_matrix.columns = [
            f"{col!s}_{target_entity}_{tw_suffix}" for col in feature_matrix.columns
        ]
        output_file = f"{features_dir}/train_{cutoff_label}_{tw_suffix}_attributed.hdf.compress"
        print(f"Writing {output_file}")
        # `key` is passed by keyword: newer pandas deprecates positional
        # arguments to to_hdf other than the path.
        feature_matrix.to_hdf(
            output_file,
            key='features',
            mode='w',
            complib='blosc',
            fletcher32=True,
            complevel=9,
        )
        # Release the matrix before the next window to cap peak memory.
        del feature_matrix
        gc.collect()
    # Release the EntitySet before building the next one.
    del es
    gc.collect()

Processing app
Creating entity set
Creating features
Writing ../data/interim/features/app/train_2017-11-07_1700_1day_attributed.hdf.compress
Processing device
Creating entity set
Creating features


KeyboardInterrupt: 

In [7]:
# Preview the first rows of the saved 1-day feature file for the 'channel' entity.
channel_features_path = '../data/interim/features/channel/train_2017-11-07_1700_1day_attributed.hdf.compress'
pd.read_hdf(channel_features_path).head()

Unnamed: 0_level_0,PERCENT_TRUE(clicks.is_attributed)_channel_1day
channel,Unnamed: 1_level_1
0,0.068773
3,0.00042
4,0.068111
5,0.486134
13,0.000413
