In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from datetime import datetime
from glob import glob

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [4]:
def create_entityset(filename, target_entity):
    """Load one partition CSV and wrap it in a featuretools EntitySet.

    The CSV is read using the module-level ``to_read``, ``dtypes`` and
    ``to_parse`` settings (defined outside this cell). A surrogate ``id``
    column is copied from the row index and used as the entity index.

    Args:
        filename: Path of the CSV file to load.
        target_entity: Column name that is split out of ``clicks`` into its
            own entity (used as both the new entity id and its index).

    Returns:
        The EntitySet with entity id ``'clicks'``, the normalized
        ``target_entity`` entity, last-time indexes added, and ``True``
        registered as the interesting value of ``is_attributed``.
    """
    clicks = pd.read_csv(filename, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
    clicks['id'] = clicks.index

    # Every raw click attribute is categorical; only the label is boolean.
    categorical = ft.variable_types.Categorical
    column_types = {
        name: categorical
        for name in ('ip', 'app', 'device', 'os', 'channel')
    }
    column_types['is_attributed'] = ft.variable_types.Boolean

    entity_set = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=column_types,
    )
    entity_set = entity_set.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=target_entity,
        index=target_entity,
        make_time_index=False,
    )
    entity_set.add_last_time_indexes()

    # Interesting values are set on is_attributed == True; calc_feature_matrix
    # passes where_primitives to ft.dfs.
    entity_set['clicks']['is_attributed'].interesting_values = [True]
    return entity_set

def calc_feature_matrix(es, target_entity, cutoff_time, training_window):
    """Run deep feature synthesis on one EntitySet and return the matrix.

    Args:
        es: EntitySet to synthesize features from.
        target_entity: Id of the entity the features are computed for.
        cutoff_time: Cutoff time forwarded to ``ft.dfs``.
        training_window: Training window forwarded to ``ft.dfs``.

    Returns:
        The first element of the ``ft.dfs`` result (the feature matrix);
        the second element is discarded.
    """
    aggregations = [PercentTrue, Trend, TimeSinceLast, AvgTimeBetween, Mode, Count]
    # Same list minus PercentTrue for the where-conditioned features.
    conditioned = [Trend, TimeSinceLast, AvgTimeBetween, Mode, Count]

    dfs_options = dict(
        entityset=es,
        target_entity=target_entity,
        trans_primitives=[Hour],
        agg_primitives=aggregations,
        where_primitives=conditioned,
        cutoff_time=cutoff_time,
        training_window=training_window,
        max_features=-1,
        max_depth=3,
    )
    matrix, _unused = ft.dfs(**dfs_options)
    return matrix



def create_features(entity_sets, target_entity, cutoff_time, training_window):
    """Compute, merge and rename the feature matrices of all partitions.

    ``calc_feature_matrix`` is mapped over ``entity_sets``, the results are
    computed and concatenated, only columns listed in the module-level
    ``to_select`` (defined outside this cell) are kept, and every remaining
    column name gets the suffix ``_<target_entity>_<window>``.

    Args:
        entity_sets: Collection exposing ``.map(...)`` whose result exposes
            ``.compute()`` (the driver cell passes a dask bag of EntitySets).
        target_entity: Entity id the features are computed for.
        cutoff_time: Cutoff time forwarded to ``calc_feature_matrix``.
        training_window: Object with ``get_name()``; forwarded to
            ``calc_feature_matrix`` and used to derive the column suffix.

    Returns:
        The concatenated, filtered and renamed feature matrix.
    """
    # e.g. a name with spaces becomes a compact lowercase tag
    window_tag = training_window.get_name().replace(' ', '').lower()
    column_suffix = f"_{target_entity}_{window_tag}"

    pending = entity_sets.map(
        calc_feature_matrix,
        target_entity=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    partial_matrices = pending.compute()

    combined = pd.concat(partial_matrices)
    kept_columns = [name for name in combined.columns if name in to_select]
    combined = combined[kept_columns]
    combined.columns = [str(name) + column_suffix for name in combined.columns]

    # Drop the per-partition results before returning to release memory early.
    del partial_matrices, pending
    gc.collect()
    return combined

In [6]:
# Driver: for every target entity, build one EntitySet per partition file,
# synthesize features for each training window and write them to CSV.

# NOTE(review): every run of this cell registers a *new* global ProgressBar;
# the duplicated progress lines in the saved output look like the result of
# running it twice. Consider `with ProgressBar():` around the loop if no later
# cell relies on the global registration.
pbar = ProgressBar()
pbar.register()

feature_name_suffix = 'attributed'
target_entities = ['channel', 'app', 'device', 'os']
training_windows = ['1 day']
# Built through pandas rather than `datetime.datetime(...)`: the top-of-file
# `from datetime import datetime` binds the *class*, so the attribute form only
# works if the later star import happens to rebind `datetime` to the module.
# `.to_pydatetime()` yields a plain datetime(2017, 11, 8, 17, 0).
cutoff_time = pd.Timestamp('2017-11-08 17:00').to_pydatetime()
train_filename = 'features_2017-11-08_1700'

for target_entity in target_entities:
    print(f"Processing {target_entity}")
    features_dir = f"../data/interim/features/{target_entity}"
    # exist_ok avoids the check-then-create race and is safe on re-runs.
    os.makedirs(features_dir, exist_ok=True)

    filenames = glob(f"../data/interim/partitioned/{target_entity}/train_*.csv")
    partition_bag = bag.from_sequence(filenames)
    # Lazy: the EntitySets are only built when create_features calls compute().
    entity_sets = partition_bag.map(create_entityset, target_entity)

    for training_window in training_windows:
        tw_suffix = training_window.replace(' ', '').lower()
        feature_matrix = create_features(
            entity_sets,
            target_entity=target_entity,
            cutoff_time=cutoff_time,
            training_window=ft.Timedelta(training_window),
        )

        output_file = f"{features_dir}/{train_filename}_{tw_suffix}_{feature_name_suffix}.csv"
        print(f"Writing {output_file}")
        feature_matrix.to_csv(output_file)
        # Free the matrix before the next window/entity is processed.
        del feature_matrix
        gc.collect()

    del entity_sets
    gc.collect()

Processing channel
[########################################] | 100% Completed |  2min 10.7s
[########################################] | 100% Completed |  2min 10.8s
Writing ../data/interim/features/channel/features_2017-11-08_1700_1day_attributed.csv
Processing app
[########################################] | 100% Completed |  4min 25.7s
[########################################] | 100% Completed |  4min 25.7s
Writing ../data/interim/features/app/features_2017-11-08_1700_1day_attributed.csv
Processing device
[########################################] | 100% Completed | 24min 52.0s
[########################################] | 100% Completed | 24min 52.0s
Writing ../data/interim/features/device/features_2017-11-08_1700_1day_attributed.csv
Processing os
[########################################] | 100% Completed |  7min 47.7s
[########################################] | 100% Completed |  7min 47.7s
Writing ../data/interim/features/os/features_2017-11-08_1700_1day_attributed.csv
