In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from glob import glob

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [2]:
# Compact integer dtypes handed to pd.read_csv. 'ip' has an entry here even
# though it is not among the columns listed in `to_read` below.
dtypes = {
    'ip': 'uint32',
    **dict.fromkeys(('app', 'device', 'os', 'channel'), 'uint16'),
    'is_attributed': 'uint8',
}

# Columns loaded from each click CSV.
to_read = [
    'app',
    'device',
    'os',
    'channel',
    'click_time',
    'is_attributed',
]

# Columns parsed as datetimes while reading.
to_parse = ['click_time']

In [3]:
# Register a dask ProgressBar globally so that the dask `.compute()` calls in
# the cells below print a progress line (see the captured output of the last cell).
pbar = ProgressBar()
pbar.register()

In [4]:
def createEntitySet(filename):
    """Load one click CSV and wrap it in a featuretools EntitySet.

    The file is read with the notebook-level `to_read`, `dtypes` and
    `to_parse` settings, and a sequential row number is stored in a new
    'id' column that serves as the index of the 'clicks' entity.
    'app', 'device', 'os' and 'channel' are then each normalized into an
    entity of their own (without a time index), and last-time indexes are
    added before the entity set is returned.

    Args:
        filename: Path of the CSV file to load.

    Returns:
        The populated ``ft.EntitySet`` with id 'clicks'.
    """
    clicks = pd.read_csv(filename, usecols=to_read, dtype=dtypes, parse_dates=to_parse)
    # The CSV has no usable key column among those read, so number the rows.
    clicks['id'] = range(len(clicks))

    categorical = ft.variable_types.Categorical
    column_types = {
        'app': categorical,
        'device': categorical,
        'os': categorical,
        'channel': categorical,
        'is_attributed': ft.variable_types.Boolean,
    }

    entity_set = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=column_types,
    )

    # One child entity per categorical column, created in this fixed order.
    for column in ('app', 'device', 'os', 'channel'):
        entity_set = entity_set.normalize_entity(
            base_entity_id='clicks',
            new_entity_id=column,
            index=column,
            make_time_index=False,
        )

    entity_set.add_last_time_indexes()
    return entity_set


def calc_feature_matrix(es, entity_id, cutoff_time, training_window):
    """Run deep feature synthesis on one entity set and return the matrix.

    Args:
        es: Entity set to synthesize features from.
        entity_id: Id of the entity to use as the DFS target.
        cutoff_time: Cutoff time forwarded to ``ft.dfs``.
        training_window: Training window forwarded to ``ft.dfs``.

    Returns:
        The feature matrix (first element of the pair returned by
        ``ft.dfs``); the second element is discarded.
    """
    dfs_options = {
        'entityset': es,
        'target_entity': entity_id,
        'cutoff_time': cutoff_time,
        'training_window': training_window,
        'max_depth': 3,
    }
    matrix, _unused_feature_defs = ft.dfs(**dfs_options)
    return matrix

def createFeatures(filename, entity_sets, target_entity, cutoff_time, training_window):
    """Compute DFS features over all entity sets and write them to a CSV file.

    Every entity set in ``entity_sets`` is passed to ``calc_feature_matrix``
    through a dask bag. The resulting matrices are concatenated, each column
    name gets a ``_{target_entity}_{window}`` suffix, and the combined matrix
    is written to a ``features`` directory located next to ``filename``.

    Args:
        filename: Path of the source CSV; its directory and base name
            determine where the feature file is written.
        entity_sets: Sequence of entity sets to process.
        target_entity: Id of the entity used as the DFS target; also part
            of the column suffix and the output file name.
        cutoff_time: Cutoff time forwarded to ``calc_feature_matrix``.
        training_window: Object exposing ``get_name()`` (the caller passes an
            ``ft.Timedelta``); forwarded to ``calc_feature_matrix``.

    Returns:
        None. The result is the CSV file written to disk.
    """
    # Window name with spaces removed and lower-cased, used in column and file names.
    tw_suffix = training_window.get_name().replace(' ', '').lower()

    feature_matrices = bag.from_sequence(entity_sets).map(
        calc_feature_matrix,
        entity_id=target_entity,
        cutoff_time=cutoff_time,
        training_window=training_window,
    ).compute()
    feature_matrix = pd.concat(feature_matrices)
    feature_matrix.columns = [
        str(col) + f"_{target_entity}_{tw_suffix}" for col in feature_matrix.columns
    ]

    name, ext = os.path.splitext(os.path.basename(filename))
    # Join path components instead of concatenating strings: concatenation
    # yielded '/features/...' (the filesystem root) whenever `filename` had no
    # directory part.
    output_dir = os.path.join(os.path.dirname(filename), 'features')
    # Make sure the target directory exists so the computed features are not
    # lost to a failing write.
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(
        output_dir, f"{name}_{target_entity}_{tw_suffix}_features{ext}"
    )
    feature_matrix.to_csv(output_file)

In [5]:
# Entities to build features for. The full list is kept on the commented-out
# line below; only 'channel' is active in this run.
# target_entities = ['app', 'device', 'os', 'channel']
target_entities = ['channel']
# Train file(s) that provide the cutoff time and the output file name.
filenames_train = sorted(glob('../data/interim/train_2017-11-08_1000.csv'))
# Training windows; each string is converted with ft.Timedelta below.
training_windows = ['1 hours']

for target_entity in target_entities:
    # Build one EntitySet per partition file of this entity, in parallel via dask.
    # NOTE(review): unlike filenames_train this glob result is not sorted, so the
    # order of the partitions (and of the concatenated rows) follows whatever
    # order glob returns - confirm whether that matters downstream.
    filenames = glob(f"../data/interim/partitioned/{target_entity}/train_*.csv")
    b = bag.from_sequence(filenames)
    entity_sets = b.map(createEntitySet).compute()
    gc.collect()
    
    for filename in filenames_train:
        print(f"Processing: {filename}")
        # Only the timestamps are needed here: the earliest click in the train
        # file is used as the cutoff time.
        df = pd.read_csv(filename, usecols=['click_time'], parse_dates=to_parse)
        cutoff_time = df['click_time'].min()
        del df
        for training_window in training_windows:
            createFeatures(filename, entity_sets, target_entity=target_entity, cutoff_time=cutoff_time, training_window=ft.Timedelta(training_window))
            gc.collect()
    
    # Drop this entity's entity sets before moving on to the next entity.
    del entity_sets
    del b
    gc.collect()

[########################################] | 100% Completed |  2min 26.3s
Processing: ../data/interim/train_2017-11-08_1000.csv
[########################################] | 100% Completed |  2min 55.6s
