In [1]:
import os
import gc

import numpy as np
import pandas as pd
import featuretools as ft

from glob import glob

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *

In [2]:
# Compact integer dtypes handed to pd.read_csv for each column.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Columns loaded from each CSV file ('ip' has a dtype above but is not read).
to_read = [
    'app',
    'device',
    'os',
    'channel',
    'click_time',
    'is_attributed',
]

# Columns parsed as datetimes on load.
to_parse = ['click_time']

In [3]:
# Create a dask ProgressBar and register it; the `.compute()` calls in the
# cells below print a progress bar in their output.
pbar = ProgressBar()
pbar.register()

In [4]:
def createEntitySet(filename):
    """Build a featuretools EntitySet from one CSV file of clicks.

    The file is read with the module-level `to_read`, `dtypes` and `to_parse`
    settings, and a sequential `id` column is added to serve as the index of
    the `clicks` entity. A `channels` entity is then normalized out of
    `clicks` on the `channel` column.

    Args:
        filename: Path of the CSV file to load.

    Returns:
        The EntitySet holding the `clicks` and `channels` entities, with last
        time indexes added.
    """
    clicks = pd.read_csv(
        filename,
        usecols=to_read,
        dtype=dtypes,
        parse_dates=to_parse,
    )
    # Row position doubles as the unique index of the `clicks` entity.
    clicks['id'] = range(len(clicks))

    column_types = {
        column: ft.variable_types.Categorical
        for column in ('app', 'device', 'os', 'channel')
    }
    column_types['is_attributed'] = ft.variable_types.Boolean

    entity_set = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=column_types,
    )
    entity_set = entity_set.normalize_entity(
        base_entity_id='clicks',
        new_entity_id='channels',
        index='channel',
        make_time_index=False,
    )
    entity_set.add_last_time_indexes()
    return entity_set


def calc_feature_matrix(es, entity_id, cutoff_time, training_window):
    """Run `ft.dfs` on an EntitySet and return only the feature matrix.

    Args:
        es: EntitySet to compute features on.
        entity_id: Id of the target entity inside `es`.
        cutoff_time: Cutoff time forwarded to `ft.dfs`.
        training_window: Training window forwarded to `ft.dfs`.

    Returns:
        The first element of the pair returned by `ft.dfs` (the feature
        matrix); the second element is discarded.
    """
    dfs_options = dict(
        entityset=es,
        target_entity=entity_id,
        cutoff_time=cutoff_time,
        training_window=training_window,
        max_depth=2,
    )
    matrix, _unused_feature_defs = ft.dfs(**dfs_options)
    return matrix

def createFeatures(filename, entity_sets):
    """Compute `channels` features for one training file and write them to CSV.

    The earliest `click_time` in `filename` is used as the cutoff time and a
    one-hour training window is applied. `calc_feature_matrix` is mapped over
    every EntitySet in `entity_sets` through a dask bag, the resulting feature
    matrices are concatenated, and the result is written next to the input
    file as `<name>_channel_features<ext>`.

    Args:
        filename: Path of the training CSV; only its `click_time` column is
            read.
        entity_sets: Sequence of EntitySets, each containing a `channels`
            entity.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    # Only the minimum timestamp is needed. The frame is deliberately not bound
    # to a local name, so it does not stay referenced while the feature
    # computation below runs.
    cutoff_time = pd.read_csv(
        filename, usecols=['click_time'], parse_dates=to_parse
    )['click_time'].min()
    training_window = ft.Timedelta("1 hours")

    b = bag.from_sequence(entity_sets)
    feature_matrices = b.map(
        calc_feature_matrix,
        entity_id='channels',
        cutoff_time=cutoff_time,
        training_window=training_window,
    )
    out = feature_matrices.compute()
    feature_matrix = pd.concat(out)

    name, ext = os.path.splitext(filename)
    output_file = f"{name}_channel_features{ext}"
    feature_matrix.to_csv(output_file)
    # No explicit `del` of locals: they go out of scope when the function
    # returns, and the caller runs gc.collect() after each file.

In [5]:
input_path = '../data/interim/partitioned/channel'
# glob() returns matches in arbitrary order. Sort them so the list of files,
# and with it the order of `entity_sets`, is the same on every run (the
# training-file cell already uses sorted(glob(...))).
filenames = sorted(glob(f"{input_path}/train_*.csv"))
# Build one EntitySet per file via a dask bag.
b = bag.from_sequence(filenames)
entity_sets = b.map(createEntitySet).compute()
# As the cell's last expression, gc.collect()'s return value is shown as the
# cell output.
gc.collect()

[########################################] | 100% Completed |  3min  2.0s


8317

In [None]:
# Training files for 2017-11-08 whose names end in "00.csv", in sorted order.
filenames_train = sorted(glob('../data/interim/train_2017-11-08_*00.csv'))
for filename in filenames_train:
    print(f"Processing: {filename}")
    # Writes "<name>_channel_features.csv" next to each input file.
    createFeatures(filename, entity_sets)
    # Run a garbage collection pass after each file before starting the next.
    gc.collect()

Processing: ../data/interim/train_2017-11-08_0000.csv
[########################################] | 100% Completed |  1min 55.4s
Processing: ../data/interim/train_2017-11-08_0100.csv
[########################################] | 100% Completed |  1min 58.1s
Processing: ../data/interim/train_2017-11-08_0200.csv
[########################################] | 100% Completed |  1min 56.6s
Processing: ../data/interim/train_2017-11-08_0300.csv
[########################################] | 100% Completed |  2min 12.0s
Processing: ../data/interim/train_2017-11-08_0400.csv
[########################################] | 100% Completed |  2min  1.0s
Processing: ../data/interim/train_2017-11-08_0500.csv
[########################################] | 100% Completed |  2min  0.5s
Processing: ../data/interim/train_2017-11-08_0600.csv
[########################################] | 100% Completed |  2min  5.3s
Processing: ../data/interim/train_2017-11-08_0700.csv
[########################################] | 100% 