In [1]:
import os

import numpy as np
import pandas as pd
import featuretools as ft
from featuretools.primitives import *

In [2]:
# File locations: the frame that features are computed from, the frame the
# features are joined onto, and where the enriched result is written.
input_feature_file = '../data/interim/train_2017-11-08_0300.csv'
input_train_file = '../data/interim/train_2017-11-08_0400.csv'
output_train_file = '../data/interim/train_2017-11-08_0400_percent.csv'

# Explicit unsigned dtypes for the integer columns passed to read_csv.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)
to_parse = ['click_time']
# Columns to load: every typed column followed by the timestamp column.
to_read = [*dtypes, *to_parse]

# Load the feature-source frame and attach a sequential row number as `id`.
df_train_feature = pd.read_csv(
    input_feature_file,
    usecols=to_read,
    dtype=dtypes,
    parse_dates=to_parse,
).assign(id=lambda frame: range(len(frame)))

In [3]:
# Variable typing for the `clicks` entity: the five key columns are
# categorical, the target flag is boolean.
categorical_columns = ['ip', 'app', 'device', 'os', 'channel']
click_variable_types = {
    column: ft.variable_types.Categorical for column in categorical_columns
}
click_variable_types['is_attributed'] = ft.variable_types.Boolean

# Register the feature-source frame as the `clicks` entity, indexed by the
# row id and time-indexed by the click timestamp.
es = ft.EntitySet(id='clicks').entity_from_dataframe(
    entity_id='clicks',
    dataframe=df_train_feature,
    index='id',
    time_index='click_time',
    variable_types=click_variable_types,
)

# Split each key column out of `clicks` into its own entity, in this order.
for new_entity_id, key_column in [
    ('ips', 'ip'),
    ('apps', 'app'),
    ('devices', 'device'),
    ('oses', 'os'),
    ('channels', 'channel'),
]:
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id=new_entity_id,
        index=key_column,
    )

In [4]:
# Load the frame that will receive the features. No `parse_dates` is passed
# here, so `click_time` is kept exactly as pandas reads it.
df_train = pd.read_csv(
    input_train_file,
    dtype=dtypes,
    usecols=to_read,
)

In [5]:
def add_percenttrue(df, entity_id, index, entityset=None):
    """Join ``PercentTrue`` aggregate features for one entity onto ``df``.

    Runs ``ft.dfs`` with ``target_entity=entity_id`` using only the
    ``PercentTrue`` aggregation primitive (no transform primitives,
    ``max_depth=1``), appends ``_<entity_id>`` to every resulting column
    name, and joins the feature matrix onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich; it is not modified.
    entity_id : str
        Id of the target entity in the entity set (also used as the column
        suffix).
    index : str
        Column of ``df`` whose values are matched against the feature
        matrix index.
    entityset : optional
        Entity set to synthesize features from. When omitted (``None``),
        the notebook-level ``es`` is used, which preserves the original
        three-argument call style.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.join(feature_matrix, on=index)``. This uses
        pandas' default join type, so rows of ``df`` whose key has no entry
        in the feature matrix get missing values in the new columns.
    """
    if entityset is None:
        # Fall back to the global entity set built earlier in the notebook.
        entityset = es

    # Only the feature matrix is needed; the feature definitions are unused.
    feature_matrix, _ = ft.dfs(
        entityset=entityset,
        target_entity=entity_id,
        agg_primitives=[PercentTrue],
        trans_primitives=[],
        max_depth=1)

    # Tag columns with the entity they came from so that features from
    # different entities do not collide when joined onto the same frame.
    feature_matrix = feature_matrix.add_suffix(f"_{entity_id}")

    return df.join(feature_matrix, on=index)

In [6]:
# Add the PercentTrue feature for each entity in turn; every pass joins one
# more suffixed column onto the training frame via its key column.
for target_entity_id, join_column in (
    ('ips', 'ip'),
    ('apps', 'app'),
    ('devices', 'device'),
    ('oses', 'os'),
    ('channels', 'channel'),
):
    df_train = add_percenttrue(
        df_train,
        entity_id=target_entity_id,
        index=join_column,
    )

In [7]:
# Write the enriched training frame to disk, leaving out the DataFrame index.
df_train.to_csv(path_or_buf=output_train_file, index=False)