In [1]:
import os

import numpy as np
import pandas as pd
import featuretools as ft

from glob import glob

from dask import bag
from dask.diagnostics import ProgressBar
from featuretools.primitives import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [2]:
# Input / output locations used throughout the notebook.
input_path = '../data/interim/partitioned/channel'
input_train_file = '../data/interim/train_2017-11-08_0400.csv'
output_train_file = '../data/interim/train_2017-11-08_0400_percent.csv'

# Explicit unsigned integer dtypes handed to pd.read_csv for each column.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Columns parsed as datetimes, and the full list of columns to load
# (note that 'ip' has a dtype entry above but is not loaded).
to_parse = ['click_time']
to_read = ['app', 'device', 'os', 'channel', 'is_attributed'] + to_parse

In [3]:
# Register a dask ProgressBar so that later dask computations in this
# notebook report their progress (see the bar printed under the compute cell).
pbar = ProgressBar()
pbar.register()

In [4]:
# Partition files matching train_*.csv under input_path. glob returns matches
# in arbitrary (filesystem-dependent) order, so sort them to keep the
# partition order -- and therefore which partition ends up first in the
# computed results -- stable across machines and runs.
filenames = sorted(glob(f"{input_path}/train_*.csv"))

In [5]:
def createEntitySet(filename):
    """Build a featuretools EntitySet from a single CSV partition.

    The file is read with the notebook-level ``to_read``, ``dtypes`` and
    ``to_parse`` settings. A running row number is added as the ``id``
    column and used as the index of the ``clicks`` entity, whose time index
    is ``click_time``. A ``channels`` entity keyed on ``channel`` is then
    normalized out of ``clicks``.

    Args:
        filename: Path of the CSV file to load.

    Returns:
        The EntitySet (id ``'clicks'``) after ``add_last_time_indexes()``
        has been called on it.
    """
    clicks = pd.read_csv(
        filename,
        usecols=to_read,
        dtype=dtypes,
        parse_dates=to_parse,
    )
    # Synthetic per-file row id: the raw data is loaded without a unique key.
    clicks['id'] = range(len(clicks))

    categorical = ft.variable_types.Categorical
    click_types = {
        'app': categorical,
        'device': categorical,
        'os': categorical,
        'channel': categorical,
        'is_attributed': ft.variable_types.Boolean,
    }

    es = ft.EntitySet(id='clicks').entity_from_dataframe(
        entity_id='clicks',
        dataframe=clicks,
        index='id',
        time_index='click_time',
        variable_types=click_types,
    )

    # Only 'channel' is normalized into its own entity here; 'app', 'device'
    # and 'os' could be split out the same way but are currently left inline.
    es = es.normalize_entity(
        base_entity_id='clicks',
        new_entity_id='channels',
        index='channel',
        make_time_index=False,
    )
    es.add_last_time_indexes()
    return es

In [6]:
# Wrap the partition file names in a dask bag and map createEntitySet over
# it; nothing is computed here -- the work runs when .compute() is called.
b = bag.from_sequence(filenames)
entity_sets = b.map(createEntitySet)

In [8]:
def calc_feature_matrix(es, entity_id, cutoff_time):
    """Run featuretools Deep Feature Synthesis for one target entity.

    Args:
        es: EntitySet to synthesize features from.
        entity_id: Id of the entity used as ``target_entity``.
        cutoff_time: Cutoff time forwarded to ``ft.dfs``.

    Returns:
        Tuple ``(feature_matrix, feature_defs)`` as returned by ``ft.dfs``.
    """
    # No agg/trans primitives are passed, so ft.dfs uses its defaults.
    dfs_options = dict(
        entityset=es,
        target_entity=entity_id,
        cutoff_time=cutoff_time,
        training_window=ft.Timedelta("3 hours"),
        max_depth=1,
    )
    matrix, definitions = ft.dfs(**dfs_options)
    return matrix, definitions

In [9]:
# Load the training split with the same column subset, dtypes and datetime
# parsing that createEntitySet uses for the partition files.
df_train = pd.read_csv(input_train_file, usecols=to_read, dtype=dtypes, parse_dates=to_parse)

In [10]:
# Use the earliest click in the training split as the DFS cutoff time
# (presumably so features only see clicks preceding the training data --
# TODO confirm against featuretools' cutoff_time semantics).
cutoff_time = df_train['click_time'].min()
# Lazily map calc_feature_matrix over every per-file EntitySet, targeting the
# 'channels' entity; each element will be a (feature_matrix, feature_defs) pair.
feature_matrices = entity_sets.map(calc_feature_matrix, entity_id='channels', cutoff_time=cutoff_time)

In [11]:
# Execute the dask graph. Each element of `out` is the
# (feature_matrix, feature_defs) pair returned by calc_feature_matrix for one
# partition file.
out = feature_matrices.compute()

# Feature definitions are taken from the first partition
# (presumably identical across partitions -- TODO confirm).
feature_defs = out[0][1]

# Collect the per-partition matrices under a new name instead of rebinding
# `feature_matrices`: that name must keep pointing at the lazy dask bag,
# otherwise re-running this cell fails because a list has no .compute().
feature_matrix_parts = [matrix for matrix, _defs in out]
feature_matrix = pd.concat(feature_matrix_parts)

[########################################] | 100% Completed |  3min  5.8s


In [12]:
# Attach the per-channel features to every training click (feature_matrix is
# looked up by the click's 'channel' value), discard the timestamp and
# replace missing values with 0.
X = (
    df_train
    .join(feature_matrix, on='channel')
    .drop(columns=['click_time'])
    .fillna(0)
)
# Split the target off; pop() also removes it from the feature frame.
y = X.pop('is_attributed')

In [18]:
# `maxn_jobs` is not a RandomForestClassifier parameter and raises TypeError;
# the intended keyword is `n_jobs` (-1 = use all available cores).
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1)
# 3-fold cross-validation scored by ROC AUC.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3, scoring="roc_auc", verbose=True)

# Last expression of the cell: mean and standard deviation of the fold scores.
"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  9.8min finished


'AUC 0.95 +/- 0.00'

In [15]:
# Fit the classifier on the full training frame so that its
# feature_importances_ can be inspected below.
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
def feature_importances(model, features, n=10):
    """Print and return the ``n`` most important features of a fitted model.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        features: Iterable of feature names, paired positionally with
            ``model.feature_importances_``.
        n: Maximum number of features to report.

    Returns:
        List of the top feature names, highest importance first. Features
        with equal importance keep their original relative order.
    """
    ranked = sorted(
        zip(features, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:n]

    for rank, (name, weight) in enumerate(top, start=1):
        print("%d: Feature: %s, %.3f" % (rank, name, weight))

    return [name for name, _weight in top]

# X is passed as `features`; the printed output below shows that iterating it
# yields the column names, which are paired with the model's importances.
top_features = feature_importances(clf, X, n=20)

1: Feature: app, 0.274
2: Feature: PERCENT_TRUE(clicks.is_attributed), 0.224
3: Feature: os, 0.131
4: Feature: NUM_UNIQUE(clicks.app), 0.082
5: Feature: NUM_UNIQUE(clicks.device), 0.074
6: Feature: NUM_UNIQUE(clicks.os), 0.055
7: Feature: device, 0.044
8: Feature: COUNT(clicks), 0.042
9: Feature: channel, 0.028
10: Feature: MODE(clicks.app), 0.019
11: Feature: MODE(clicks.os), 0.017
12: Feature: MODE(clicks.device), 0.010
