In [1]:
import math
import time
import gc
import os

import numpy as np
import pandas as pd

from glob import glob
from itertools import combinations
from pathlib import Path

In [2]:
# Narrow unsigned-integer dtype for each known column name. `split` looks up
# these entries to cast the key columns it rebuilds from a feature-table index.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
    hour='uint8',
)

In [3]:
# Column-name prefixes of the feature columns to keep. `remove` hands this
# tuple to `str.startswith`, so each entry matches as a prefix: entries ending
# in '_' accept any suffix, the two '..._app_7days' entries are more specific.
feature_include = (
    'COUNT(clicks)_',
    'COUNT(clicks WHERE is_attributed = True)_',
    'PERCENT_TRUE(clicks.is_attributed)_',
    'AVG_TIME_BETWEEN(clicks.click_time)_',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_',

    # Per-hour aggregates.
    'SKEW(clicks.hour.COUNT(clicks))_',
    'SKEW(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'SKEW(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',
    'STD(clicks.hour.COUNT(clicks))_app_7days',
    'STD(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'STD(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',
    'MEDIAN(clicks.hour.COUNT(clicks))_app_7days',
    'MEDIAN(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'MEDIAN(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',
)

In [4]:
# Base key columns from which the grouping keys are built.
target_entities_init = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Keys that are generated below and then dropped again: the single 'device'
# key, three of the 'ip' pairs, and every triple that contains 'ip'.
excluded_entities = [
    'device',
    ['app', 'ip'],
    ['os', 'ip'],
    ['channel', 'ip'],
    ['app', 'os', 'ip'],
    ['app', 'device', 'ip'],
    ['app', 'channel', 'ip'],
    ['app', 'ip', 'hour'],
    ['device', 'os', 'ip'],
    ['device', 'channel', 'ip'],
    ['device', 'ip', 'hour'],
    ['os', 'channel', 'ip'],
    ['os', 'ip', 'hour'],
    ['channel', 'ip', 'hour'],
]

# Every 1-, 2- and 3-column combination, in itertools order. A single column
# is stored as a plain string, a multi-column key as a list of column names.
target_entities = []
for size in (1, 2, 3):
    for combo in combinations(target_entities_init, size):
        target_entities.append(combo[0] if size == 1 else list(combo))

# list.remove raises ValueError if an entry is missing, so a typo in the
# exclusion list fails loudly instead of being ignored.
for unwanted in excluded_entities:
    target_entities.remove(unwanted)

In [5]:
def remove(df, columns):
    """Keep only the columns matching the given name prefixes, cast to float32.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose column labels are strings.
        columns: A prefix string, or a list/tuple of prefix strings. A column
            is kept when its name starts with any of them.

    Returns:
        The same ``df`` object, reduced to the matching columns, each
        converted to ``np.float32``.
    """
    # str.startswith accepts a str or a tuple of str, but not a list.
    prefixes = tuple(columns) if type(columns) is list else columns

    unwanted = [name for name in df.columns if not name.startswith(prefixes)]
    if unwanted:
        df.drop(columns=unwanted, inplace=True)

    for name in df.columns:
        df[name] = df[name].astype(np.float32)
    return df

def split(df, target_entity):
    """Rebuild key columns from the index of a feature table, in place.

    Args:
        df: Feature DataFrame. For a single-column key the index holds the key
            values directly; for a multi-column key each index label is the
            key values joined with '_' (for example '12_3' for two keys).
        target_entity: A column name (str), or a list/tuple of column names in
            the same order as the tokens of each index label.

    Returns:
        The same ``df`` object with one extra column per key. For a str
        ``target_entity`` the column is a copy of the index as-is. For a
        sequence, each column is cast to the dtype registered for it in the
        module-level ``dtypes`` mapping.

    Raises:
        ValueError: If any index label does not split into exactly one token
            per key.
        KeyError: If a key has no entry in ``dtypes``.
    """
    if isinstance(target_entity, str):
        df[target_entity] = df.index
        return df

    keys = list(target_entity)
    if not keys:
        # Nothing to rebuild; mirrors the historical no-op for an empty key.
        return df

    # One list of string tokens per index label.
    tokens = df.index.str.split('_')
    if not (tokens.str.len() == len(keys)).all():
        raise ValueError(
            f"index labels do not split into {len(keys)} '_'-separated parts "
            f"for keys {keys}"
        )

    for position, key in enumerate(keys):
        # `.str.get(i)` replaces tuple-unpacking of the `.str` accessor: that
        # unpacking iterates the accessor, which pandas deprecated and current
        # releases reject with a TypeError. It also works for any key count.
        df[key] = tokens.str.get(position)
        df[key] = df[key].astype(dtypes[key])
    return df

def combine_features(df, features_prefix, feature_suffix):
    """Left-join every matching feature table onto ``df``.

    For each key in the module-level ``target_entities``, the feature files
    under ``../data/interim/features/<key>/`` whose name matches
    ``<features_prefix>*<feature_suffix>.hdf.compress`` are read in sorted
    order, reduced with ``remove`` to the ``feature_include`` columns, given
    their key columns back by ``split``, and merged onto ``df`` on that key.

    Args:
        df: DataFrame containing every key column named in ``target_entities``.
        features_prefix: Leading part of the feature file names.
        feature_suffix: Trailing part of the feature file names, before the
            ``.hdf.compress`` extension.

    Returns:
        The merged DataFrame. ``pd.merge`` returns a new object each time, so
        this is not the frame that was passed in.

    Raises:
        AssertionError: If a key has no matching feature file.
    """
    for entity in target_entities:
        # Multi-column keys live in a directory named after the joined columns.
        if type(entity) == str:
            entity_dir = entity
        else:
            entity_dir = "_".join(entity)

        pattern = f"../data/interim/features/{entity_dir}/{features_prefix}*{feature_suffix}.hdf.compress"
        paths = sorted(glob(pattern))
        assert len(paths) > 0

        for path in paths:
            feature_table = split(remove(pd.read_hdf(path), feature_include), entity)
            df = df.merge(feature_table, how='left', left_on=entity, right_on=entity)
            # Release each feature table as soon as it has been merged.
            del feature_table
            gc.collect()
    return df

In [None]:
# Build the cached train / validation frames: raw clicks joined with the
# pre-computed feature tables.
force = True  # rebuild both caches even when the cache files already exist
cache_train = '../data/cache/train_vw_2017-11-08.hdf.compress'
cache_val = '../data/cache/validate_vw_2017-11-09.hdf.compress'
feature_suffix = 'attributed6'
train_feature_prefix = 'features_2017-11-07_1700'
valid_feature_prefix = 'features_2017-11-08_1700'

if not Path(cache_train).exists() or force:
    # Writes below use append mode, so a stale file must be deleted first or
    # its rows would be kept and the new ones added after them.
    if Path(cache_train).exists():
        print("Train cache exists, rebuilding (force=True)")
        os.remove(cache_train)
    else:
        print("Train cache doesn't exist, creating")

    train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
    # Summary rows for day 7 hour 16 through day 8 hour 16, both included.
    start_row = train_summary[(train_summary['day'] == 7) & (train_summary['hour'] == 16)].index[0]
    stop_row = train_summary[(train_summary['day'] == 8) & (train_summary['hour'] == 16)].index[0]
    # One summary row at a time: read its slice of the raw clicks, join the
    # features, and append the result to the cache.
    for i in range(start_row, stop_row + 1):
        row = train_summary.iloc[i].to_dict()
        # int(): a mixed-dtype summary row can come back as floats, while
        # start/stop are row offsets.
        df_train = pd.read_hdf('../data/raw/train.hdf.compress', start=int(row['start']), stop=int(row['end']))
        df_train['hour'] = df_train['click_time'].dt.hour
        df_train = combine_features(df_train, train_feature_prefix, feature_suffix)
        df_train.to_hdf(cache_train, 'train', format='table', mode='a', append=True, complib='blosc', fletcher32=True, complevel=9)
    gc.collect()

if not Path(cache_val).exists() or force:
    # Same reasoning as for the train cache: without this removal a forced
    # re-run would append a second copy of the validation rows.
    if Path(cache_val).exists():
        print("Validation cache exists, rebuilding (force=True)")
        os.remove(cache_val)
    else:
        print("Validation cache doesn't exist, creating")

    train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
    # Here start_row / stop_row are raw-data row offsets (not summary indices):
    # from the start of day 9 hour 4 to the end of day 9 hour 6.
    start_row = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 4)]['start'].values[0]
    stop_row = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 6)]['end'].values[0]
    df_val = pd.read_hdf('../data/raw/train.hdf.compress', start=int(start_row), stop=int(stop_row))
    df_val['hour'] = df_val['click_time'].dt.hour
    df_val = combine_features(df_val, valid_feature_prefix, feature_suffix)
    df_val.to_hdf(cache_val, 'train', format='table', mode='a', append=True, complib='blosc', fletcher32=True, complevel=9)

gc.collect()

Train cache doesn't exist, creating
