In [1]:
import math
import time
import gc
import os

import numpy as np
import pandas as pd

from glob import glob
from itertools import combinations
from pathlib import Path

In [2]:
# Compact integer dtype per key column; split() looks entries up here when it
# casts the key parts recovered from a feature frame's index.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
    hour='uint8',
)

In [3]:
# Column-name prefixes of the generated features that survive remove().
# Kept as a tuple because it is handed straight to str.startswith().
# Entries that are commented out are currently disabled.
feature_include = (
    # 'COUNT(clicks)_',
    # 'COUNT(clicks WHERE is_attributed = True)_',
    'PERCENT_TRUE(clicks.is_attributed)_',
    'AVG_TIME_BETWEEN(clicks.click_time)_',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_',

    # Per-hour aggregates: skew
    # 'SKEW(clicks.hour.COUNT(clicks))_',
    'SKEW(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'SKEW(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',

    # Per-hour aggregates: standard deviation
    # 'STD(clicks.hour.COUNT(clicks))_app_7days',
    'STD(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'STD(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',

    # Per-hour aggregates: median
    # 'MEDIAN(clicks.hour.COUNT(clicks))_app_7days',
    'MEDIAN(clicks.hour.PERCENT_TRUE(clicks.is_attributed))_',
    'MEDIAN(clicks.hour.AVG_TIME_BETWEEN(clicks.click_time))_',
)

In [4]:
target_entities_init = ['app', 'device', 'os', 'channel', 'ip', 'hour']

# Grouping keys that are generated below but deliberately left out of the run.
_excluded_entities = [
    'device',
    ['app', 'ip'],
    ['os', 'ip'],
    ['channel', 'ip'],
    ['app', 'os', 'ip'],
    ['app', 'device', 'ip'],
    ['app', 'channel', 'ip'],
    ['app', 'ip', 'hour'],
    ['device', 'os', 'ip'],
    ['device', 'channel', 'ip'],
    ['device', 'ip', 'hour'],
    ['os', 'channel', 'ip'],
    ['os', 'ip', 'hour'],
    ['channel', 'ip', 'hour'],
]

# Single columns are stored as bare names; pairs and triples as lists of
# names. Combinations of four columns are not generated.
_candidate_entities = [name for (name,) in combinations(target_entities_init, 1)]
for _size in (2, 3):
    _candidate_entities.extend(
        list(combo) for combo in combinations(target_entities_init, _size)
    )

# Filtering keeps the generation order: singles, then pairs, then triples.
target_entities = [
    entity for entity in _candidate_entities if entity not in _excluded_entities
]

# target_entities = ['app']

In [5]:
def remove(df, columns):
    """Keep only the columns whose name starts with one of ``columns``.

    The frame is modified in place: non-matching columns are dropped and
    every remaining column is cast to ``float32``.

    Args:
        df: DataFrame to prune.
        columns: A prefix string, or a list/tuple of prefix strings. A list
            is converted to a tuple because ``str.startswith`` does not
            accept lists.

    Returns:
        The same ``df`` object, after pruning and casting.
    """
    prefixes = tuple(columns) if type(columns) is list else columns

    unwanted = [name for name in df.columns if not name.startswith(prefixes)]
    if unwanted:
        df.drop(columns=unwanted, inplace=True)

    # Cast column by column so the caller's frame object is updated in place.
    for name in df.columns:
        df[name] = df[name].astype(np.float32)
    return df

def split(df, target_entity, dtype_map=None):
    """Materialise the grouping key(s) held in ``df.index`` as columns.

    For a single key (``target_entity`` is a string) the index is copied
    unchanged into a column of that name. For a composite key
    (``target_entity`` is a sequence of names) each index label is split on
    ``'_'`` and the parts are assigned, in order, to the named columns and
    cast to the dtype registered for each name.

    The frame is modified in place.

    Args:
        df: Feature frame whose index carries the key(s).
        target_entity: Column name, or sequence of column names.
        dtype_map: Optional ``{column name: dtype}`` mapping used for the
            composite-key casts. Defaults to the module-level ``dtypes``.

    Returns:
        The same ``df`` object with the key column(s) added.

    Raises:
        ValueError: If the index labels do not split into exactly
            ``len(target_entity)`` parts, or a part is not numeric.
    """
    if isinstance(target_entity, str):
        df[target_entity] = df.index
        return df

    if dtype_map is None:
        dtype_map = dtypes
    names = list(target_entity)

    if len(df) == 0:
        # Nothing to split; still add typed key columns for a later merge.
        for name in names:
            df[name] = np.empty(0, dtype=dtype_map[name])
        return df

    # expand=True yields one column per part. This replaces tuple-unpacking
    # the `.str` accessor, which current pandas no longer allows.
    pieces = pd.Series(df.index).str.split('_', expand=True)
    if pieces.shape[1] != len(names):
        raise ValueError(
            f"index labels split into {pieces.shape[1]} part(s) but "
            f"{len(names)} key column(s) were requested: {names}"
        )

    for position, name in enumerate(names):
        # to_numpy() drops the helper Series' RangeIndex so the assignment is
        # positional rather than aligned against df's string index.
        df[name] = pd.to_numeric(pieces[position]).astype(dtype_map[name]).to_numpy()
    return df

def combine_features(df, features_prefix, feature_suffix):
    """Left-join every matching feature file onto ``df``.

    For each entry of the module-level ``target_entities``, feature files are
    looked up under ``../data/interim/features/<entity>/`` (composite keys
    joined with ``'_'``), restricted by the given file-name prefix and suffix.
    Each file is pruned to the ``feature_include`` columns, given explicit
    key columns, and merged onto ``df`` on those keys.

    Args:
        df: Frame to enrich; must contain the key columns.
        features_prefix: Leading part of the feature file name.
        feature_suffix: Trailing part of the feature file name, placed just
            before ``.hdf.compress``.

    Returns:
        The merged frame. Each merge produces a new object, so the result is
        not the ``df`` that was passed in.

    Raises:
        AssertionError: If an entity has no matching feature file.
    """
    for entity in target_entities:
        entity_dir = entity if type(entity) is str else "_".join(entity)
        pattern = f"../data/interim/features/{entity_dir}/{features_prefix}*{feature_suffix}.hdf.compress"
        matches = sorted(glob(pattern))
        assert len(matches) > 0

        for path in matches:
            features = pd.read_hdf(path)
            features = split(remove(features, feature_include), entity)
            df = pd.merge(df, features, how='left', left_on=entity, right_on=entity)
            # Release each feature frame right away to keep peak memory down.
            del features
            gc.collect()
    return df

In [6]:
def merge(input_file, output_file, start_day, start_hour, end_day, end_hour, feature_prefix, feature_suffix):
    """Build a feature-enriched HDF cache for a day/hour window of clicks.

    The window is resolved through ``../data/interim/day_hour_train.csv``,
    which is read for its ``day``, ``hour``, ``start`` and ``end`` columns.
    For every summary row from the start (day, hour) to the end (day, hour),
    inclusive, the ``start``/``end`` slice of ``input_file`` is loaded, given
    an ``hour`` column derived from ``click_time``, joined with the feature
    files via ``combine_features`` and appended to ``output_file`` under the
    key ``'train'``.

    Args:
        input_file: HDF file holding the raw clicks.
        output_file: HDF file to create; an existing file is deleted first.
        start_day, start_hour: First summary row to process.
        end_day, end_hour: Last summary row to process (inclusive).
        feature_prefix, feature_suffix: Forwarded to ``combine_features``.

    Raises:
        IndexError: If either (day, hour) pair is absent from the summary.
    """
    print(f"Creating output file {output_file}")
    # Chunks are appended below, so a stale file has to go first.
    if Path(output_file).exists():
        os.remove(output_file)

    train_summary = pd.read_csv('../data/interim/day_hour_train.csv')

    def _first_position(day, hour):
        # Positional (not label) lookup, because the loop below uses iloc.
        mask = (train_summary['day'] == day) & (train_summary['hour'] == hour)
        hits = np.flatnonzero(mask.to_numpy())
        if len(hits) == 0:
            raise IndexError(f"no summary row for day={day}, hour={hour}")
        return int(hits[0])

    start_row = _first_position(start_day, start_hour)
    stop_row = _first_position(end_day, end_hour)

    for position in range(start_row, stop_row + 1):
        bounds = train_summary.iloc[position]
        # int(): a row Series can be upcast to float when columns mix dtypes.
        df_train = pd.read_hdf(input_file, start=int(bounds['start']), stop=int(bounds['end']))
        df_train['hour'] = df_train['click_time'].dt.hour
        df_train = combine_features(df_train, feature_prefix, feature_suffix)
        # `key` is passed by keyword: positional use is deprecated in pandas.
        df_train.to_hdf(output_file, key='train', format='table', mode='a', append=True,
                        complib='blosc', fletcher32=True, complevel=9)
    gc.collect()

In [7]:
# When True the caches are rebuilt even if the files already exist.
force = True
cache_train = '../data/cache/train_vw_2017-11-08.hdf.compress'
cache_val = '../data/cache/validate_vw_2017-11-09.hdf.compress'

# (cache file, (start_day, start_hour, end_day, end_hour), feature file prefix)
_cache_jobs = [
    (cache_train, (7, 16, 8, 16), 'features_2017-11-07_1700'),
    (cache_val, (9, 4, 9, 6), 'features_2017-11-08_1700'),
]

for _cache_path, _window, _prefix in _cache_jobs:
    if force or not Path(_cache_path).exists():
        merge('../data/raw/train.hdf.compress', _cache_path, *_window, _prefix, 'attributed')

gc.collect()

Creating output file ../data/cache/train_vw_2017-11-08.hdf.compress
Creating output file ../data/cache/validate_vw_2017-11-09.hdf.compress


48

In [15]:
# Columns that never become features (the label is read separately).
_VW_IGNORED = ('is_attributed', 'attributed_time', 'click_time')
# Raw key columns, always written as bare tokens.
_VW_CATEGORICAL = ('ip', 'app', 'device', 'os', 'channel', 'hour')
# Generated-feature name prefixes written as bare tokens.
_VW_CATEGORICAL_PREFIXES = ('MODE', 'DAY', 'YEAR', 'MONTH', 'WEEKDAY', 'HOUR', 'MINUTE', 'SECOND')
# Generated-feature name prefixes written as "name:value" pairs.
_VW_NUMERICAL_PREFIXES = ('SUM', 'STD', 'MAX', 'SKEW', 'MIN', 'MEAN', 'MEDIAN', 'COUNT', 'NUM_UNIQUE', 'PERCENT_TRUE', 'AVG_TIME_BETWEEN')


def construct_line(row, header):
    """Render one record as a single text line for the ``.vw`` output file.

    The line starts with the label (``is_attributed`` mapped from 0/1 to
    -1/+1) followed by one `` |<ns> ...`` section per emitted column, where
    ``<ns>`` is ``header[column]``.

    A column is numeric only when it is not one of the raw key columns, does
    not start with a categorical prefix, and does start with a numerical
    prefix. The categorical prefixes are checked first, which is what sends
    ``MINUTE...`` columns to the categorical side even though they also start
    with ``MIN``. Numeric columns are written as ``<ns>:<value>`` and skipped
    when the value is NaN or within 1e-5 of zero. Every other column is
    written as a bare ``<value>`` token.

    Args:
        row: Mapping-like object with ``.items()`` (a pandas row or a dict)
            that contains ``is_attributed``.
        header: ``{column name: namespace token}`` mapping.

    Returns:
        The formatted line, terminated by a newline.
    """
    label = 2 * int(row['is_attributed']) - 1
    sections = [f"{label}"]

    for name, value in row.items():
        if name in _VW_IGNORED:
            continue

        numeric = (
            name not in _VW_CATEGORICAL
            and not name.startswith(_VW_CATEGORICAL_PREFIXES)
            and name.startswith(_VW_NUMERICAL_PREFIXES)
        )
        if not numeric:
            sections.append(f"|{header[name]} {value}")
        elif not (math.isnan(value) or math.isclose(value, 0.0, abs_tol=0.00001)):
            sections.append(f"|{header[name]} {header[name]}:{value}")

    return " ".join(sections) + '\n'

In [13]:
import pickle

# Only the column layout is needed, so a single cached row is enough.
df = pd.read_hdf(cache_train, stop=1)

# Map each column to a short positional token ("n0", "n1", ...).
# construct_line() writes these tokens as the namespace of each section.
header = {c: f"n{i}" for i, c in enumerate(df.columns)}

# Persist the mapping and read it back; context managers make sure both
# file handles are flushed and closed.
with open("header.p", "wb") as header_file:
    pickle.dump(header, header_file)
with open("header.p", "rb") as header_file:
    header = pickle.load(header_file)

In [16]:
def convert(input_file, output_file):
    """Write every row of an HDF cache as one text line in ``output_file``.

    ``input_file`` is read in chunks of 100000 rows, and each row is
    formatted by ``construct_line`` using the module-level ``header``.

    Args:
        input_file: Path of the HDF cache to read.
        output_file: Path of the text file to create (overwritten, UTF-8).
    """
    with open(output_file, mode='w', encoding='utf-8') as vw_file:
        # Read the file the caller asked for. This used to be hard-wired to
        # cache_train, which made every export a copy of the training data.
        for chunk in pd.read_hdf(input_file, chunksize=100000):
            for _, row in chunk.iterrows():
                vw_file.write(construct_line(row, header))

In [None]:
# Export the training cache to its .vw text file.
convert(cache_train, '../data/interim/vw/train_2017-11-07_1700_08_1600.vw')

In [None]:
# Export the validation cache to its .vw text file.
convert(cache_val, '../data/interim/vw/validate_2017-11-09_0400_09_0600.vw')