In [1]:
import math
import time
import gc

import wordbatch
import numpy as np
import pandas as pd

from contextlib import contextmanager
from glob import glob
from itertools import combinations
from pathlib import Path

from sklearn.metrics import roc_auc_score
from wordbatch.extractors import WordHash
from wordbatch.models import FM_FTRL

In [2]:
# Column name -> compact unsigned integer dtype; also used when key columns
# are rebuilt from string index labels.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

In [3]:
# Raw id columns that are hashed as-is.
categorical_features = ['ip', 'app', 'device', 'os', 'channel']

# Entity keys (single ids and id pairs) that have 1-day aggregate columns.
# The order is significant: a feature's position in `numerical_features`
# becomes part of its hashed token prefix.
_entity_keys_1day = [
    'app', 'device', 'os', 'channel', 'ip',
    'app_device', 'app_os', 'app_channel', 'app_ip',
    'device_os', 'device_channel', 'device_ip',
    'os_channel', 'os_ip',
]

# Columns binned into 10 quantiles: first every PERCENT_TRUE aggregate, then,
# per entity key, the AVG_TIME_BETWEEN aggregate followed by its
# "WHERE is_attributed = True" variant.
numerical_features_q = [
    f'PERCENT_TRUE(clicks.is_attributed)_{key}_1day'
    for key in _entity_keys_1day
] + [
    f'AVG_TIME_BETWEEN(clicks.click_time{condition})_{key}_1day'
    for key in _entity_keys_1day
    for condition in ('', ' WHERE is_attributed = True')
]

# Columns binned into 100 quantiles (none at the moment).
numerical_features_l = []

numerical_features = numerical_features_q + numerical_features_l

# Column pairs whose values are joined into a single crossed token.
# Disabled candidates, kept for reference: each of device / app / os / channel
# paired with its own 1-day 'PERCENT_TRUE(clicks.is_attributed)',
# 'COUNT(clicks)' and 'COUNT(clicks WHERE is_attributed = True)' aggregate.
interaction_features = [
    ['app', 'channel'],
    ['app', 'device'],
    ['app', 'os'],
    ['os', 'channel'],
    ['ip', 'app'],
]

In [4]:
target_entities_init = ['app', 'device', 'os', 'channel', 'ip']

# Single entities are kept as plain strings, pairs as two-element lists.
# The ('channel', 'ip') pair is left out; 3- and 4-way combinations are not used.
target_entities = list(target_entities_init)
target_entities += [
    list(pair)
    for pair in combinations(target_entities_init, 2)
    if pair != ('channel', 'ip')
]

In [5]:
def split(df, target_entity):
    """Expose the index of a feature frame as explicit key column(s).

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame whose index identifies the entity. For a composite
        entity the index labels are strings with the member ids joined by '_'.
    target_entity : str or sequence of str
        A single column name, or the ordered column names of a composite key.

    Returns
    -------
    pandas.DataFrame
        `df` with one added column per entity name. For composite keys each
        column is cast to the dtype registered in the module-level `dtypes`.

    Raises
    ------
    ValueError
        From the integer cast when an index label has fewer '_'-separated
        parts than `target_entity` has names (the missing part is NaN).
    """
    if isinstance(target_entity, str):
        df[target_entity] = df.index
        return df

    names = list(target_entity)
    # `n` must be passed by keyword on current pandas, and the `.str` accessor
    # can no longer be tuple-unpacked, so each part is fetched by position.
    # Capping the split count at len(names) - 1 yields exactly one piece per name.
    pieces = df.index.str.split('_', n=len(names) - 1)
    for position, name in enumerate(names):
        # `.values` makes the assignment positional rather than index-aligned.
        df[name] = pieces.str.get(position).values
        df[name] = df[name].astype(dtypes[name])
    return df

def combine_features(df, features_prefix, feature_suffix):
    """Left-join every matching pre-computed feature file onto `df`.

    For each entry of `target_entities`, the files under
    ../data/interim/features/<entity>/ whose names start with
    `features_prefix` and end with `feature_suffix` + '.hdf.compress' are
    loaded in sorted order, their index is turned into key column(s) by
    `split`, and they are merged onto `df` using those key column(s).

    An AssertionError is raised when an entity has no matching file.
    Returns the merged frame.
    """
    for entity in target_entities:
        # Composite entities live in a directory named by their '_'-joined parts.
        if type(entity) is str:
            entity_dir = entity
        else:
            entity_dir = "_".join(entity)

        pattern = f"../data/interim/features/{entity_dir}/{features_prefix}*{feature_suffix}.hdf.compress"
        matching_files = sorted(glob(pattern))
        assert len(matching_files) > 0

        for path in matching_files:
            features = split(pd.read_hdf(path), entity)
            df = df.merge(features, how='left', left_on=entity, right_on=entity)
            # Drop the loaded frame before reading the next file.
            del features
            gc.collect()
    return df

In [6]:
# Training frame: reuse the cached merge when present, otherwise build and cache it.
cache_train = '../data/cache/train_wordbatch.hdf.compress'
if Path(cache_train).exists():
    df_train = pd.read_hdf(cache_train)
else:
    print("Train cache doesn't exist, creating")
    df_train = combine_features(
        pd.read_hdf('../data/interim/downsampled/train_2017-11-07_1700_08_1600_0.hdf.compress'),
        'features_2017-11-07_1700',
        'attributed2',
    )
    df_train.to_hdf(cache_train, 'train', mode='w', complib='blosc', fletcher32=True, complevel=9)

# Validation frame: the raw rows of day 9, hour 4, located through the
# day/hour row-range summary, with the same feature merge applied.
cache_val = '../data/cache/validate_wordbatch.hdf.compress'
if Path(cache_val).exists():
    df_val = pd.read_hdf(cache_val)
else:
    print("Validation cache doesn't exist, creating")
    train_summary = pd.read_csv('../data/interim/day_hour_train.csv')
    val_window = train_summary[(train_summary['day'] == 9) & (train_summary['hour'] == 4)]
    val_start_row = val_window['start'].values[0]
    val_stop_row = val_window['end'].values[0]
    df_val = combine_features(
        pd.read_hdf('../data/raw/train.hdf.compress', start=val_start_row, stop=val_stop_row),
        'features_2017-11-08_1700',
        'attributed2',
    )
    df_val.to_hdf(cache_val, 'train', mode='w', complib='blosc', fletcher32=True, complevel=9)

Train cache doesn't exist
Validation cache doesn't exist


In [7]:
@contextmanager
def timer(name):
    """Context manager that prints how long its body took, in whole seconds.

    The message is printed only when the body finishes without raising.
    """
    started = time.time()
    yield
    elapsed = time.time() - started
    print(f'[{name}] done in {elapsed:.0f} s')

In [8]:
# Size of the hashed feature space; shared by WordHash (n_features) and FM_FTRL (D).
D = 1 << 25

# Options for the WordHash extractor.
hash_options = dict(
    ngram_range=(1, 1),
    analyzer="word",
    lowercase=False,
    n_features=D,
    norm=None,
    binary=True,
)

wb = wordbatch.WordBatch(
    None,
    extractor=(WordHash, hash_options),
    procs=24,
    freeze=True,
    timeout=1800,
    verbose=0,
)

# Hyper-parameters for the FM_FTRL model.
ftrl_params = dict(
    alpha=0.05,
    beta=0.1,
    L1=0.0,
    L2=0.0,
    D=D,
    alpha_fm=0.02,
    L2_fm=0.0,
    init_fm=0.01,
    weight_fm=1.0,
    D_fm=8,
    e_noise=0.0,
    iters=3,
    inv_link="sigmoid",
    e_clip=1.0,
    threads=24,
    use_avx=1,
    verbose=0,
)

clf = FM_FTRL(**ftrl_params)

In [9]:
def discretize(df):
    """Replace each numerical feature column by its quantile-bin number.

    Columns listed in `numerical_features_q` are cut into 10 quantile bins and
    those in `numerical_features_l` into 100; duplicate bin edges are dropped.
    The frame is modified in place and returned.
    """
    binning_plan = (
        (numerical_features_q, 10),
        (numerical_features_l, 100),
    )
    for columns, n_bins in binning_plan:
        for column in columns:
            df[column] = pd.qcut(df[column], n_bins, labels=False, duplicates='drop')
    return df

In [10]:
def df2csr3(df):
    """Render every row of `df` as one space-separated string of feature tokens.

    Missing values are filled with 0 and numerical features are quantile-binned
    by `discretize` first. Tokens are emitted in this order:

    * ``C_<i>_<value>``        for the i-th entry of `categorical_features`
    * ``N_<i>_<bin>``          for the i-th entry of `numerical_features`
    * ``X_<i>_<left>_<right>`` for the i-th pair of `interaction_features`

    Each token is followed by a single space. The caller's frame is not
    modified, because `fillna` returns a new frame.

    Returns
    -------
    numpy.ndarray
        One string per row of `df`.
    """
    df = df.fillna(0)
    df = discretize(df)

    # Accumulate the row text in a standalone Series instead of adding one
    # temporary column per feature and selecting them back by name prefix:
    # that selection would also pick up any existing column whose name starts
    # with 'C_', 'N_' or 'X_', and fails on non-string column labels.
    text = pd.Series('', index=df.index, dtype=object)

    for i, c in enumerate(categorical_features):
        text = text + (f"C_{i}_" + df[c].astype(str) + " ")

    for i, c in enumerate(numerical_features):
        text = text + (f"N_{i}_" + df[c].astype(str) + " ")

    for i, (c1, c2) in enumerate(interaction_features):
        text = text + (f"X_{i}_" + df[c1].astype(str) + "_" + df[c2].astype(str) + " ")

    return text.values

In [11]:
def process_data(df, positive_weight=1.0, negative_weight=0.2):
    """Build the model inputs for a click frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the 'is_attributed' label.
        It is left unchanged, so the calling cell can be re-run.
    positive_weight : float, default 1.0
        Sample weight for rows whose label equals 1.
    negative_weight : float, default 0.2
        Sample weight for all other rows.

    Returns
    -------
    tuple
        ``(X, y, w)``: the output of ``wb.transform`` on the tokenised rows,
        the label Series, and a float array of per-row sample weights.
    """
    y = df['is_attributed']
    # Vectorised replacement for a per-row Python loop over the labels.
    w = np.where(y.values == 1, float(positive_weight), float(negative_weight))
    str_array = df2csr3(df)
    X = wb.transform(str_array)
    # Release the intermediate row strings once they have been transformed.
    del str_array
    gc.collect()
    return X, y, w

In [None]:
%%time
# Tokenise and hash the validation frame. The third return value (sample
# weights) is discarded: evaluation below only uses X_val and y_val.
X_val, y_val, _ = process_data(df_val)

CPU times: user 6min 7s, sys: 3min 4s, total: 9min 11s
Wall time: 9min 24s


In [None]:
with timer("data"):
    X_train, y_train, w_train = process_data(df_train)

# The raw frames are no longer needed once both matrices exist.
del df_train, df_val
gc.collect()

# One weighted partial_fit pass over the training matrix per epoch, each
# followed by a ROC AUC check on the validation matrix.
N_EPOCHS = 11
for epoch in range(N_EPOCHS):
    with timer("partial_fit"):
        clf.partial_fit(X_train, y_train, sample_weight=w_train)
    with timer("evaluate_batch test"):
        val_scores = clf.predict(X_val)
        print("ROC AUC:", roc_auc_score(y_val, val_scores))


del X_train, y_train, w_train
gc.collect()

del X_val, y_val
gc.collect()

[data] done in 210 s
[partial_fit] done in 1010 s
ROC AUC: 0.9533677302628775
[evaluate_batch test] done in 402 s
[partial_fit] done in 587 s
ROC AUC: 0.9533886980782357
[evaluate_batch test] done in 325 s
