In [6]:
import wordbatch
from wordbatch.extractors import WordHash
from wordbatch.models import FM_FTRL
import math
import pandas as pd
from sklearn.metrics import roc_auc_score
import time
import numpy as np
import gc
from contextlib import contextmanager

In [7]:
categorical_features = [
    'app', 'device', 'os', 'channel',
    'MODE(clicks.ip)_app_1day',
    'MODE(clicks.device)_app_1day',
    'MODE(clicks.os)_app_1day',
    'MODE(clicks.channel)_app_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_app_1day',
    'MODE(clicks.device WHERE is_attributed = True)_app_1day',
    'MODE(clicks.os WHERE is_attributed = True)_app_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_app_1day',
    'MODE(clicks.HOUR(click_time))_app_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_app_1day',
    'MODE(clicks.ip)_device_1day',
    'MODE(clicks.app)_device_1day',
    'MODE(clicks.os)_device_1day',
    'MODE(clicks.channel)_device_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.app WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.device WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.os WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.HOUR(click_time))_channel_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_device_1day',
    'MODE(clicks.app WHERE is_attributed = True)_device_1day',
    'MODE(clicks.os WHERE is_attributed = True)_device_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_device_1day',
    'MODE(clicks.HOUR(click_time))_device_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_device_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_os_1day',
    'MODE(clicks.app WHERE is_attributed = True)_os_1day',
    'MODE(clicks.device WHERE is_attributed = True)_os_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_os_1day',
    'MODE(clicks.HOUR(click_time))_os_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_os_1day',
    'MODE(clicks.ip)_os_1day',
    'MODE(clicks.app)_os_1day',
    'MODE(clicks.device)_os_1day',
    'MODE(clicks.channel)_os_1day',
    'MODE(clicks.ip)_channel_1day',
    'MODE(clicks.app)_channel_1day',
    'MODE(clicks.device)_channel_1day',
    'MODE(clicks.os)_channel_1day',
]

In [8]:
train_filenames = [
 '../data/interim/combined/train_2017-11-07_1700_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_1800_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_1900_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_2000_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_2100_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_2200_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-07_2300_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0000_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0100_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0200_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0300_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0400_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0500_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0600_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0700_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0800_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_0900_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1000_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1100_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1200_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1300_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1400_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1500_attributed.hdf.compress',
 '../data/interim/combined/train_2017-11-08_1600_attributed.hdf.compress']

In [9]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [89]:
D = 2 ** 25

wb = wordbatch.WordBatch(
    None, 
    extractor=(
        WordHash, 
        {
            "ngram_range": (1, 1), 
            "analyzer": "word",
            "lowercase": False, 
            "n_features": D,
            "norm": None, 
            "binary": True
        }),
#     minibatch_size=batchsize // 80,
#     method='threading',
    procs=24,
    freeze=True,
    timeout=1800,
    verbose=0
)

clf = FM_FTRL(
    alpha=0.05,
    beta=0.1,
    L1=0.0,
    L2=0.0,
    D=D,
    alpha_fm=0.02,
    L2_fm=0.0,
    init_fm=0.01,
    weight_fm=1.0,
    D_fm=8,
    e_noise=0.0,
    iters=3,
    inv_link="sigmoid",
    e_clip=1.0,
    threads=24,
    use_avx=1,
    verbose=0
)

In [11]:
def mkstr(row, columns):
    return ' '.join(f"C{i}{int(v)}" for i, v in enumerate(row[columns]) if (math.isnan(v) == False))

In [12]:
def discretize(df):
    return df

In [13]:
# np.log2(1 + df_val['PERCENT_TRUE(clicks.is_attributed)_app_1day'].values).astype(int)

In [14]:
# ds, bins = pd.qcut(df_val['PERCENT_TRUE(clicks.is_attributed)_app_1day'], 100, labels=False, duplicates='drop', retbins=True)
# bins

In [23]:
def df2csr2(df):
    df = discretize(df)
#     for fea in numerical_features: df[fea]= np.log2(1 + df[fea].values).astype(int)
    
#     df = df.rename(columns=lambda x: x.replace(" ", "_"))
#     cols = list(df)

    cats = df.apply(lambda row: mkstr(row, categorical_features), axis=1, raw=True)
    str_array= (" AXC" + df['app'].astype(str)+"_"+df['channel'].astype(str) \
        + " OXC" + df['os'].astype(str)+"_"+df['channel'].astype(str) \
        + " AXD" + df['app'].astype(str)+"_"+df['device'].astype(str) \
        + " IXA" + df['ip'].astype(str)+"_"+df['app'].astype(str) \
        + " AXO" + df['app'].astype(str)+"_"+df['os'].astype(str)
      )

    return (cats + str_array).values

In [84]:
def df2csr3(df):
    df.fillna(0)

    str_array = (" AXC" + df['app'].astype(str)+"_"+df['channel'].astype(str) \
        + " OXC" + df['os'].astype(str)+"_"+df['channel'].astype(str) \
        + " AXD" + df['app'].astype(str)+"_"+df['device'].astype(str) \
        + " IXA" + df['ip'].astype(str)+"_"+df['app'].astype(str) \
        + " AXO" + df['app'].astype(str)+"_"+df['os'].astype(str)
      )

    for i, c in enumerate(categorical_features):
        df[c] = "C" + str(i) + df[c].astype(str) + " "
       
    cats = df[categorical_features].sum(axis=1)
    
    return (cats + str_array).values

In [85]:
def process_data(filename):
    df = pd.read_hdf(filename)
    y = df.pop('is_attributed')
    w = np.multiply([1.0 if x == 1 else 0.2 for x in y], 1)
    str_array = df2csr3(df)
    X = wb.transform(str_array)
    del df, str_array
    gc.collect()
    return X, y, w

In [90]:
%%time
X_val, y_val, _ = process_data('../data/interim/combined/train_2017-11-09_0400_attributed.hdf.compress')

CPU times: user 4min 7s, sys: 1min 28s, total: 5min 36s
Wall time: 5min 54s


In [86]:
i = 0
for train_filename in train_filenames[:5]:
    with timer("data"):
        X_train, y_train, w_train = process_data(train_filename)
    with timer("partial_fit"):
        clf.partial_fit(X_train, y_train, sample_weight=w_train)

    i = i + 1
    if i % 5 == 0:
        print("Test:")
        with timer("evaluate_batch test"):
            print(i, "ROC AUC:", roc_auc_score(y_val, clf.predict(X_val)))


    del X_train, y_train, w_train
    gc.collect()

[data] done in 110 s
[partial_fit] done in 296 s
[data] done in 63 s
[partial_fit] done in 167 s
[data] done in 45 s
[partial_fit] done in 117 s
[data] done in 39 s
[partial_fit] done in 152 s
[data] done in 51 s
[partial_fit] done in 253 s
Test:


ValueError: Found input variables with inconsistent numbers of samples: [4032691, 1]

In [88]:
print(i, "ROC AUC:", roc_auc_score(y_val, clf.predict(X_val)))

5 ROC AUC: 0.9118884846888228
