In [None]:
# One-time setup, only if wordbatch is not already installed in the kernel's environment:
# %pip install git+https://github.com/anttttti/Wordbatch.git

In [1]:
import wordbatch
from wordbatch.extractors import WordHash
from wordbatch.models import FM_FTRL
# from wordbatch.data_utils import *
import threading
import pandas as pd
from sklearn.metrics import roc_auc_score
import time
import numpy as np
import gc
from contextlib import contextmanager

In [2]:
@contextmanager
def timer(name):
    """Context manager that reports how long the wrapped block took.

    When the ``with`` body finishes normally, prints
    ``[<name>] done in <N> s`` where N is the wall-clock duration measured
    with ``time.time()`` and formatted to whole seconds.

    Nothing is printed if the body raises: the exception propagates out of
    the ``yield`` before the report line is reached.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

def df_add_counts(df, cols):
    """Add a column with, for every row, the number of rows sharing its key.

    The key is the tuple of values in ``cols``. The result is written to
    ``df`` in place under the name ``"<col1>_<col2>_..._count"``; nothing is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. The ``cols`` columns must hold non-negative
        integers, because they are used as coordinates for
        ``np.ravel_multi_index``.
    cols : list of str
        Names of the columns that form the grouping key.
    """
    count_col = "_".join(cols) + '_count'
    keys = df[cols].values

    if keys.shape[0] == 0:
        # max() over zero rows raises; an empty frame just gets an empty column.
        df[count_col] = np.zeros(0, dtype=np.intp)
        return

    # Compute the per-column extents in int64. In a narrow dtype such as uint8
    # `max + 1` wraps around (255 + 1 -> 0) and yields invalid dimensions.
    dims = keys.max(axis=0).astype(np.int64) + 1

    # Collapse each row's key tuple into one integer so np.unique can count it.
    flat_keys = np.ravel_multi_index(keys.T, dims)
    _, inverse, counts = np.unique(flat_keys, return_inverse=True, return_counts=True)

    # Broadcast each group's count back onto its member rows.
    df[count_col] = counts[inverse]

def df2csr(wb, df, pick_hours=None):
    """Turn a click-log frame into token strings, labels and sample weights.

    ``df`` is modified in place: its index is reset, ``click_time`` is
    converted to datetime, and the helper columns ``day``, ``hour``, the five
    ``*_count`` columns, ``category``, ``epochtime`` and ``next_click`` are
    added. The count and ``next_click`` columns end up as truncated
    ``log2(1 + value)`` integers.

    Parameters
    ----------
    wb : object
        Not used by this function; kept so existing callers keep working.
    df : pandas.DataFrame
        Must contain ``ip``, ``app``, ``device``, ``os``, ``channel`` and
        ``click_time``. ``is_attributed`` is optional.
        NOTE(review): the next-click pass walks rows from last to first, so
        it presumably expects rows in ascending ``click_time`` order — confirm
        against the data source.
    pick_hours : collection of int, optional
        Hours whose rows keep an hour factor of 1.0; rows in any other hour
        get 0.5. ``None`` is treated as an empty collection (every row gets
        0.5). Only consulted when ``is_attributed`` is present.

    Returns
    -------
    str_array : numpy.ndarray
        One space-separated token string per row.
    labels : numpy.ndarray or list
        ``df['is_attributed'].values``, or ``[]`` when the column is absent.
    weights : pandas.Series or list
        Per-row product of the label factor (1.0 where ``is_attributed == 1``,
        else 0.2) and the hour factor; ``[]`` when the label column is absent.
    """
    df.reset_index(drop=True, inplace=True)

    with timer("Adding counts"):
        df['click_time'] = pd.to_datetime(df['click_time'])
        dt = df['click_time'].dt
        df['day'] = dt.day.astype('uint8')
        df['hour'] = dt.hour.astype('uint8')
        del dt
        for key_cols in (['ip', 'day', 'hour'],
                         ['ip', 'app'],
                         ['ip', 'app', 'os'],
                         ['ip', 'device'],
                         ['app', 'channel']):
            df_add_counts(df, key_cols)

    with timer("Adding next click times"):
        D = 2**26
        # Bucket every (ip, app, device, os) combination into one of D slots.
        # Python salts str hashes per process unless PYTHONHASHSEED is set, so
        # bucket ids differ between runs; they are only compared within this
        # call. Distinct combinations can share a bucket after the modulo.
        df['category'] = (df['ip'].astype(str) + "_" + df['app'].astype(str) + "_" + df['device'].astype(str)
                          + "_" + df['os'].astype(str)).apply(hash) % D
        # Per bucket, the epoch of the most recently visited (i.e. later) click.
        # The 3e9 sentinel gives the last click of each bucket a large gap.
        click_buffer = np.full(D, 3000000000, dtype=np.uint32)
        df['epochtime'] = df['click_time'].astype(np.int64) // 10 ** 9

        categories = df['category'].values
        epochs = df['epochtime'].values
        # Fill a preallocated array back-to-front instead of building a Python
        # list of scalars and reversing it.
        next_clicks = np.empty(len(df), dtype=np.int64)
        for pos in range(len(df) - 1, -1, -1):
            bucket = categories[pos]
            # Deliberately not named `time`, which would shadow the time module.
            click_epoch = epochs[pos]
            next_clicks[pos] = click_buffer[bucket] - click_epoch
            click_buffer[bucket] = click_epoch
        del click_buffer
        df['next_click'] = next_clicks

    # Compress the heavy-tailed numeric features into integer log2 buckets.
    for fea in ['ip_day_hour_count', 'ip_app_count', 'ip_app_os_count', 'ip_device_count',
                'app_channel_count', 'next_click']:
        df[fea] = np.log2(1 + df[fea].values).astype(int)

    with timer("Generating str_array"):
        # Each feature becomes a "<PREFIX><value>" token; the prefixes keep
        # values of different features from being the same token.
        str_array = ("I" + df['ip'].astype(str)
            + " A" + df['app'].astype(str)
            + " D" + df['device'].astype(str)
            + " O" + df['os'].astype(str)
            + " C" + df['channel'].astype(str)
            + " WD" + df['day'].astype(str)
            + " H" + df['hour'].astype(str)
            + " AXC" + df['app'].astype(str) + "_" + df['channel'].astype(str)
            + " OXC" + df['os'].astype(str) + "_" + df['channel'].astype(str)
            + " AXD" + df['app'].astype(str) + "_" + df['device'].astype(str)
            + " IXA" + df['ip'].astype(str) + "_" + df['app'].astype(str)
            + " AXO" + df['app'].astype(str) + "_" + df['os'].astype(str)
            + " IDHC" + df['ip_day_hour_count'].astype(str)
            + " IAC" + df['ip_app_count'].astype(str)
            + " AOC" + df['ip_app_os_count'].astype(str)
            + " IDC" + df['ip_device_count'].astype(str)
            + " AC" + df['app_channel_count'].astype(str)
            + " NC" + df['next_click'].astype(str)
          ).values

    if 'is_attributed' in df.columns:
        labels = df['is_attributed'].values
        # The default pick_hours=None used to fail with "argument of type
        # 'NoneType' is not iterable"; treat it as "no picked hours".
        picked = () if pick_hours is None else pick_hours
        label_factor = np.where(labels == 1, 1.0, 0.2)
        hour_factor = df['hour'].apply(lambda h: 1.0 if h in picked else 0.5)
        weights = np.multiply(label_factor, hour_factor)
    else:
        labels = []
        weights = []
    return str_array, labels, weights

In [3]:
# Nominal batch size; in this cell it only feeds WordBatch's minibatch_size.
batchsize = 10000000
# Size of the hashed feature space, shared by the WordHash extractor
# (n_features) and the FM_FTRL model (D).
D = 2 ** 25

# Options handed to the WordHash extractor.
hash_extractor_params = dict(
    ngram_range=(1, 1),
    analyzer="word",
    lowercase=False,
    n_features=D,
    norm=None,
    binary=True,
)

wb = wordbatch.WordBatch(
    None,
    extractor=(WordHash, hash_extractor_params),
    minibatch_size=batchsize // 80,
    procs=8,
    freeze=True,
    timeout=1800,
    verbose=0,
)

# Hyper-parameters of the FM_FTRL classifier.
fm_ftrl_params = dict(
    alpha=0.05,
    beta=0.1,
    L1=0.0,
    L2=0.0,
    D=D,
    alpha_fm=0.02,
    L2_fm=0.0,
    init_fm=0.01,
    weight_fm=1.0,
    D_fm=8,
    e_noise=0.0,
    iters=3,
    inv_link="sigmoid",
    e_clip=1.0,
    threads=24,
    use_avx=1,
    verbose=0,
)

clf = FM_FTRL(**fm_ftrl_params)

In [19]:
validate_filename = '../data/interim/train_2017-11-09_0400.csv'

# `dtypes` was passed to read_csv below without being defined anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): the mapping covers the integer columns df2csr reads; the
# widths are assumptions — confirm against the CSV and narrow them if memory
# matters.
dtypes = {
    'ip': 'uint32',
    'app': 'uint32',
    'device': 'uint32',
    'os': 'uint32',
    'channel': 'uint32',
    'is_attributed': 'uint8',
}

df_val = pd.read_csv(validate_filename, engine='c', sep=",", dtype=dtypes)
str_array_val, labels_val, weights_val = df2csr(wb, df_val, pick_hours={4, 5, 10, 13, 14})
X_val = wb.transform(str_array_val)
# Only the transformed matrix, labels and weights are kept; drop the frame and
# the token strings to release memory.
del df_val, str_array_val
gc.collect()

[Adding counts] done in 8 s
[Adding next click times] done in 38 s
[Generating str_array] done in 119 s


179

In [5]:
# Load the full training set, featurize it, and fit the model in one pass.
df = pd.read_hdf('../data/raw/train.hdf.compress')
str_array, y, weights = df2csr(wb, df, pick_hours={4, 5, 10, 13, 14})
with timer("transform"):
    X = wb.transform(str_array)
with timer("fit_batch"):
    clf.partial_fit(X, y, sample_weight=weights)
print("Train:")
with timer("evaluate_batch train"):
    # The print used to start with `i`, a batch counter that is never defined
    # in this notebook; it raised NameError only after the fit had finished.
    # Note that this AUC is computed on the rows the model was just fitted on.
    print("ROC AUC:", roc_auc_score(y, clf.predict(X)))

KeyboardInterrupt: 