In [None]:
# !sudo pip install git+https://github.com/anttttti/Wordbatch.git

In [1]:
import wordbatch
from wordbatch.extractors import WordHash
from wordbatch.models import FM_FTRL
# from wordbatch.data_utils import *
import threading
import pandas as pd
from sklearn.metrics import roc_auc_score
import time
import numpy as np
import gc
from contextlib import contextmanager

In [2]:
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8'
}

train_filenames = [
    '../data/interim/combined/train_2017-11-07_1700_08_1600_0_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_1_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_2_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_3_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_4_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_5_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_6_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_7_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_8_attributed.csv.gz',
    '../data/interim/combined/train_2017-11-07_1700_08_1600_9_attributed.csv.gz',
]

validate_filename = '../data/interim/combined/train_2017-11-09_0400_attributed.csv.gz'


categorical_features = [
    'app', 'device', 'os', 'channel',
    'MODE(clicks.ip)_app_1day',
    'MODE(clicks.device)_app_1day',
    'MODE(clicks.os)_app_1day',
    'MODE(clicks.channel)_app_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_app_1day',
    'MODE(clicks.device WHERE is_attributed = True)_app_1day',
    'MODE(clicks.os WHERE is_attributed = True)_app_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_app_1day',
    'MODE(clicks.HOUR(click_time))_app_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_app_1day',
    'MODE(clicks.ip)_device_1day',
    'MODE(clicks.app)_device_1day',
    'MODE(clicks.os)_device_1day',
    'MODE(clicks.channel)_device_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.app WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.device WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.os WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.HOUR(click_time))_channel_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_channel_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_device_1day',
    'MODE(clicks.app WHERE is_attributed = True)_device_1day',
    'MODE(clicks.os WHERE is_attributed = True)_device_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_device_1day',
    'MODE(clicks.HOUR(click_time))_device_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_device_1day',
    'MODE(clicks.ip WHERE is_attributed = True)_os_1day',
    'MODE(clicks.app WHERE is_attributed = True)_os_1day',
    'MODE(clicks.device WHERE is_attributed = True)_os_1day',
    'MODE(clicks.channel WHERE is_attributed = True)_os_1day',
    'MODE(clicks.HOUR(click_time))_os_1day',
    'MODE(clicks.HOUR(click_time) WHERE is_attributed = True)_os_1day',
    'MODE(clicks.ip)_os_1day',
    'MODE(clicks.app)_os_1day',
    'MODE(clicks.device)_os_1day',
    'MODE(clicks.channel)_os_1day',
    'MODE(clicks.ip)_channel_1day',
    'MODE(clicks.app)_channel_1day',
    'MODE(clicks.device)_channel_1day',
    'MODE(clicks.os)_channel_1day',
]


numerical_features = [
    'COUNT(clicks)_device_1day',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_device_1day',
    'COUNT(clicks WHERE is_attributed = True)_device_1day',
    'PERCENT_TRUE(clicks.is_attributed)_os_1day',
    'AVG_TIME_BETWEEN(clicks.click_time)_os_1day',
    'COUNT(clicks)_os_1day',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_os_1day',
    'COUNT(clicks WHERE is_attributed = True)_os_1day',
    'PERCENT_TRUE(clicks.is_attributed)_channel_1day',
    'AVG_TIME_BETWEEN(clicks.click_time)_channel_1day',
    'COUNT(clicks)_channel_1day',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_channel_1day',
    'COUNT(clicks WHERE is_attributed = True)_channel_1day',
    'PERCENT_TRUE(clicks.is_attributed)_app_1day',
    'AVG_TIME_BETWEEN(clicks.click_time)_app_1day',
    'COUNT(clicks)_app_1day',
    'AVG_TIME_BETWEEN(clicks.click_time WHERE is_attributed = True)_app_1day',
    'COUNT(clicks WHERE is_attributed = True)_app_1day',
    'PERCENT_TRUE(clicks.is_attributed)_device_1day',
    'AVG_TIME_BETWEEN(clicks.click_time)_device_1day',
]

In [3]:
def fit_batch(clf, X, y, w):  clf.partial_fit(X, y, sample_weight=w)

def predict_batch(clf, X):  return clf.predict(X)

def evaluate_batch(clf, X, y, rcount):
    auc= roc_auc_score(y, predict_batch(clf, X))
    print(rcount, "ROC AUC:", auc)
    return auc

In [4]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

def df_add_counts(df, cols):
    arr_slice = df[cols].values
    unq, unqtags, counts = np.unique(np.ravel_multi_index(arr_slice.T, arr_slice.max(0) + 1),
                                     return_inverse=True, return_counts=True)
    df["_".join(cols)+'_count'] = counts[unqtags]

def df2csr(wb, df, pick_hours=None):
    df.reset_index(drop=True, inplace=True)
    with timer("Adding counts"):
        df['click_time']= pd.to_datetime(df['click_time'])
        dt= df['click_time'].dt
        df['day'] = dt.day.astype('uint8')
        df['hour'] = dt.hour.astype('uint8')
        del(dt)
        df_add_counts(df, ['ip', 'day', 'hour'])
        df_add_counts(df, ['ip', 'app'])
        df_add_counts(df, ['ip', 'app', 'os'])
        df_add_counts(df, ['ip', 'device'])
        df_add_counts(df, ['app', 'channel'])
        #cpuStats()

    with timer("Adding next click times"):
        D= 2**26
        df['category'] = (df['ip'].astype(str) + "_" + df['app'].astype(str) + "_" + df['device'].astype(str) \
                         + "_" + df['os'].astype(str)).apply(hash) % D
        click_buffer= np.full(D, 3000000000, dtype=np.uint32)
        df['epochtime']= df['click_time'].astype(np.int64) // 10 ** 9
        next_clicks= []
        for category, time in zip(reversed(df['category'].values), reversed(df['epochtime'].values)):
            next_clicks.append(click_buffer[category]-time)
            click_buffer[category]= time
        del(click_buffer)
        df['next_click']= list(reversed(next_clicks))

    for fea in ['ip_day_hour_count','ip_app_count','ip_app_os_count','ip_device_count',
                'app_channel_count','next_click']:  df[fea]= np.log2(1 + df[fea].values).astype(int)

    with timer("Generating str_array"):
        str_array= ("I" + df['ip'].astype(str) \
            + " A" + df['app'].astype(str) \
            + " D" + df['device'].astype(str) \
            + " O" + df['os'].astype(str) \
            + " C" + df['channel'].astype(str) \
            + " WD" + df['day'].astype(str) \
            + " H" + df['hour'].astype(str) \
            + " AXC" + df['app'].astype(str)+"_"+df['channel'].astype(str) \
            + " OXC" + df['os'].astype(str)+"_"+df['channel'].astype(str) \
            + " AXD" + df['app'].astype(str)+"_"+df['device'].astype(str) \
            + " IXA" + df['ip'].astype(str)+"_"+df['app'].astype(str) \
            + " AXO" + df['app'].astype(str)+"_"+df['os'].astype(str) \
            + " IDHC" + df['ip_day_hour_count'].astype(str) \
            + " IAC" + df['ip_app_count'].astype(str) \
            + " AOC" + df['ip_app_os_count'].astype(str) \
            + " IDC" + df['ip_device_count'].astype(str) \
            + " AC" + df['app_channel_count'].astype(str) \
            + " NC" + df['next_click'].astype(str)
          ).values
    #cpuStats()
    if 'is_attributed' in df.columns:
        labels = df['is_attributed'].values
        weights = np.multiply([1.0 if x == 1 else 0.2 for x in df['is_attributed'].values],
                              df['hour'].apply(lambda x: 1.0 if x in pick_hours else 0.5))
    else:
        labels = []
        weights = []
    return str_array, labels, weights

In [5]:
def df2csr2(wb, df, pick_hours=None):
    labels = df['is_attributed'].values
    weights = np.multiply([1.0 if x == 1 else 0.2 for x in df['is_attributed'].values], 0.5)
    df.drop(columns=['is_attributed'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    for fea in numerical_features:  df[fea]= np.log2(1 + df[fea].values).astype(int)
    
    df = df.rename(columns=lambda x: x.replace(" ", "_"))
    cols = list(df)
    def mkstr(row):
        return ' '.join(f"{c}{v}" for (c, v) in zip(cols, row))

    str_array = df.apply(lambda row: mkstr(row), axis=1, raw=True).values
        
    return str_array, labels, weights

In [6]:
batchsize = 1000000
D = 2 ** 25

wb = wordbatch.WordBatch(
    None, 
    extractor=(
        WordHash, 
        {
            "ngram_range": (1, 2), 
            "analyzer": "word",
            "lowercase": False, 
            "n_features": D,
            "norm": None, 
            "binary": True
        }),
    minibatch_size=batchsize // 80,
    procs=8,
    freeze=True,
    timeout=1800,
    verbose=0
)

clf = FM_FTRL(
    alpha=0.05,
    beta=0.1,
    L1=0.0,
    L2=0.0,
    D=D,
    alpha_fm=0.02,
    L2_fm=0.0,
    init_fm=0.01,
    weight_fm=1.0,
    D_fm=8,
    e_noise=0.0,
    iters=3,
    inv_link="sigmoid",
    e_clip=1.0,
    threads=24,
    use_avx=1,
    verbose=0
)

In [7]:
df_val = pd.read_csv(validate_filename, engine='c', sep=",", dtype=dtypes)
str_array_val, labels_val, weights_val = df2csr2(wb, df_val)
X_val = wb.transform(str_array_val)
del df_val, str_array_val
gc.collect()

7

In [None]:
i = 0
for train_filename in train_filenames:
    for df_c in pd.read_csv(train_filename, engine='c', chunksize=batchsize, sep=",", dtype=dtypes):
        with timer("df2csr"):
            str_array, labels, weights= df2csr(wb, df_c)
        with timer("transform"):
            X = wb.transform(str_array)
        with timer("fit_batch"):
            fit_batch(clf, X, labels, weights)
        print("Train:")
        with timer("evaluate_batch train"):
            evaluate_batch(clf, X, labels, i)
        print("Test:")
        with timer("evaluate_batch test"):
            evaluate_batch(clf, X_val, labels_val, i)
        i = i + 1

        del df_c, X, str_array
        gc.collect()

[Adding counts] done in 1 s
[Adding next click times] done in 10 s
[Generating str_array] done in 30 s
