In [1]:
import matplotlib.pyplot as plt
from transform import transform
import os.path as path
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import time
import gc

warnings.filterwarnings("ignore")

%matplotlib inline
pd.options.display.float_format = '{:.5f}'.format
pd.options.display.max_columns = 100

train_columns = ['ip', 'app', 'os', 'device', 'channel', 'click_time', 'is_attributed']
dtypes = {
    'ip'            : 'uint32',
    'app'           : 'uint16',
    'os'            : 'uint16',
    'device'        : 'uint16',
    'channel'       : 'uint16',
    'is_attributed' : 'uint8'
}

In [2]:
data_perc = 0.02 # fraction of the training rows to load; ignored if processed data is available
use_gpu = False # when True, the xgboost parameters are switched to their GPU variants

# carregando do csv

In [3]:
start = time.time()

# Prefer the pre-processed CSV when it exists; otherwise fall back to the raw training file.
has_processed_data = path.isfile('input/train_processed.csv')
filename = 'input/train_processed.csv' if has_processed_data else 'input/train.csv'

if has_processed_data:
    # Processed data is always loaded in full, so the sampling fraction is overridden.
    data_perc = 1.0
    # Count the data rows (total lines minus the header line). The context manager
    # closes the file as soon as counting ends instead of leaking the handle.
    with open(filename) as processed_file:
        train_size_total = sum(1 for _ in processed_file) - 1
else:
    # Hard-coded row count used for input/train.csv (presumably its real length -
    # TODO confirm); it avoids a full pass over the raw file just to count lines.
    train_size_total = 184903890

# Only the first train_size rows of the chosen file are read.
train_size = int(data_perc*train_size_total)

data_train = pd.read_csv(filename, nrows=train_size, usecols=train_columns, parse_dates=['click_time'], dtype=dtypes)
print('{:.2f}s to load train data'.format(time.time()-start))

3.18s to load train data


# extraindo features interessantes

In [4]:
# Time the feature-extraction step. The return value of transform is not used,
# so it presumably adds the feature columns to data_train in place - confirm in transform.py.
start = time.time()
transform(data_train)
process_time = time.time() - start

# Throughput in rows per second for the step above.
lines_per_second = len(data_train) / process_time
print('{:.2f}s to process data ({:.2f} lines/s)'.format(process_time, lines_per_second))

0.39s to generate feature ip_clicks
0.83s to generate feature ip_clicks_on_app
0.07s to generate feature ip_clicks_on_app_perc
0.37s to generate feature app_clicks
0.38s to generate feature os_clicks
0.59s to generate feature os_clicks_on_app
0.07s to generate feature os_clicks_on_app_perc
0.47s to generate feature device_clicks
0.62s to generate feature device_clicks_on_app
0.07s to generate feature device_clicks_on_app_perc
0.54s to generate feature channel_clicks
0.62s to generate feature channel_clicks_on_app
0.06s to generate feature channel_clicks_on_app_perc
0.90s to generate feature ip_os_clicks
1.37s to generate feature ip_os_clicks_on_app
0.07s to generate feature ip_os_clicks_on_app_perc
1.22s to generate feature ip_os_device_clicks
1.47s to generate feature ip_os_device_clicks_on_app
0.05s to generate feature ip_os_device_clicks_on_app_perc
1.84s to generate feature ip_os_device_channel_clicks
1.95s to generate feature ip_os_device_channel_clicks_on_app
0.06s to generate fe

KeyboardInterrupt: 

# valores médios e desvio padrão de cada feature

### clicks seguidos de download

In [None]:
# Rows 1 and 2 of describe() (mean and std when only numeric columns are summarised)
# for the clicks that were followed by a download; the target column itself is dropped.
(
    data_train.loc[data_train['is_attributed'] == 1]
    .describe()
    .drop(columns=['is_attributed'])
    .iloc[1:3]
)

### clicks não seguidos de download

In [None]:
# Rows 1 and 2 of describe() (mean and std when only numeric columns are summarised)
# for the clicks that were not followed by a download; the target column itself is dropped.
(
    data_train.loc[data_train['is_attributed'] == 0]
    .describe()
    .drop(columns=['is_attributed'])
    .iloc[1:3]
)

# distribuições das variáveis

In [None]:
# One row of histograms per feature: clicks without a download on the left,
# clicks followed by a download on the right (red).
feature_columns = [column for column in data_train.columns if column != 'is_attributed']

# Build the two class masks once. The previous version re-filtered (and copied)
# the whole frame twice for every column inside the loop.
not_downloaded = data_train['is_attributed'] == 0
downloaded = data_train['is_attributed'] == 1

fig = plt.figure(figsize=(10,300))
# The grid has exactly one row per plotted feature; the target column no longer
# reserves an unused row at the bottom of the figure.
n_rows = len(feature_columns)
for row, column in enumerate(feature_columns):
    ax = fig.add_subplot(n_rows, 2, 2*row + 1)
    sns.distplot(data_train.loc[not_downloaded, column], kde=False, ax=ax)
    ax = fig.add_subplot(n_rows, 2, 2*row + 2)
    sns.distplot(data_train.loc[downloaded, column], kde=False, color='red', ax=ax)

# treinamento

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Target vector first, then the feature matrix (every column except the target).
y = data_train['is_attributed']
X = data_train.drop('is_attributed', axis=1)

In [None]:
start = time.time()

# Fixed seed so the stratified 90/10 split is identical on every run; without it
# the validation rows (and every score derived from them) change between runs.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=SPLIT_SEED
)

# The unsplit copies are no longer needed once the four parts exist.
del X, y
gc.collect()
print('{:.2f}s to split data in train/test'.format(time.time()-start))

In [None]:
start = time.time()

# Count each class by summing a boolean mask over the target column. The previous
# version built two filtered copies of the whole frame only to read their row counts.
labels = data_train['is_attributed']
negative_count = int((labels == 0).sum())
positive_count = int((labels == 1).sum())

# Fail with an explicit message instead of a bare ZeroDivisionError when the loaded
# sample contains no positive rows.
if positive_count == 0:
    raise ValueError('no rows with is_attributed == 1 in the loaded data; cannot compute the unbalance factor')

# Ratio of negative to positive rows, used later as scale_pos_weight.
unbalance_factor = negative_count / positive_count

# labels references a column of data_train, so it is deleted together with the frame.
del data_train, labels
gc.collect()
print('{:.2f}s to compute unbalance factor: {}'.format(time.time()-start, unbalance_factor))

In [None]:
# https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# The seed is still drawn from the clock (a different model on every run, as before),
# but it is now kept in a variable and printed so a given run can be reproduced.
xgb_seed = int(time.time())
print('xgboost random_state: {}'.format(xgb_seed))

xgb_params = {
    'eta': 0.2,
    'max_leaves': 2048,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'colsample_bylevel':0.7,
    'min_child_weight':0,
    'alpha': 5,
    # ratio of negative to positive rows computed in an earlier cell
    'scale_pos_weight': unbalance_factor,
    'eval_metric': 'auc',
    'random_state': xgb_seed,
    'nthread': 4,
    'silent': True,
    # per the XGBoost parameter docs, 0 means no depth limit; with the 'lossguide'
    # policy tree size is bounded by max_leaves instead
    'max_depth': 0,
    'grow_policy': 'lossguide',
    'tree_method': 'hist',
    'predictor': 'cpu_predictor',
    'objective': 'binary:logistic'
}

# Swap the CPU-specific entries for their GPU counterparts when requested.
# NOTE(review): recent XGBoost releases appear to have dropped the 'gpu:' objective
# prefix and the 'silent' flag - confirm against the installed version.
if use_gpu:
    xgb_params.update({'tree_method':'gpu_hist', 'predictor':'gpu_predictor', 'objective':'gpu:binary:logistic'})

In [None]:
start = time.time()
dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)
del X_train, y_train, X_test, y_test
gc.collect()
print('{:.2f}s to create xgboost data structures'.format(time.time()-start))

In [None]:
# Evaluation sets reported during training; the metric is the 'auc' configured in
# xgb_params (not accuracy), hence maximize=True.
watchlist = [(dtrain, 'training'), (dvalid, 'validation')]

start = time.time()
# Up to 1000 boosting rounds, progress printed every 5 rounds, stopping after 25
# rounds without improvement (per the XGBoost docs, early stopping follows the
# last entry of evals, i.e. the validation set here).
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=watchlist,
    maximize=True,
    early_stopping_rounds=25,
    verbose_eval=5,
)

# watchlist holds references to both DMatrix objects, so it has to be deleted too;
# otherwise the del/gc.collect() below cannot release their memory.
del watchlist, dvalid, dtrain
gc.collect()
print('{:.2f}s to perform training'.format(time.time()-start))

In [None]:
# Draw the feature-importance chart of the trained booster on a tall 10x15 axes.
_, ax = plt.subplots(figsize=(10, 15))
xgb.plot_importance(booster=model, ax=ax)