In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

from utils import *

import xgboost as xgb

import numpy as np

# Notebook-wide configuration.
seed = 42
# Seed NumPy's global RNG: get_sample_timeseries and permutation_importances
# draw from np.random, so without this every run samples different rows.
np.random.seed(seed)

# Root folder holding the feather/csv inputs; list it to show what is available.
PATH = Path('data')
list(PATH.iterdir())

[PosixPath('data/submission'),
 PosixPath('data/train_day8_3to16_FE.feather'),
 PosixPath('data/sample_submission.csv'),
 PosixPath('data/train_day9_3to16_FE.feather'),
 PosixPath('data/train_sample.csv'),
 PosixPath('data/train_day7_3to16_FE.feather'),
 PosixPath('data/dtree.dot'),
 PosixPath('data/test_FE.feather'),
 PosixPath('data/val_idxs.p'),
 PosixPath('data/mean_enc_df'),
 PosixPath('data/validation')]

In [2]:
def metric(rf,X_val,y_val):
    """Return the ROC AUC of classifier `rf` on a validation set.

    `rf` must expose `predict_proba`; column 1 of its output (the
    positive-class probability for scikit-learn style binary classifiers)
    is scored against `y_val` with `roc_auc_score`.
    """
    positive_scores = rf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, positive_scores)
    return auc

def permutation_importances(rf,X_val,y_val,metric):
    """Compute permutation importances for every column of `X_val`.

    For each column the values are shuffled in place, the model is re-scored
    with `metric(rf, X_val, y_val)`, and the column is restored. The importance
    of a column is the drop in score relative to the unshuffled baseline.

    Parameters
    ----------
    rf : fitted model, passed through to `metric` untouched.
    X_val : pandas.DataFrame of validation features. It is mutated temporarily
        but is guaranteed to hold its original values when the function exits,
        even if `metric` raises.
    y_val : validation targets, passed through to `metric`.
    metric : callable `(model, X, y) -> float`, higher is better.

    Returns
    -------
    numpy.ndarray with one `baseline - shuffled_score` entry per column, in
    `X_val.columns` order.
    """
    baseline = metric(rf, X_val, y_val)
    imp = []
    for col in X_val.columns:
        save = X_val[col].copy()
        try:
            X_val[col] = np.random.permutation(X_val[col])
            m = metric(rf, X_val, y_val)
        finally:
            # Always put the original values back so a failing metric call
            # cannot leave the caller's frame with a shuffled column.
            X_val[col] = save
        print(f'Score after {col} perm: {m:.5f}')
        imp.append(baseline - m)
    return np.array(imp)

def get_sample_timeseries(filename,sz):
    """Load a feather file and return a random sample of at most `sz` rows.

    The sampled rows keep the relative order they have in the file
    (presumably chronological, given the function name - confirm against the
    data preparation step).

    Parameters
    ----------
    filename : passed unchanged to `get_feather`.
    sz : maximum number of rows to keep; if the frame has fewer rows, all of
        them are returned.

    Returns
    -------
    pandas.DataFrame with a fresh 0..n-1 index.
    """
    df = get_feather(filename)
    # Shuffle all row positions, keep the first `sz`, then sort them so the
    # sample preserves the file's row order.
    sample_idx = np.sort(np.random.permutation(df.shape[0])[:sz])
    # iloc selects by position, so this no longer relies on the frame having a
    # default integer index; drop=True avoids the temporary 'index' column
    # (and the clash it causes when the data already has a column named 'index').
    df = df.iloc[sample_idx].reset_index(drop=True)
    gc.collect()
    return df

def prediction_score(rf,train_df,y_train,val_df,y_val):
    """Print train and validation ROC AUC for `rf` and return the validation AUC.

    Parameters
    ----------
    rf : fitted classifier exposing `predict_proba`.
    train_df, y_train : training features and targets.
    val_df, y_val : validation features and targets.

    Returns
    -------
    The validation ROC AUC as returned by `roc_auc_score`.
    """
    y_train_pred = rf.predict_proba(train_df)[:,1]
    print(f'Train AUC: {roc_auc_score(y_train, y_train_pred)}')
    y_val_pred = rf.predict_proba(val_df)[:,1]
    val_auc = roc_auc_score(y_val, y_val_pred)
    # Reuse the value computed above instead of scoring the validation set twice.
    print(f'Val AUC: {val_auc}')
    return val_auc
def get_val_by_name(name):
    """Load one validation feather file and split it into features and target.

    Parameters
    ----------
    name : path of the feather file to read.

    Returns
    -------
    (val_df, y_val) : the frame with the columns listed in the module-level
        `cols_to_drop` removed, and the `is_attributed` column as the target.
    """
    # Read the file named by the argument. The previous code read the global
    # `i`, which only worked while called from a loop using that variable name.
    val_df = pd.read_feather(name)
    y_val = val_df.is_attributed
    val_df = val_df.drop(columns=cols_to_drop)
    gc.collect()
    return val_df,y_val
def get_train(sz=3000000, filename='train_day8_3to16_FE.feather'):
    """Load a random sample of a training file and split features from target.

    Parameters
    ----------
    sz : number of rows to sample (forwarded to `get_sample_timeseries`).
    filename : training feather file to sample from; defaults to the day-8
        file that was previously hard-coded.

    Returns
    -------
    (train_df, y_train) : the sampled frame with the columns listed in the
        module-level `cols_to_drop` removed, and the `is_attributed` column
        as the target.
    """
    train_df = get_sample_timeseries(filename, sz)
    y_train = train_df.is_attributed
    train_df = train_df.drop(columns=cols_to_drop)
    gc.collect()
    return train_df,y_train

# Get train data

In [3]:
# Columns stripped from every feature frame before modelling:
# the target itself plus the `ip` and `day` columns.
cols_to_drop = [
    'is_attributed',
    'ip',
    'day',
]

In [4]:
# Paths (as plain strings) of every file in the validation folder,
# in whatever order the filesystem yields them.
val_names = list(map(str, (PATH/'validation').iterdir()))
val_names

['data/validation/val3.feather',
 'data/validation/val2.feather',
 'data/validation/val0.feather',
 'data/validation/val1.feather']

In [5]:
# Sample the training set with get_train's default size and split off the target.
(train_df, y_train) = get_train()

In [6]:
%%time

# XGBoost settings shared by every validation run below.
params = {'colsample_bytree': 0.7, 
          'eval_metric': 'auc', 
          'learning_rate': 0.1, 
          'max_depth': 4, 
          'min_child_weight': 100, 
          'objective': 'binary:logistic', 
          'seed': seed, 
          'subsample': 0.9500000000000001,
          'scale_pos_weight': 100,
          'tree_method': 'gpu_hist'}

# Build the training matrix once, then release the pandas copies to save memory.
dtrain = xgb.DMatrix(train_df,y_train)
del train_df,y_train
gc.collect()

# Train one early-stopped model per validation file and report its AUC.
for i in val_names:           
    val_df,y_val = get_val_by_name(i)
    dval = xgb.DMatrix(val_df,y_val)
    
    del val_df,y_val
    gc.collect()

    # The last watchlist entry ('valid') is the one that drives early stopping.
    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    xgb_model = xgb.train(params, dtrain, 2000, watchlist,
                      verbose_eval=20, 
                      early_stopping_rounds=70)

    # Predict with the best iteration found by early stopping, not the last one.
    train_pred = xgb_model.predict(dtrain,ntree_limit=xgb_model.best_ntree_limit)
    val_pred = xgb_model.predict(dval,ntree_limit=xgb_model.best_ntree_limit)

    # Despite the names, these hold ROC AUC scores (higher is better).
    train_loss = roc_auc_score(dtrain.get_label(),train_pred)
    val_loss = roc_auc_score(dval.get_label(),val_pred)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {xgb_model.best_ntree_limit}')

    # Drop the references and let the garbage collector free the native
    # objects; calling __del__() by hand left `xgb_model` bound to a freed booster.
    del dval, xgb_model
    gc.collect()
    

[0]	train-auc:0.961706	valid-auc:0.952353
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 70 rounds.
[20]	train-auc:0.973846	valid-auc:0.964133
[40]	train-auc:0.977108	valid-auc:0.966103
[60]	train-auc:0.981407	valid-auc:0.96793
[80]	train-auc:0.983716	valid-auc:0.968611
[100]	train-auc:0.985235	valid-auc:0.969131
[120]	train-auc:0.986866	valid-auc:0.969194
[140]	train-auc:0.988198	valid-auc:0.969369
[160]	train-auc:0.989261	valid-auc:0.969275
[180]	train-auc:0.990185	valid-auc:0.969167
[200]	train-auc:0.990998	valid-auc:0.969031
Stopping. Best iteration:
[137]	train-auc:0.988004	valid-auc:0.969374

Train AUC: 0.9880038792179244. Val AUC: 0.9693738584339497. Best ite: 138
[0]	train-auc:0.961706	valid-auc:0.952633
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 70 rounds.
[20]	train-auc:0.973846	valid-auc:0.964361
[40]	tr

# Hyperparameter tuning

In [None]:
# Columns removed from the feature frames (target plus `ip` and `day`).
cols_to_drop = ['is_attributed', 'ip', 'day']

# Fresh training sample for the hyperparameter search.
(train_df, y_train) = get_train()

# A single validation file is requested for the search.
val_name = 'data/validation/val3.feather'
(val_df, y_val) = get_val_by_name(val_name)

In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Per-trial history appended to by score(): validation AUC and best iteration.
val_losses, ites = [], []

# Build the XGBoost matrices once so every trial reuses them.
dtrain = xgb.DMatrix(train_df, y_train)
dval = xgb.DMatrix(val_df, y_val)

# The pandas objects are no longer needed once the DMatrix copies exist.
del train_df, y_train, val_df, y_val
gc.collect()

def score(params):
    """hyperopt objective: train XGBoost with `params` and report validation AUC.

    Trains on the module-level `dtrain` with early stopping on `dval`, records
    the validation AUC in `val_losses` and the best iteration in `ites`, and
    prints both train and validation AUC.

    Parameters
    ----------
    params : dict of XGBoost parameters sampled from the search space.

    Returns
    -------
    dict with 'loss' set to the negated validation AUC (hyperopt minimises the
    loss, while a higher AUC is better) and 'status' set to STATUS_OK.
    """
    print("Training with params: ")
    print(params)

    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    xgb_model = xgb.train(params, dtrain, 2000, watchlist,
                      verbose_eval=False, 
                      early_stopping_rounds=100)

    best_ntree = xgb_model.best_ntree_limit
    train_pred = xgb_model.predict(dtrain,ntree_limit=best_ntree)
    val_pred = xgb_model.predict(dval,ntree_limit=best_ntree)

    # Take the labels from the DMatrix objects: the y_train / y_val Series are
    # deleted right after the matrices are built, so they are not available here.
    train_auc = roc_auc_score(dtrain.get_label(),train_pred)
    val_auc = roc_auc_score(dval.get_label(),val_pred)
    val_losses.append(val_auc)
    ites.append(best_ntree)
    print(f'Train AUC: {train_auc}. Val AUC: {val_auc}. Best ite: {best_ntree}')

    # Only the per-trial model is released. `dval` is shared by all trials and
    # must stay alive (a `del dval` here would also turn it into a local name
    # and make the watchlist line above raise UnboundLocalError).
    del xgb_model
    gc.collect()

    # fmin minimises 'loss', so return the negated AUC to maximise AUC.
    return {'loss': -val_auc, 'status': STATUS_OK}

def optimize(space,max_evals=5,trials=None):
    """Search `space` with hyperopt's TPE algorithm using `score` as objective.

    Parameters
    ----------
    space : hyperopt search space (dict of `hp.*` expressions and constants).
    max_evals : number of trials to run.
    trials : optional hyperopt `Trials` object to collect per-trial results;
        when None, fmin creates its own.

    Returns
    -------
    The dict returned by `fmin` describing the best trial found.
    """
    best = fmin(score, space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    return best

In [None]:

from hyperopt import space_eval

# Search space for the TPE optimiser. Entries that are plain values are held
# fixed for every trial; `max_depth` is not listed, so XGBoost's default applies.
space = {
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    # NOTE(review): `max_leaf_nodes` looks like the scikit-learn parameter name;
    # XGBoost's own parameter is `max_leaves` - confirm this key has any effect.
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(100,200, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(100,300, dtype=int)),
    'scale_pos_weight': hp.choice('scale_pos_weight', np.arange(100,200, dtype=int)),
    'learning_rate': 0.2,
    'eval_metric': 'auc', 
    'objective': 'binary:logistic', 
    'seed': seed,'tree_method':'gpu_hist'
}
best_hyperparams = optimize(space,max_evals=400)

# fmin reports hp.choice dimensions as indices into their option lists, not as
# the chosen values; space_eval maps the result back onto real parameter values.
best_params = space_eval(space, best_hyperparams)
print("The best hyperparameters are: ")
print(best_params)