In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

from utils import *

import xgboost as xgb

seed=42
PATH = Path('data')
list(PATH.iterdir())

[PosixPath('data/submission'),
 PosixPath('data/train_day8_3to16_FE.feather'),
 PosixPath('data/sample_submission.csv'),
 PosixPath('data/train_day9_3to16_FE.feather'),
 PosixPath('data/train_sample.csv'),
 PosixPath('data/train_day7_3to16_FE.feather'),
 PosixPath('data/dtree.dot'),
 PosixPath('data/test_FE.feather'),
 PosixPath('data/val_idxs.p'),
 PosixPath('data/mean_enc_df'),
 PosixPath('data/validation')]

In [2]:
# def metric(rf,X_val,y_val):
#     y_val_pred = rf.predict_proba(X_val)[:,1]
#     return roc_auc_score(y_val,y_val_pred)

# def permutation_importances(rf,X_val,y_val,metric):
#     baseline = metric(rf,X_val,y_val)
#     imp=[]
#     for col in X_val.columns:
#         save = X_val[col].copy()
#         X_val[col] = np.random.permutation(X_val[col])
#         m = metric(rf,X_val,y_val)
#         print(f'Score after {col} perm: {m:.5f}')
#         X_val[col] = save
#         imp.append(baseline-m)
#     return np.array(imp)

def get_sample_timeseries(filename,sz):
    df = get_feather(filename)
    sample_idx =np.random.permutation(df.shape[0])
    sample_idx=sorted(sample_idx[:sz])
    df = df.loc[sample_idx,:].reset_index().drop('index',axis=1)
    gc.collect()
    return df

def prediction_score(rf,train_df,y_train,val_df,y_val):
    y_train_pred = rf.predict_proba(train_df)[:,1]
    print(f'Train AUC: {roc_auc_score(y_train, y_train_pred)}')
    y_val_pred = rf.predict_proba(val_df)[:,1]
    val_auc = roc_auc_score(y_val, y_val_pred)
    print(f'Val AUC: {roc_auc_score(y_val, y_val_pred)}')
    return val_auc
def get_val_by_name(name):
    val_df = pd.read_feather(name)
    y_val = val_df.is_attributed
    val_df.drop(cols_to_drop,axis=1,inplace=True)
    gc.collect()
    return val_df,y_val
def get_train(sz=3000000):
    train_df=get_sample_timeseries('train_day8_3to16_FE.feather',sz)
    y_train = train_df.is_attributed
    train_df.drop(cols_to_drop,axis=1,inplace=True)
    gc.collect()
    return train_df,y_train

# Get train data

In [3]:
cols_to_drop=['is_attributed','ip','day']

In [4]:
val_names = [str(i) for i in list((PATH/'validation').iterdir())]
val_names

['data/validation/val3.feather',
 'data/validation/val2.feather',
 'data/validation/val0.feather',
 'data/validation/val1.feather']

In [5]:
train_df,y_train = get_train()

In [6]:
%%time

params = {'colsample_bytree': 0.7, 
          'eval_metric': 'auc', 
          'learning_rate': 0.1, 
          'max_depth': 4, 
          'min_child_weight': 100, 
          'objective': 'binary:logistic', 
          'seed': seed, 
          'subsample': 0.9500000000000001,
          'scale_pos_weight': 100,
          'tree_method': 'gpu_hist'}

dtrain = xgb.DMatrix(train_df,y_train)
del train_df,y_train
gc.collect()

for i in val_names:           
    val_df,y_val = get_val_by_name(i)
    dval = xgb.DMatrix(val_df,y_val)
    
    del val_df,y_val
    gc.collect()

    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    xgb_model = xgb.train(params, dtrain, 2000, watchlist,
                      verbose_eval=20, 
                      early_stopping_rounds=70)

    train_pred = xgb_model.predict(dtrain,ntree_limit=xgb_model.best_ntree_limit)
    val_pred = xgb_model.predict(dval,ntree_limit=xgb_model.best_ntree_limit)

    train_loss = roc_auc_score(dtrain.get_label(),train_pred)
    val_loss = roc_auc_score(dval.get_label(),val_pred)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {xgb_model.best_ntree_limit}')

    del dval
    xgb_model.__del__()
    gc.collect()
    

[0]	train-auc:0.961706	valid-auc:0.952353
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 70 rounds.
[20]	train-auc:0.973846	valid-auc:0.964133
[40]	train-auc:0.977108	valid-auc:0.966103
[60]	train-auc:0.981407	valid-auc:0.96793
[80]	train-auc:0.983716	valid-auc:0.968611
[100]	train-auc:0.985235	valid-auc:0.969131
[120]	train-auc:0.986866	valid-auc:0.969194
[140]	train-auc:0.988198	valid-auc:0.969369
[160]	train-auc:0.989261	valid-auc:0.969275
[180]	train-auc:0.990185	valid-auc:0.969167
[200]	train-auc:0.990998	valid-auc:0.969031
Stopping. Best iteration:
[137]	train-auc:0.988004	valid-auc:0.969374

Train AUC: 0.9880038792179244. Val AUC: 0.9693738584339497. Best ite: 138
[0]	train-auc:0.961706	valid-auc:0.952633
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 70 rounds.
[20]	train-auc:0.973846	valid-auc:0.964361
[40]	tr

# Hypertuning

In [3]:
cols_to_drop=['is_attributed','ip','day']
train_df,y_train = get_train(10000000)

In [5]:
train_df.shape
y_train.shape

(10000000, 32)

(10000000,)

In [4]:
val_name = 'data/validation/val3.feather'
val_df,y_val = get_val_by_name(val_name)

dtrain = xgb.DMatrix(train_df,y_train)
dval = xgb.DMatrix(val_df,y_val)

del train_df,y_train,val_df,y_val
gc.collect()

0

In [5]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

val_losses=[]
ites=[]



def score(params):
    print("Training with params: ")
    print(params)



    watchlist = [(dtrain, 'train'), (dval, 'valid')]
    xgb_model = xgb.train(params, dtrain, 2000, watchlist,
                      verbose_eval=False, 
                      early_stopping_rounds=100)

    train_pred = xgb_model.predict(dtrain,ntree_limit=xgb_model.best_ntree_limit)
    val_pred = xgb_model.predict(dval,ntree_limit=xgb_model.best_ntree_limit)

    train_loss = roc_auc_score(dtrain.get_label(),train_pred)
    val_loss = roc_auc_score(dval.get_label(),val_pred)
    val_losses.append(val_loss)
    ites.append(xgb_model.best_ntree_limit)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {xgb_model.best_ntree_limit}')

    xgb_model.__del__()

    gc.collect()
    
    
    return {'loss': val_loss, 'status': STATUS_OK}

def optimize(space,max_evals=5):
    
    best = fmin(score, space, algo=tpe.suggest, 
        # trials=trials, 
        max_evals=max_evals)
    return best

In [None]:

space = {
    #'n_estimators': hp.quniform('n_estimators', 50, 500, 5),
#     'max_depth': hp.choice('max_depth', np.arange(5, 10, dtype=int)),
    'subsample': hp.quniform('subsample', 0.5, .95, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, .95, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(100,200, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(100,300, dtype=int)),
    'scale_pos_weight': hp.choice('scale_pos_weight', np.arange(100,200, dtype=int)),
    'learning_rate': 0.2,
    'eval_metric': 'auc', 
    'objective': 'binary:logistic', 
    'seed': seed,'tree_method':'gpu_hist'
}
best_hyperparams = optimize(space,max_evals=400)
print("The best hyperparameters are: ")
print(best_hyperparams)

Training with params: 
{'colsample_bytree': 0.7000000000000001, 'eval_metric': 'auc', 'gamma': 0.4, 'learning_rate': 0.2, 'max_leaf_nodes': 169, 'min_child_weight': 258, 'objective': 'binary:logistic', 'scale_pos_weight': 142, 'seed': 42, 'subsample': 0.75, 'tree_method': 'gpu_hist'}
Train AUC: 0.9893705106663817. Val AUC: 0.9702714770404105. Best ite: 72
Training with params: 
{'colsample_bytree': 0.65, 'eval_metric': 'auc', 'gamma': 0.5, 'learning_rate': 0.2, 'max_leaf_nodes': 169, 'min_child_weight': 270, 'objective': 'binary:logistic', 'scale_pos_weight': 141, 'seed': 42, 'subsample': 0.6000000000000001, 'tree_method': 'gpu_hist'}
Train AUC: 0.9879651540210793. Val AUC: 0.9705380508058978. Best ite: 64
Training with params: 
{'colsample_bytree': 0.65, 'eval_metric': 'auc', 'gamma': 0.55, 'learning_rate': 0.2, 'max_leaf_nodes': 183, 'min_child_weight': 117, 'objective': 'binary:logistic', 'scale_pos_weight': 137, 'seed': 42, 'subsample': 0.75, 'tree_method': 'gpu_hist'}
Train AUC: 0

Train AUC: 0.9850566859320673. Val AUC: 0.9693352441381687. Best ite: 43
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'eval_metric': 'auc', 'gamma': 0.30000000000000004, 'learning_rate': 0.2, 'max_leaf_nodes': 185, 'min_child_weight': 245, 'objective': 'binary:logistic', 'scale_pos_weight': 184, 'seed': 42, 'subsample': 0.5, 'tree_method': 'gpu_hist'}
Train AUC: 0.9871681877289686. Val AUC: 0.9694216440140913. Best ite: 53
Training with params: 
{'colsample_bytree': 0.8, 'eval_metric': 'auc', 'gamma': 0.30000000000000004, 'learning_rate': 0.2, 'max_leaf_nodes': 141, 'min_child_weight': 171, 'objective': 'binary:logistic', 'scale_pos_weight': 109, 'seed': 42, 'subsample': 0.55, 'tree_method': 'gpu_hist'}
Train AUC: 0.9882772147255582. Val AUC: 0.9698041951822602. Best ite: 68
Training with params: 
{'colsample_bytree': 0.8, 'eval_metric': 'auc', 'gamma': 0.30000000000000004, 'learning_rate': 0.2, 'max_leaf_nodes': 112, 'min_child_weight': 171, 'objective': 'binary:log

Train AUC: 0.9874957265264901. Val AUC: 0.9699226781846803. Best ite: 54
Training with params: 
{'colsample_bytree': 0.6000000000000001, 'eval_metric': 'auc', 'gamma': 0.6000000000000001, 'learning_rate': 0.2, 'max_leaf_nodes': 181, 'min_child_weight': 141, 'objective': 'binary:logistic', 'scale_pos_weight': 138, 'seed': 42, 'subsample': 0.55, 'tree_method': 'gpu_hist'}
Train AUC: 0.9857634498051464. Val AUC: 0.9691097296084852. Best ite: 45
Training with params: 
{'colsample_bytree': 0.6000000000000001, 'eval_metric': 'auc', 'gamma': 0.6000000000000001, 'learning_rate': 0.2, 'max_leaf_nodes': 124, 'min_child_weight': 244, 'objective': 'binary:logistic', 'scale_pos_weight': 138, 'seed': 42, 'subsample': 0.55, 'tree_method': 'gpu_hist'}
Train AUC: 0.9846998863466102. Val AUC: 0.9699073492469809. Best ite: 41
Training with params: 
{'colsample_bytree': 0.6000000000000001, 'eval_metric': 'auc', 'gamma': 0.75, 'learning_rate': 0.2, 'max_leaf_nodes': 181, 'min_child_weight': 136, 'objective

Train AUC: 0.9866883037700453. Val AUC: 0.9694889056570044. Best ite: 53
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'eval_metric': 'auc', 'gamma': 0.15000000000000002, 'learning_rate': 0.2, 'max_leaf_nodes': 151, 'min_child_weight': 172, 'objective': 'binary:logistic', 'scale_pos_weight': 186, 'seed': 42, 'subsample': 0.5, 'tree_method': 'gpu_hist'}
Train AUC: 0.9871351402991793. Val AUC: 0.9693193388287054. Best ite: 52
Training with params: 
{'colsample_bytree': 0.9, 'eval_metric': 'auc', 'gamma': 0.4, 'learning_rate': 0.2, 'max_leaf_nodes': 173, 'min_child_weight': 165, 'objective': 'binary:logistic', 'scale_pos_weight': 193, 'seed': 42, 'subsample': 0.55, 'tree_method': 'gpu_hist'}
Train AUC: 0.9860562880242311. Val AUC: 0.9698964314611446. Best ite: 42
Training with params: 
{'colsample_bytree': 0.8, 'eval_metric': 'auc', 'gamma': 0.30000000000000004, 'learning_rate': 0.2, 'max_leaf_nodes': 122, 'min_child_weight': 107, 'objective': 'binary:logistic', 'scale_p

Train AUC: 0.9847491273521756. Val AUC: 0.970066619672336. Best ite: 40
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'eval_metric': 'auc', 'gamma': 0.8, 'learning_rate': 0.2, 'max_leaf_nodes': 101, 'min_child_weight': 107, 'objective': 'binary:logistic', 'scale_pos_weight': 194, 'seed': 42, 'subsample': 0.65, 'tree_method': 'gpu_hist'}
Train AUC: 0.987970805408512. Val AUC: 0.9695520696293387. Best ite: 52
Training with params: 
{'colsample_bytree': 0.9500000000000001, 'eval_metric': 'auc', 'gamma': 0.35000000000000003, 'learning_rate': 0.2, 'max_leaf_nodes': 113, 'min_child_weight': 214, 'objective': 'binary:logistic', 'scale_pos_weight': 155, 'seed': 42, 'subsample': 0.6000000000000001, 'tree_method': 'gpu_hist'}
Train AUC: 0.9860992467317702. Val AUC: 0.9698290429411993. Best ite: 44
Training with params: 
{'colsample_bytree': 0.9, 'eval_metric': 'auc', 'gamma': 0.5, 'learning_rate': 0.2, 'max_leaf_nodes': 170, 'min_child_weight': 205, 'objective': 'binary:logisti

Train AUC: 0.9868405302501027. Val AUC: 0.9701818117666842. Best ite: 46
Training with params: 
{'colsample_bytree': 0.8, 'eval_metric': 'auc', 'gamma': 0.2, 'learning_rate': 0.2, 'max_leaf_nodes': 144, 'min_child_weight': 113, 'objective': 'binary:logistic', 'scale_pos_weight': 102, 'seed': 42, 'subsample': 0.55, 'tree_method': 'gpu_hist'}
Train AUC: 0.9870196884403905. Val AUC: 0.9692667181326211. Best ite: 53
Training with params: 
{'colsample_bytree': 0.75, 'eval_metric': 'auc', 'gamma': 0.25, 'learning_rate': 0.2, 'max_leaf_nodes': 167, 'min_child_weight': 150, 'objective': 'binary:logistic', 'scale_pos_weight': 133, 'seed': 42, 'subsample': 0.8, 'tree_method': 'gpu_hist'}
Train AUC: 0.9906220417193558. Val AUC: 0.9705649106438685. Best ite: 76
Training with params: 
{'colsample_bytree': 0.8500000000000001, 'eval_metric': 'auc', 'gamma': 0.1, 'learning_rate': 0.2, 'max_leaf_nodes': 136, 'min_child_weight': 131, 'objective': 'binary:logistic', 'scale_pos_weight': 187, 'seed': 42, '