In [2]:
import pandas as pd
import numpy as np
import pyarrow.feather as feather

import sklearn
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_recall_curve, accuracy_score

import xgboost as xgb
import optuna

import random

In [3]:
# Read the Feather file into `clf_df`; the path is relative to the kernel's
# current working directory, so the notebook must be run from its own folder.
clf_df = feather.read_feather('../clf_df.feather')

In [4]:
# Sanity check: (n_rows, n_columns) of the loaded table.
clf_df.shape

(1989, 35)

In [3]:
# Preview the first five rows to eyeball the feature and target columns.
clf_df.head()

Unnamed: 0,a_axis,b_axis,c_axis,Nominal_diameter,Weight,Elongation,Platyness,Sphericity,pebble_PC1,pebble_PC2,...,TipoStartRun_rapid,TipoStartSteep_poolzone,X_Start,Y_Start,X_Start_WGS84,Y_Start_WGS84,Event,graph_dist,graph_vel,domain_type
0,90.0,50.0,40.0,56.462162,300.0,0.555556,0.444444,0.627357,-33.106005,16.98043,...,1,0,9.426893,45.874324,533130.061975,5080173.0,1.0,0.0,0.0,Run_rapid
1,90.0,50.0,40.0,56.462162,300.0,0.555556,0.444444,0.627357,-33.106005,16.98043,...,1,0,9.426893,45.874324,533130.061975,5080173.0,2.0,0.0,0.0,Run_rapid
2,90.0,50.0,40.0,56.462162,300.0,0.555556,0.444444,0.627357,-33.106005,16.98043,...,1,0,9.426893,45.874324,533130.061975,5080173.0,3.0,0.0,0.0,Run_rapid
3,90.0,50.0,40.0,56.462162,300.0,0.555556,0.444444,0.627357,-33.106005,16.98043,...,1,0,9.426893,45.874324,533130.061975,5080173.0,4.0,0.0,0.0,Run_rapid
4,90.0,50.0,40.0,56.462162,300.0,0.555556,0.444444,0.627357,-33.106005,16.98043,...,1,0,9.426893,45.874324,533130.061975,5080173.0,5.0,0.0,0.0,Run_rapid


In [4]:
# List every available column; the model features are picked from these below.
clf_df.columns

Index(['a_axis', 'b_axis', 'c_axis', 'Nominal_diameter', 'Weight',
       'Elongation', 'Platyness', 'Sphericity', 'pebble_PC1', 'pebble_PC2',
       'is_stuck_1', 'is_stuck_2', 'is_stuck_3', 'mean_h', 'n_o_s', 'mean_Q',
       'weather_PC1', 'weather_PC2', 'cl_', 'duration', 'TipoStartBanks',
       'TipoStartBars_sedimentbuildupzones', 'TipoStartCascade',
       'TipoStartPlanebed', 'TipoStartPools', 'TipoStartRun_rapid',
       'TipoStartSteep_poolzone', 'X_Start', 'Y_Start', 'X_Start_WGS84',
       'Y_Start_WGS84', 'Event', 'graph_dist', 'graph_vel', 'domain_type'],
      dtype='object')

In [5]:
# Columns of `clf_df` used as model features, in the order fed to the classifier.
X_columns = [
    # start position
    'X_Start',
    'Y_Start',
    # pebble / weather component scores
    'pebble_PC1',
    'pebble_PC2',
    'weather_PC1',
    'weather_PC2',
    # TipoStart* indicator columns (0/1 in the preview above)
    'TipoStartBanks',
    'TipoStartBars_sedimentbuildupzones',
    'TipoStartCascade',
    'TipoStartPlanebed',
    'TipoStartPools',
    'TipoStartRun_rapid',
    'TipoStartSteep_poolzone',
]

In [6]:
# Binary target stored as float: 1.0 where `graph_vel` is strictly positive, else 0.0.
y = clf_df['graph_vel'].gt(0).astype('float')

# Feature matrix restricted to the selected columns.
X = clf_df[X_columns]

In [15]:
%%capture
def _accuracy_at_best_f1_threshold(y_true, positive_proba):
    """Accuracy obtained when thresholding at the F1-maximising cut-off.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels.
    positive_proba : np.ndarray
        Predicted probability of the positive class, aligned with ``y_true``.

    Returns
    -------
    float
        Accuracy of ``positive_proba >= threshold`` where ``threshold`` is the
        precision-recall-curve threshold with the highest F1 score.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, positive_proba)

    # precision/recall have one more entry than thresholds: the trailing
    # (precision=1, recall=0) point has no threshold, so drop it to keep the
    # F1 array index-aligned with `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    # Where precision and recall are both 0 (e.g. the top-scored sample is a
    # negative) the plain formula yields 0/0 = NaN, and np.argmax would then
    # return the NaN position instead of the real maximum. Define F1 as 0 there.
    denominator = precision + recall
    f1_scores = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    best_threshold = thresholds[np.argmax(f1_scores)]
    return accuracy_score(y_true, positive_proba >= best_threshold)


def objective(trial):
    """Optuna objective: mean out-of-fold accuracy of an XGBoost classifier.

    Hyperparameters are sampled from ``trial``; the model is then scored with
    seven repetitions of shuffled 5-fold stratified cross-validation on the
    notebook-level ``X`` and ``y``. In each repetition the decision threshold
    is the one that maximises F1 on the out-of-fold probabilities, and the
    accuracy at that threshold is recorded.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy over the seven CV repetitions (to be maximised).

    Notes
    -----
    The CV seeds are drawn from NumPy's global RNG, so every trial is scored
    on different splits and results are only reproducible if that RNG is
    seeded before the study starts. The threshold is tuned on the same
    out-of-fold predictions it is scored on, so the value is optimistic.
    """
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "lambda": trial.suggest_float("lambda", 1e-8, 100.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 12, step=1),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 20),
        "eta": trial.suggest_float("eta", 1e-8, 1.0, log=True),
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "n_estimators": 300,
        "eval_metric": "logloss",
    }

    model = xgb.XGBClassifier(**param, use_label_encoder=False)

    # Integer labels are needed by both the stratified splitter and the
    # metrics; cast once instead of on every repetition.
    y_int = y.astype('int')

    accuracies = []
    for seed in np.random.choice(int(1e6), 7, replace=False):
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        oof_proba = cross_val_predict(model, X, y_int, cv=kfold, method='predict_proba')
        accuracies.append(_accuracy_at_best_f1_threshold(y_int, oof_proba[:, 1]))

    return np.mean(accuracies)
    

array([35714,  6638, 33204, 11745,   998, 45733, 84479])

In [None]:
# Search for the hyperparameters that maximise the value returned by `objective`;
# the run stops after 200 trials or when the 3600 timeout is reached.
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=200,
    timeout=3600,
    show_progress_bar=True,
)

[32m[I 2021-06-12 12:18:57,623][0m A new study created in memory with name: no-name-1ec6f7dc-4bc4-4cdb-bc68-9fa0901a2359[0m
  self._init_valid()


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=200.0), HTML(value='')))

[32m[I 2021-06-12 12:19:09,146][0m Trial 0 finished with value: 0.7874021403433168 and parameters: {'lambda': 1.4508487285782899e-05, 'alpha': 1.2964200489787086e-07, 'subsample': 0.8918787394370227, 'colsample_bytree': 0.6404103945280892, 'max_depth': 3, 'min_child_weight': 3, 'eta': 0.0006027698617560454, 'gamma': 1.4834417341686035e-05, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7874021403433168.[0m
[32m[I 2021-06-12 12:19:22,245][0m Trial 1 finished with value: 0.8427063132945485 and parameters: {'lambda': 1.3833027894498259e-08, 'alpha': 0.021089168075554127, 'subsample': 0.3164472776444831, 'colsample_bytree': 0.31545965060992104, 'max_depth': 7, 'min_child_weight': 5, 'eta': 0.021918597084749206, 'gamma': 5.052999884643191e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value: 0.8427063132945485.[0m
[32m[I 2021-06-12 12:19:30,628][0m Trial 2 finished with value: 0.7795733678086619 and parameters: {'lambda': 2.9129493213200728e-06, 'alpha': 9.507487

[32m[I 2021-06-12 12:22:37,130][0m Trial 20 finished with value: 0.8225238813474106 and parameters: {'lambda': 2.7756003951431786e-07, 'alpha': 2.9952117323300574e-05, 'subsample': 0.20185035589046668, 'colsample_bytree': 0.6958497303096549, 'max_depth': 9, 'min_child_weight': 13, 'eta': 0.1477677374352248, 'gamma': 4.565759006143239e-08, 'grow_policy': 'lossguide'}. Best is trial 14 with value: 0.8600876247935071.[0m
[32m[I 2021-06-12 12:22:47,525][0m Trial 21 finished with value: 0.849673202614379 and parameters: {'lambda': 2.3359967516064687e-06, 'alpha': 3.687166731468963e-06, 'subsample': 0.5310620449754879, 'colsample_bytree': 0.43393940313335105, 'max_depth': 11, 'min_child_weight': 15, 'eta': 0.0923114569041753, 'gamma': 1.8431635430239502e-08, 'grow_policy': 'lossguide'}. Best is trial 14 with value: 0.8600876247935071.[0m
[32m[I 2021-06-12 12:22:58,227][0m Trial 22 finished with value: 0.8551317963082667 and parameters: {'lambda': 8.649201509019795e-08, 'alpha': 1.128

[32m[I 2021-06-12 12:26:30,698][0m Trial 40 finished with value: 0.802772391007685 and parameters: {'lambda': 0.00028003342821531803, 'alpha': 2.0208232479754145e-07, 'subsample': 0.7339392460221044, 'colsample_bytree': 0.5811102351672001, 'max_depth': 8, 'min_child_weight': 18, 'eta': 8.46390279919901e-06, 'gamma': 9.176787204453205e-07, 'grow_policy': 'depthwise'}. Best is trial 35 with value: 0.8613086260145082.[0m
[32m[I 2021-06-12 12:26:41,831][0m Trial 41 finished with value: 0.8593693887811533 and parameters: {'lambda': 1.738920685336083e-07, 'alpha': 2.643360473726602e-06, 'subsample': 0.7720511853526925, 'colsample_bytree': 0.4206655978544206, 'max_depth': 9, 'min_child_weight': 17, 'eta': 0.2270689349825941, 'gamma': 1.298784890730889e-07, 'grow_policy': 'lossguide'}. Best is trial 35 with value: 0.8613086260145082.[0m
[32m[I 2021-06-12 12:26:52,904][0m Trial 42 finished with value: 0.8593693887811533 and parameters: {'lambda': 1.6582531879030594e-07, 'alpha': 3.87820

[32m[I 2021-06-12 12:30:38,081][0m Trial 60 finished with value: 0.8572146807440923 and parameters: {'lambda': 0.004756045373451184, 'alpha': 1.0212811960646563e-08, 'subsample': 0.9606671809757664, 'colsample_bytree': 0.23899063178267632, 'max_depth': 5, 'min_child_weight': 6, 'eta': 0.6288613417474003, 'gamma': 0.0004684683754524571, 'grow_policy': 'depthwise'}. Best is trial 57 with value: 0.8686346333405159.[0m
[32m[I 2021-06-12 12:30:49,337][0m Trial 61 finished with value: 0.8618113912231561 and parameters: {'lambda': 0.7128404539367207, 'alpha': 9.057987276705907e-08, 'subsample': 0.9876427270578368, 'colsample_bytree': 0.2713982145536562, 'max_depth': 4, 'min_child_weight': 4, 'eta': 0.10149964403369265, 'gamma': 0.016723678031543707, 'grow_policy': 'depthwise'}. Best is trial 57 with value: 0.8686346333405159.[0m
[32m[I 2021-06-12 12:31:00,504][0m Trial 62 finished with value: 0.8612368024132729 and parameters: {'lambda': 2.5084141004787703, 'alpha': 7.29020931313725e-

[32m[I 2021-06-12 12:34:43,064][0m Trial 80 finished with value: 0.8549881491057961 and parameters: {'lambda': 0.0964975301286563, 'alpha': 3.373348513312464e-07, 'subsample': 0.845204181320494, 'colsample_bytree': 0.9984320196318145, 'max_depth': 6, 'min_child_weight': 3, 'eta': 0.7356291705185382, 'gamma': 0.15766477901364506, 'grow_policy': 'depthwise'}. Best is trial 57 with value: 0.8686346333405159.[0m
[32m[I 2021-06-12 12:34:57,257][0m Trial 81 finished with value: 0.8700711053652233 and parameters: {'lambda': 0.0035476789641711685, 'alpha': 6.713481339626618e-07, 'subsample': 0.8226403229831781, 'colsample_bytree': 0.9355901098493511, 'max_depth': 5, 'min_child_weight': 5, 'eta': 0.20526270982110284, 'gamma': 0.4352925941089278, 'grow_policy': 'depthwise'}. Best is trial 81 with value: 0.8700711053652233.[0m
[32m[I 2021-06-12 12:35:11,752][0m Trial 82 finished with value: 0.8681318681318684 and parameters: {'lambda': 0.02959830910393408, 'alpha': 4.9966569889798474e-08,

In [None]:
# Summarise the finished study: trial count, best objective value and its parameters.
trial = study.best_trial

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

{}