In [84]:
import xgboost as xgb
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import KFold
import optuna
import time

In [72]:
def data_import(dataName):
    filename = 'https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/neuroblastoma-data-master/data/'+dataname+'/'
    inputFileName = filename+'inputs.csv'
    labelFileName = filename+'outputs.csv'
    foldsFileName = filename+'cv/equal_labels/folds.csv'
    inputs        = pd.read_csv(inputFileName,index_col='sequenceID')
    labels        = pd.read_csv(labelFileName,index_col='sequenceID')
    folds         = pd.read_csv(foldsFileName,index_col='sequenceID')
    res           = {}
    res['inputs']   = inputs
    res['labels']    = labels
    res['folds']     = folds
    return(res)

In [73]:
def data_massage(inputs,labels):
    inputs.replace([-float('inf'),float('inf')],np.nan,inplace=True)
    missingCols = inputs.isnull().sum()
    missingCols = list(missingCols[missingCols>0].index)
    inputs.drop(missingCols,axis=1,inplace=True)
    varCols     = inputs.apply(lambda x: np.var(x))
    zeroVarCols = list(varCols[varCols==0].index)
    inputs.drop(zeroVarCols,axis=1,inplace=True)
    labels['min.log.lambda'] = labels['min.log.lambda'].apply(lambda x: np.exp(x))
    labels['max.log.lambda'] = labels['max.log.lambda'].apply(lambda x: np.exp(x))
    return inputs,labels

In [74]:
def getXY(foldNo,folds,inputs,labels):
    test_id       = list(folds[folds['fold']==foldNo].index)
    train_id      = list(folds[folds['fold']!=foldNo].index)
    X             = inputs[inputs.index.isin(train_id)]
    X_val         = inputs[inputs.index.isin(test_id)]
    y_label       = labels[labels.index.isin(train_id)]
    y_label_test  = labels[labels.index.isin(test_id)]
    y_lower       = y_label['min.log.lambda']
    y_upper       = y_label['max.log.lambda']
    y_lower_val   = y_label_test['min.log.lambda']
    y_upper_val   = y_label_test['max.log.lambda']
    res           = {}
    res['X']         = X
    res['X_val']     = X_val
    res['y_lower']      = y_lower
    res['y_lower_val']  = y_lower_val
    res['y_upper']      = y_upper
    res['y_upper_val']  = y_upper_val
    return res

In [75]:
def trainModel(X,X_val,y_lower,y_upper,y_lower_val,y_upper_val,params,num_round,distributionCol):
    res    = {}
    dtrain = xgb.DMatrix(X)
    dtrain.set_float_info("label_lower_bound",y_lower)
    dtrain.set_float_info("label_upper_bound",y_upper)

    dtest  = xgb.DMatrix(X_val)
    dtest.set_float_info("label_lower_bound",y_lower_val)
    dtest.set_float_info("label_upper_bound",y_upper_val)

    bst    = xgb.train(params,dtrain,num_boost_round=num_round,evals=[(dtrain,"train"),(dtest,"test")],evals_result=res)
    min_val_error = round(np.min(res['test'][distributionCol]),4)
    return(min_val_error)

In [76]:
dataName = 'ATAC_JV_adipose'

In [99]:
def objective(trial):
    
    folds     = KFold(n_splits=5,shuffle=True,random_state=SEED)
    num_round    = 5000
    res          = 0
    # Discrete-uniform parameter
    eta              = trial.suggest_discrete_uniform('eta',0.001,1.001,0.1)
    max_depth        = trial.suggest_discrete_uniform('max_depth',2, 10,2)
    min_child_weight = trial.suggest_discrete_uniform('min_child_weight',0.1,100.1,10)
    reg_alpha        = trial.suggest_loguniform('reg_alpha',0.0001,100)
    reg_lambda       = trial.suggest_loguniform('reg_lambda',0.0001,100)
    sigma            = trial.suggest_discrete_uniform('sigma',1,100,1)
    distribution     = trial.suggest_categorical('distribution',['normal','logistic','extreme'])
    
    distributionCol = distribution+ ',' + str(sigma)
    eval_metric     = 'aft-nloglik@'+distributionCol
    base_score      = 0.5
    
    params   = {
                'eta':eta,
                'max_depth':int(max_depth),
                'min_child_weight':min_child_weight,
                'subsample':0.7,
                'reg_alpha':reg_alpha,
                'reg_lambda':reg_lambda,
                'aft_noise_distribution' : distribution, 
                'aft_sigma': sigma,
                'eval_metric':eval_metric,
                'base_score':base_score,
                'objective':"aft:survival",
                'verbosity': 0
                }
    
    for fold_, (trn_idx, val_idx) in enumerate(Kfolds.split(X, y_lower,y_upper)):
        tr_x, tr_y_lower,tr_y_upper = X.iloc[trn_idx,:],y_lower.iloc[trn_idx],y_upper.iloc[trn_idx]
        vl_x, vl_y_lower,vl_y_upper = X.iloc[val_idx,:], y_lower.iloc[val_idx],y_upper.iloc[val_idx]
        res = res + trainModel(tr_x,vl_x,tr_y_lower,tr_y_upper,vl_y_lower,vl_y_upper,params,num_round,distributionCol)
    return res

In [96]:
data = data_import(dataName)

In [79]:
inputs = data['inputs']
labels = data['labels']
folds  = data['folds']

In [80]:
inputs,labels = data_massage(inputs,labels)

In [81]:
res = getXY(1,folds,inputs,labels)

In [82]:
X            = res['X']        
X_val        = res['X_val']
y_lower      = res['y_lower']
y_lower_val  = res['y_lower_val']
y_upper      = res['y_upper']
y_upper_val  = res['y_upper_val']

In [83]:
study = optuna.create_study()

In [None]:
start = time.time()
study.optimize(objective, n_trials=5)

[0]	train-extreme,16.0:3.50397	test-extreme,16.0:3.60609
[1]	train-extreme,16.0:3.43953	test-extreme,16.0:3.54628
[2]	train-extreme,16.0:3.40391	test-extreme,16.0:3.51408
[3]	train-extreme,16.0:3.38334	test-extreme,16.0:3.49610
[4]	train-extreme,16.0:3.37313	test-extreme,16.0:3.48755
[5]	train-extreme,16.0:3.36569	test-extreme,16.0:3.48164
[6]	train-extreme,16.0:3.36161	test-extreme,16.0:3.47861
[7]	train-extreme,16.0:3.35872	test-extreme,16.0:3.47667
[8]	train-extreme,16.0:3.35683	test-extreme,16.0:3.47557
[9]	train-extreme,16.0:3.35586	test-extreme,16.0:3.47510
[10]	train-extreme,16.0:3.35511	test-extreme,16.0:3.47484
[11]	train-extreme,16.0:3.35464	test-extreme,16.0:3.47477
[12]	train-extreme,16.0:3.35440	test-extreme,16.0:3.47478
[13]	train-extreme,16.0:3.35419	test-extreme,16.0:3.47485
[14]	train-extreme,16.0:3.35410	test-extreme,16.0:3.47492
[15]	train-extreme,16.0:3.35404	test-extreme,16.0:3.47497
[16]	train-extreme,16.0:3.35398	test-extreme,16.0:3.47508
[17]	train-extreme,16.0:

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[63]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[64]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[65]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[66]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[67]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[68]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[69]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[70]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[71]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[72]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[73]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[74]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[75]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[76]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[77]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[78]	train-logistic,66.0:5.09857	test-logistic,66.0:5.27920
[79]	train-logistic,66.0:5.09857	test-lo

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1150]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1151]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1152]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1153]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1154]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1155]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1156]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1157]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1158]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1159]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1160]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1161]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1162]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1163]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1164]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1165]	train-logistic,66.0:5.11729	test-logistic,66.0:5.20435
[1166]	t

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1879]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1880]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1881]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1882]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1883]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1884]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1885]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1886]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1887]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1888]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1889]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1890]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1891]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1892]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1893]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1894]	train-logistic,66.0:5.19613	test-logistic,66.0:4.88899
[1895]	t

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[2288]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2289]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2290]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2291]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2292]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2293]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2294]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2295]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2296]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2297]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2298]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2299]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2300]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2301]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2302]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2303]	train-logistic,66.0:5.16339	test-logistic,66.0:5.01995
[2304]	t

[32m[I 2019-11-02 20:49:10,473][0m Finished trial#33 resulted in value: 25.6736. Current best value is 14.6488 with parameters: {'eta': 0.6010000000000001, 'max_depth': 8.0, 'min_child_weight': 90.1, 'reg_alpha': 0.053088687930935234, 'reg_lambda': 0.12235874099625975, 'sigma': 11.0, 'distribution': 'normal'}.[0m


[0]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[1]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[2]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[3]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[4]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[5]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[6]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[7]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[8]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[9]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[10]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[11]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[12]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[13]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[14]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[15]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[16]	train-extreme,90.0:5.03544	test-extreme,90.0:5.20558
[17]	train-extreme,90.0:

In [101]:
trial = study.best_trial

print('Value: {}'.format(trial.value))
print('Params: ')

for key, value in trial.params.items():
    print('{}: {}'.format(key, value))

Value: 14.6488
Params: 
eta: 0.6010000000000001
max_depth: 8.0
min_child_weight: 90.1
reg_alpha: 0.053088687930935234
reg_lambda: 0.12235874099625975
sigma: 11.0
distribution: normal
