In [1]:
import xgboost as xgb
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import KFold
import optuna
import time
import json
from optuna.samplers import TPESampler
import functools
import itertools

In [2]:
def data_import(data_name, data_root='../../../../../../data/'):
    """Load the feature, label and fold-assignment tables of one dataset.

    Parameters
    ----------
    data_name : str
        Name of the dataset sub-directory below ``data_root``.
    data_root : str, optional
        Directory that contains the dataset sub-directories. Defaults to the
        relative location this notebook has always used, so existing calls
        behave exactly as before.

    Returns
    -------
    dict
        ``{'inputs': ..., 'labels': ..., 'folds': ...}`` -- three DataFrames
        read from ``inputs.csv``, ``outputs.csv`` and
        ``cv/equal_labels/folds.csv`` respectively, each indexed by the
        ``sequenceID`` column.
    """
    # Accept the root with or without a trailing slash.
    root        = data_root if data_root.endswith('/') else data_root + '/'
    dataset_dir = root + data_name + '/'
    csv_paths   = {
        'inputs': dataset_dir + 'inputs.csv',
        'labels': dataset_dir + 'outputs.csv',
        'folds':  dataset_dir + 'cv/equal_labels/folds.csv',
    }
    return {key: pd.read_csv(path, index_col='sequenceID')
            for key, path in csv_paths.items()}

In [3]:
def data_massage(inputs,labels):
    """Clean the feature table and exponentiate the label bounds.

    Unlike a purely in-place implementation, this function leaves the frames
    passed in untouched and returns new ones, so calling it again on the same
    raw data gives the same result instead of applying ``exp`` twice.

    Parameters
    ----------
    inputs : pandas.DataFrame
        Feature table (one column per feature).
    labels : pandas.DataFrame
        Must contain the columns ``'min.log.lambda'`` and ``'max.log.lambda'``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``inputs`` without any column that contained +/-inf or a missing value
        and without any zero-variance column, and a copy of ``labels`` whose
        two bound columns have been replaced by their exponential.
    """
    # Treat +/-inf like missing values; replace() returns a new frame.
    cleaned   = inputs.replace([-float('inf'),float('inf')],np.nan)
    na_counts = cleaned.isnull().sum()
    cleaned   = cleaned.drop(columns=list(na_counts[na_counts>0].index))

    # Constant features carry no information for the model.
    variances = cleaned.apply(lambda col: np.var(col))
    cleaned   = cleaned.drop(columns=list(variances[variances==0].index))

    # The columns keep their 'log' names but hold exp(log-lambda) afterwards.
    bounds = labels.copy()
    for bound_col in ('min.log.lambda','max.log.lambda'):
        bounds[bound_col] = np.exp(bounds[bound_col])
    return cleaned,bounds

In [4]:
def getXY(foldNo,folds,inputs,labels):
    """Split features and label bounds into train / held-out parts for one fold.

    Rows whose ``folds['fold']`` equals ``foldNo`` form the held-out part;
    all remaining rows form the training part. Rows are matched through the
    index of each frame.

    Returns a dict with the keys ``'X'``, ``'X_val'``, ``'y_lower'``,
    ``'y_lower_val'``, ``'y_upper'`` and ``'y_upper_val'``; the lower/upper
    series come from the ``'min.log.lambda'`` / ``'max.log.lambda'`` columns
    of ``labels``.
    """
    held_out_ids = folds.loc[folds['fold']==foldNo].index.tolist()
    training_ids = folds.loc[folds['fold']!=foldNo].index.tolist()

    train_labels    = labels.loc[labels.index.isin(training_ids)]
    held_out_labels = labels.loc[labels.index.isin(held_out_ids)]

    return {
        'X':           inputs.loc[inputs.index.isin(training_ids)],
        'X_val':       inputs.loc[inputs.index.isin(held_out_ids)],
        'y_lower':     train_labels['min.log.lambda'],
        'y_lower_val': held_out_labels['min.log.lambda'],
        'y_upper':     train_labels['max.log.lambda'],
        'y_upper_val': held_out_labels['max.log.lambda'],
    }

In [5]:
def trainModel(X,X_val,y_lower,y_upper,y_lower_val,y_upper_val,params,num_round,distributionCol):
    """Train one booster and return its best validation metric.

    The model is fitted on ``X`` with interval labels
    ``[y_lower, y_upper]`` and evaluated after every round on ``X_val``
    with ``[y_lower_val, y_upper_val]``.

    Returns the smallest value recorded under
    ``evals_result['test'][distributionCol]``, rounded to 4 decimals.
    """
    def _interval_dmatrix(features, lower, upper):
        # Attach the interval bounds of each row to the DMatrix.
        matrix = xgb.DMatrix(features)
        matrix.set_float_info("label_lower_bound",lower.values)
        matrix.set_float_info("label_upper_bound",upper.values)
        return matrix

    train_matrix = _interval_dmatrix(X,y_lower,y_upper)
    valid_matrix = _interval_dmatrix(X_val,y_lower_val,y_upper_val)

    history = {}
    xgb.train(params,
              train_matrix,
              num_boost_round=num_round,
              evals=[(train_matrix,"train"),(valid_matrix,"test")],
              evals_result=history,
              verbose_eval=False)

    validation_curve = history['test'][distributionCol]
    return round(np.min(validation_curve),4)

In [6]:
def objective(distribution,trial):
    """Optuna objective: summed 5-fold validation metric for one trial.

    Reads the training data from the module-level names ``X``, ``y_lower``
    and ``y_upper``, which the calling cell must have bound beforehand.

    Parameters
    ----------
    distribution : str
        Value passed to XGBoost as ``aft_noise_distribution``.
    trial : optuna trial
        Supplies the ``eta`` and ``max_depth`` suggestions.

    Returns
    -------
    The sum over the 5 inner folds of the value returned by ``trainModel``.
    """
    # Only eta and max_depth are searched; the remaining regularisation
    # parameters are held fixed.
    eta       = trial.suggest_discrete_uniform('eta',0.001,1.001,0.1)
    max_depth = trial.suggest_discrete_uniform('max_depth',2, 10,2)

    min_child_weight = 0.1
    reg_alpha        = 0.005
    reg_lambda       = 0.5

    # sigma is 1 for 'normal' and 'logistic', 10 for every other distribution.
    sigma = 1 if distribution in ['normal','logistic'] else 10

    distribution_sigma = distribution+ ',' + str(sigma)

    params = dict([
        ('eta',                    eta),
        ('max_depth',              int(max_depth)),
        ('min_child_weight',       min_child_weight),
        ('subsample',              0.7),
        ('reg_alpha',              reg_alpha),
        ('reg_lambda',             reg_lambda),
        ('aft_noise_distribution', distribution),
        ('aft_sigma',              sigma),
        ('eval_metric',            'aft-nloglik@'+distribution_sigma),
        ('base_score',             0.5),
        ('objective',              "aft:survival"),
        ('verbosity',              0),
    ])

    num_round = 5000
    splitter  = KFold(n_splits=5,shuffle=True,random_state=1)

    total = 0
    for train_rows, valid_rows in splitter.split(X, y_lower,y_upper):
        fold_error = trainModel(X.iloc[train_rows,:],
                                X.iloc[valid_rows,:],
                                y_lower.iloc[train_rows],
                                y_upper.iloc[train_rows],
                                y_lower.iloc[valid_rows],
                                y_upper.iloc[valid_rows],
                                params,num_round,distribution_sigma)
        total = total + fold_error
    return total

In [7]:
def best_iter(eta,max_depth,min_child_weight,reg_alpha,reg_lambda,sigma,distribution): 
    """Find the boosting iteration with the lowest summed 5-fold validation metric.

    Reads the training data from the module-level names ``X``, ``y_lower``
    and ``y_upper``, which the calling cell must have bound beforehand.

    Parameters
    ----------
    eta, max_depth, min_child_weight, reg_alpha, reg_lambda
        Passed through to XGBoost (``max_depth`` is cast to ``int``).
    sigma
        Passed to XGBoost as ``aft_sigma`` and used in the metric name.
    distribution : str
        Passed to XGBoost as ``aft_noise_distribution``.

    Returns
    -------
    dict
        ``'num_round'``: index label of the row with the smallest summed
        metric (0-based, because the per-fold curves are stored under a
        default integer index). ``'min_val_error'``: that smallest sum.
        If no iteration has a valid (non-NaN) sum, ``num_round`` is 0 and
        ``min_val_error`` is ``inf``.

    NOTE(review): if ``num_round`` is later used as ``num_boost_round``, the
    0-based index is one less than the number of rounds -- confirm with the
    consumer of this value.
    """
    SEED          = 1
    Kfolds        = KFold(n_splits=5,shuffle=True,random_state=SEED)
    num_round     = 5000
    distributionCol = distribution+ ',' + str(sigma)
    eval_metric     = 'aft-nloglik@'+distributionCol
    base_score      = 0.5
    
    params   = {
                'eta':eta,
                'max_depth':int(max_depth),
                'min_child_weight':min_child_weight,
                'subsample':0.7,
                'reg_alpha':reg_alpha,
                'reg_lambda':reg_lambda,
                'aft_noise_distribution' : distribution, 
                'aft_sigma': sigma,
                'eval_metric':eval_metric,
                'base_score':base_score,
                'objective':"aft:survival",
                'verbosity': 0
                }

    # One validation curve per inner fold.
    fold_curves = {}
    for fold_, (trn_idx, val_idx) in enumerate(Kfolds.split(X, y_lower,y_upper)):
        tr_x, tr_y_lower,tr_y_upper = X.iloc[trn_idx,:],y_lower.iloc[trn_idx],y_upper.iloc[trn_idx]
        vl_x, vl_y_lower,vl_y_upper = X.iloc[val_idx,:], y_lower.iloc[val_idx],y_upper.iloc[val_idx]
        fold_curves[fold_] = trainModelIter(tr_x,vl_x,tr_y_lower,tr_y_upper,vl_y_lower,vl_y_upper,params,num_round,distributionCol)
    res_data = pd.DataFrame(fold_curves)

    # skipna=False: an iteration where any fold produced NaN must not be
    # summed as if that fold contributed 0, or diverged iterations would
    # look artificially good.
    total = res_data.sum(axis=1,skipna=False)

    res = {}
    if total.isna().all():
        # No iteration is valid; report an infinite error so this
        # configuration is never preferred downstream.
        res['num_round']     = 0
        res['min_val_error'] = float('inf')
        return res

    # idxmin() and min() both skip NaN, so they refer to the same row.
    res['num_round']     = total.idxmin(skipna=True)
    res['min_val_error'] = total.min(skipna=True)
    return res

In [8]:
# Names of the dataset directories this notebook can be run on.
data_name_domain = [
    'ATAC_JV_adipose',
    'CTCF_TDH_ENCODE',
    'H3K27ac-H3K4me3_TDHAM_BP',
    'H3K27ac_TDH_some',
    'H3K36me3_AM_immune',
]

In [9]:
# Select the dataset to process (first entry of the list) and load its tables.
data_name = data_name_domain[0]
data      = data_import(data_name)

In [10]:
# Unpack the three tables returned by data_import.
inputs, labels, folds = (data[table] for table in ('inputs', 'labels', 'folds'))

In [11]:
# Clean the feature table and exponentiate the label bounds (see data_massage).
# NOTE: re-running this cell on its own feeds the already-transformed `labels`
# back in, so exp() is applied a second time; re-run the previous cell first.
inputs,labels = data_massage(inputs,labels)

In [12]:
# NOTE: `global` statements at module (cell) level have no effect -- every name
# assigned in a cell is already a module global. These lines appear intended to
# flag the variables that objective() and best_iter() read from module scope
# (X, y_lower, y_upper); the names are actually bound in the fold loops below.
global X
global X_val
global y_lower
global y_upper
global y_upper_val

In [13]:
# run_time: outer fold -> seconds spent in the hyperparameter search.
# error:    empty list, not filled anywhere in the visible cells.
run_time, error = dict(), list()

In [56]:
# Hyperparameter search: for every outer fold and every noise distribution,
# run a 100-trial Optuna study and store the best (eta, max_depth) as JSON so
# that the best-iteration stage further down can read it back.
for fold in np.unique(folds['fold'].values):
    
    start        = time.time()
    res          = getXY(fold,folds,inputs,labels)
    # objective() reads X, y_lower and y_upper from module scope, so they have
    # to be rebound for every outer fold.
    X            = res['X']        
    X_val        = res['X_val']
    y_lower      = res['y_lower']
    y_lower_val  = res['y_lower_val']
    y_upper      = res['y_upper']
    y_upper_val  = res['y_upper_val']
    
    for distribution in ['normal','logistic','extreme']:
        print(fold,distribution)
        sampler = TPESampler(seed=1)  # Make the sampler behave in a deterministic way.
        database_name = 'sqlite:///'+str(fold)+"_"+distribution+".db"
        study = optuna.create_study(sampler=sampler,storage=database_name)
        study.optimize(functools.partial(objective,distribution), n_trials=100)
        trial         = study.best_trial
        # Previously the best trial was computed and then dropped, although the
        # later stage loads exactly this file; persist it so that a fresh
        # "Restart & Run All" is self-contained.
        json_filename = "../../../../../result/"+data_name+"/xgboost/fold"+str(fold)+'_'+distribution+'_param_2.json'
        with open(json_filename, "w") as write_file:
            json.dump(trial.params, write_file)
    end            = time.time()
    time_taken     = end - start
    run_time[fold] = time_taken

1 normal


[I 2019-12-26 05:17:05,276] A new study created with name: no-name-24d15b24-1458-4b7d-9cb4-3d490ffd2449
[I 2019-12-26 05:17:17,498] Finished trial#0 resulted in value: inf. Current best value is inf with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 05:17:26,832] Finished trial#1 resulted in value: 1.3139000000000003. Current best value is 1.3139000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:17:33,737] Finished trial#2 resulted in value: 5.2522. Current best value is 1.3139000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:17:44,206] Finished trial#3 resulted in value: 7.000100000000001. Current best value is 1.3139000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:17:52,767] Finished trial#4 resulted in value: inf. Current best value is 1.3139000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:18:04,089] Finished trial#5 resulted in value: inf. Curre

1 logistic


[I 2019-12-26 05:28:42,896] A new study created with name: no-name-a80c365f-2239-4b87-8953-c69aad13f0eb
[I 2019-12-26 05:28:47,527] Finished trial#0 resulted in value: 485.6071. Current best value is 485.6071 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 05:28:57,847] Finished trial#1 resulted in value: 3.0232. Current best value is 3.0232 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[W 2019-12-26 05:29:04,538] Setting status of trial#2 as TrialState.FAIL because the objective function returned nan.
[I 2019-12-26 05:29:09,205] Finished trial#3 resulted in value: 233.48849999999996. Current best value is 3.0232 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:29:13,891] Finished trial#4 resulted in value: 506.9914. Current best value is 3.0232 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:29:18,442] Finished trial#5 resulted in value: 479.14840000000004. Current best value is 3.0232 with parameters: {'eta': 0.001, 'max_de

1 extreme


[I 2019-12-26 05:38:59,038] A new study created with name: no-name-63043019-7620-4e3e-8ca8-62381541d00b
[I 2019-12-26 05:39:07,789] Finished trial#0 resulted in value: 11.691099999999999. Current best value is 11.691099999999999 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 05:39:13,928] Finished trial#1 resulted in value: 10.9964. Current best value is 10.9964 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:39:20,451] Finished trial#2 resulted in value: 11.2969. Current best value is 10.9964 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:39:29,960] Finished trial#3 resulted in value: 11.4024. Current best value is 10.9964 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:39:38,781] Finished trial#4 resulted in value: 11.208800000000002. Current best value is 10.9964 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:39:47,157] Finished trial#5 resulted in value: 11.651900000000001. Current b

2 normal


[I 2019-12-26 05:53:43,097] A new study created with name: no-name-94b91b5b-5434-4ccf-ba09-a47feb9ace93
[I 2019-12-26 05:53:54,619] Finished trial#0 resulted in value: inf. Current best value is inf with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 05:54:03,902] Finished trial#1 resulted in value: 1.3815999999999997. Current best value is 1.3815999999999997 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:54:10,420] Finished trial#2 resulted in value: 10.6137. Current best value is 1.3815999999999997 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:54:20,571] Finished trial#3 resulted in value: inf. Current best value is 1.3815999999999997 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:54:31,239] Finished trial#4 resulted in value: inf. Current best value is 1.3815999999999997 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 05:54:42,883] Finished trial#5 resulted in value: inf. Current best value

2 logistic


[I 2019-12-26 06:05:23,797] A new study created with name: no-name-a95811f2-bbb2-4b7f-ac6a-3d7d7f704021
[I 2019-12-26 06:05:28,261] Finished trial#0 resulted in value: 481.86670000000004. Current best value is 481.86670000000004 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 06:05:38,482] Finished trial#1 resulted in value: 2.4309000000000003. Current best value is 2.4309000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:05:45,134] Finished trial#2 resulted in value: 3.3636999999999997. Current best value is 2.4309000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:05:49,706] Finished trial#3 resulted in value: 216.8646. Current best value is 2.4309000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:05:54,249] Finished trial#4 resulted in value: 497.3674. Current best value is 2.4309000000000003 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:05:58,830] Finished

2 extreme


[I 2019-12-26 06:15:54,783] A new study created with name: no-name-4e150faf-fd14-4f50-a0e7-21b2838fdac8
[I 2019-12-26 06:16:03,341] Finished trial#0 resulted in value: 11.298000000000002. Current best value is 11.298000000000002 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 06:16:09,964] Finished trial#1 resulted in value: 10.2993. Current best value is 10.2993 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:16:16,744] Finished trial#2 resulted in value: 10.3421. Current best value is 10.2993 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:16:25,964] Finished trial#3 resulted in value: 10.759500000000001. Current best value is 10.2993 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:16:34,268] Finished trial#4 resulted in value: 11.222. Current best value is 10.2993 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:16:42,587] Finished trial#5 resulted in value: 10.962200000000001. Current be

3 normal


[I 2019-12-26 06:26:34,368] A new study created with name: no-name-99bef2cd-74aa-47ce-9dcb-0b11d3e0c1f1
[I 2019-12-26 06:26:47,137] Finished trial#0 resulted in value: 7.3831999999999995. Current best value is 7.3831999999999995 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 06:26:57,499] Finished trial#1 resulted in value: 1.6366999999999998. Current best value is 1.6366999999999998 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:27:04,521] Finished trial#2 resulted in value: 2.9692999999999996. Current best value is 1.6366999999999998 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:27:15,174] Finished trial#3 resulted in value: inf. Current best value is 1.6366999999999998 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:27:26,514] Finished trial#4 resulted in value: 24.728099999999998. Current best value is 1.6366999999999998 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:27:40,048] Fin

3 logistic


[I 2019-12-26 06:39:03,440] A new study created with name: no-name-cda2c1d5-960d-48c8-bc5a-f266acb1b17c
[I 2019-12-26 06:39:08,450] Finished trial#0 resulted in value: 575.1709999999999. Current best value is 575.1709999999999 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 06:39:19,360] Finished trial#1 resulted in value: 2.5890000000000004. Current best value is 2.5890000000000004 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[W 2019-12-26 06:39:26,493] Setting status of trial#2 as TrialState.FAIL because the objective function returned nan.
[I 2019-12-26 06:39:31,312] Finished trial#3 resulted in value: 270.2713. Current best value is 2.5890000000000004 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:39:36,027] Finished trial#4 resulted in value: 582.5962. Current best value is 2.5890000000000004 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:39:40,998] Finished trial#5 resulted in value: 589.1135. Current best value is 

3 extreme


[I 2019-12-26 06:50:15,815] A new study created with name: no-name-b56ea0a6-e42d-440e-8ada-9da19186511b
[I 2019-12-26 06:50:24,872] Finished trial#0 resulted in value: 10.8505. Current best value is 10.8505 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 06:50:31,693] Finished trial#1 resulted in value: 10.386500000000002. Current best value is 10.386500000000002 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:50:38,689] Finished trial#2 resulted in value: 10.6615. Current best value is 10.386500000000002 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:50:48,533] Finished trial#3 resulted in value: 10.7923. Current best value is 10.386500000000002 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:50:58,137] Finished trial#4 resulted in value: 10.985800000000001. Current best value is 10.386500000000002 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 06:51:06,928] Finished trial#5 resulted in val

4 normal


[I 2019-12-26 07:01:58,523] A new study created with name: no-name-2432307d-e813-42bb-b6b7-e93f8488da6d
[I 2019-12-26 07:02:09,615] Finished trial#0 resulted in value: inf. Current best value is inf with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 07:02:19,470] Finished trial#1 resulted in value: 1.1963000000000001. Current best value is 1.1963000000000001 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:02:26,014] Finished trial#2 resulted in value: 4.6823999999999995. Current best value is 1.1963000000000001 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:02:35,877] Finished trial#3 resulted in value: inf. Current best value is 1.1963000000000001 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:02:45,686] Finished trial#4 resulted in value: inf. Current best value is 1.1963000000000001 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:02:57,295] Finished trial#5 resulted in value: inf. Current

4 logistic


[I 2019-12-26 07:13:25,958] A new study created with name: no-name-b605b33b-214a-408c-91d7-3cde5db16878
[I 2019-12-26 07:13:30,591] Finished trial#0 resulted in value: 560.1084. Current best value is 560.1084 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 07:13:40,691] Finished trial#1 resulted in value: 2.8294. Current best value is 2.8294 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[W 2019-12-26 07:13:47,361] Setting status of trial#2 as TrialState.FAIL because the objective function returned nan.
[I 2019-12-26 07:13:52,067] Finished trial#3 resulted in value: 254.17000000000002. Current best value is 2.8294 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:13:56,692] Finished trial#4 resulted in value: 550.221. Current best value is 2.8294 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:14:01,338] Finished trial#5 resulted in value: 541.804. Current best value is 2.8294 with parameters: {'eta': 0.001, 'max_depth': 4.0}.


4 extreme


[I 2019-12-26 07:23:56,946] A new study created with name: no-name-c44657c7-2a71-4655-a243-444b0b70732c
[I 2019-12-26 07:24:05,803] Finished trial#0 resulted in value: 11.6772. Current best value is 11.6772 with parameters: {'eta': 0.401, 'max_depth': 8.0}.
[I 2019-12-26 07:24:11,868] Finished trial#1 resulted in value: 10.967099999999999. Current best value is 10.967099999999999 with parameters: {'eta': 0.001, 'max_depth': 4.0}.
[I 2019-12-26 07:24:18,471] Finished trial#2 resulted in value: 10.6239. Current best value is 10.6239 with parameters: {'eta': 0.101, 'max_depth': 2.0}.
[I 2019-12-26 07:24:27,917] Finished trial#3 resulted in value: 10.874600000000001. Current best value is 10.6239 with parameters: {'eta': 0.101, 'max_depth': 2.0}.
[I 2019-12-26 07:24:36,652] Finished trial#4 resulted in value: 11.6034. Current best value is 10.6239 with parameters: {'eta': 0.101, 'max_depth': 2.0}.
[I 2019-12-26 07:24:45,162] Finished trial#5 resulted in value: 11.3207. Current best value i

In [20]:
# JSON object keys must be strings, so stringify the fold identifiers.
run_time1 = {str(fold_id): seconds for fold_id, seconds in run_time.items()}

In [21]:
# Save the per-fold search times (seconds) next to the other XGBoost results.
json_filename = "../../../../../result/"+data_name+"/xgboost/run_time_2_param_tuning1.json"
with open(json_filename, mode="w") as write_file:
    write_file.write(json.dumps(run_time1))

In [22]:
def trainModelIter(X,X_val,y_lower,y_upper,y_lower_val,y_upper_val,params,num_round,distributionCol):
    """Train one booster and return its full validation metric history.

    Same training set-up as ``trainModel`` (interval labels attached to the
    train and validation matrices), but instead of the minimum it returns
    everything stored under ``evals_result['test'][distributionCol]``
    -- presumably one value per boosting round.
    """
    def _interval_dmatrix(features, lower, upper):
        # Attach the interval bounds of each row to the DMatrix.
        matrix = xgb.DMatrix(features)
        matrix.set_float_info("label_lower_bound",lower)
        matrix.set_float_info("label_upper_bound",upper)
        return matrix

    train_matrix = _interval_dmatrix(X,y_lower,y_upper)
    valid_matrix = _interval_dmatrix(X_val,y_lower_val,y_upper_val)

    history = {}
    xgb.train(params,
              train_matrix,
              num_boost_round=num_round,
              evals=[(train_matrix,"train"),(valid_matrix,"test")],
              evals_result=history,
              verbose_eval=False)

    return history['test'][distributionCol]

In [23]:
# outer fold -> seconds spent in the best-iteration stage
run_time2 = dict()

In [25]:
# Best-iteration stage: for every outer fold and noise distribution, load the
# tuned (eta, max_depth), find the best boosting iteration with best_iter(),
# and write the full parameter set to a "fold_new..." JSON file.
for fold in np.unique(folds['fold'].values):
    start_time = time.time()

    # best_iter() reads X, y_lower and y_upper from module scope.
    res = getXY(fold,folds,inputs,labels)
    X, X_val                 = res['X'], res['X_val']
    y_lower, y_lower_val     = res['y_lower'], res['y_lower_val']
    y_upper, y_upper_val     = res['y_upper'], res['y_upper_val']

    for distribution in ['normal','logistic','extreme']:
        json_filename = "../../../../../result/"+data_name+"/xgboost/fold"+str(fold)+'_'+distribution+'_param_2.json'
        with open(json_filename, errors='ignore') as json_data:
            json_fold = json.load(json_data, strict=False)

        eta, max_depth   = json_fold['eta'], json_fold['max_depth']
        min_child_weight = 0.1
        reg_alpha        = 0.005
        reg_lambda       = 0.5
        # NOTE(review): sigma is fixed to 1 for every distribution here, while
        # objective() uses sigma = 10 for 'extreme' -- confirm this is intended.
        sigma            = 1

        res = best_iter(eta,max_depth,min_child_weight,reg_alpha,reg_lambda,sigma,distribution)

        # Replace an infinite error by a large finite number before saving.
        if res['min_val_error'] == float('inf'):
            res['min_val_error'] = 10**8

        new_json = {
            'eta':              eta,
            'max_depth':        max_depth,
            'min_child_weight': min_child_weight,
            'reg_alpha':        reg_alpha,
            'reg_lambda':       reg_lambda,
            'sigma':            sigma,
            'distribution':     distribution,
            'num_round':        int(res['num_round']),
            'min_val_error':    res['min_val_error'],
        }

        json_filename = "../../../../../result/"+data_name+"/xgboost/fold_new"+str(fold)+'_'+distribution+'_param_2.json'
        with open(json_filename, "w") as write_file:
            json.dump(new_json, write_file)

    end_time        = time.time()
    run_time2[fold] = end_time - start_time

### Choosing the best distribution and hyperparameters for each fold

In [80]:
# For every outer fold, compare the three per-distribution result files and
# keep the one with the smallest 'min_val_error'.
# The result directory is the one the best-iteration stage writes its
# "fold_new..." files to ("../../../../../result/", five levels up); this cell
# previously pointed four levels up and therefore at a different directory.
for fold in np.unique(folds['fold'].values):
    result_dir = "../../../../../result/"+data_name+"/xgboost/"

    # Collect one single-column frame per distribution and concatenate once.
    per_distribution = []
    for distribution in ['normal','logistic','extreme']:
        json_filename = result_dir+"fold_new"+str(fold)+'_'+distribution+'_param_2.json'
        with open(json_filename, errors='ignore') as json_data:
            json_fold = json.load(json_data, strict=False)
        per_distribution.append(pd.DataFrame.from_dict(json_fold,orient='index',columns=[distribution]))

    # After the transpose: one row per distribution, one column per parameter.
    fold_data = pd.concat(per_distribution,axis=1).transpose()
    fold_data['min_val_error'] = fold_data['min_val_error'].astype('float')
    best_dis   = fold_data['min_val_error'].idxmin()
    best_param = fold_data.loc[best_dis].to_dict()

    json_filename = result_dir+"fold_new"+str(fold)+'_dis'+'_param_2.json'
    with open(json_filename, "w") as write_file:
        json.dump(best_param, write_file)

In [26]:
# JSON object keys must be strings, so stringify the fold identifiers.
run_time3 = {str(fold_id): seconds for fold_id, seconds in run_time2.items()}

In [27]:
# Save the per-fold best-iteration stage times (seconds) as JSON.
json_filename = "../../../../../result/"+data_name+"/xgboost/run_time_2_param_tuning2.json"
with open(json_filename, mode="w") as write_file:
    write_file.write(json.dumps(run_time3))