In [None]:
import xgboost as xgb
import pandas  as pd
import numpy   as np
from   sklearn.model_selection import KFold
import optuna
import time
import json
from   optuna.samplers import TPESampler
import functools
import os

In [None]:
def data_import(data_name):
    """Read the feature, label and fold tables of one data set.

    Parameters
    ----------
    data_name : str
        Name of the data set; it is inserted as a directory component into
        the GitHub raw-content URL the CSV files are read from.

    Returns
    -------
    dict
        Keys ``'inputs'``, ``'labels'`` and ``'folds'``, each holding the
        DataFrame read from the matching CSV, indexed by ``sequenceID``.
    """
    base_url = 'https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/' + data_name + '/'
    # Result key -> file location relative to the data set directory.
    csv_by_key = {
        'inputs': 'inputs.csv',
        'labels': 'outputs.csv',
        'folds': 'cv/equal_labels/folds.csv',
    }
    return {
        key: pd.read_csv(base_url + relative_path, index_col='sequenceID')
        for key, relative_path in csv_by_key.items()
    }

In [None]:
def data_massage(inputs,labels):
    """Clean the feature table and exponentiate the label bounds.

    The caller's frames are left untouched: all work happens on copies, so
    calling this function again on the same objects gives the same result
    instead of exponentiating the labels a second time.

    Parameters
    ----------
    inputs : pandas.DataFrame
        Feature table. Infinite values are treated as missing.
    labels : pandas.DataFrame
        Must contain the columns ``'min.log.lambda'`` and
        ``'max.log.lambda'``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``inputs`` without the columns that contain a missing/infinite value
        or have zero variance, and ``labels`` with both bound columns
        replaced by ``exp`` of their original values.
    """
    # +/-inf become NaN so they are removed by the missing-value filter below.
    cleaned = inputs.replace([-float('inf'),float('inf')],np.nan)
    # Drop every column that has at least one missing entry.
    cleaned = cleaned.dropna(axis=1)
    # Constant columns carry no information for the model.
    variances    = cleaned.apply(lambda col: np.var(col))
    zeroVarCols  = list(variances[variances==0].index)
    cleaned      = cleaned.drop(zeroVarCols,axis=1)

    converted = labels.copy()
    for bound in ('min.log.lambda','max.log.lambda'):
        # exp(-inf) == 0 and exp(inf) == inf, so open interval ends are
        # preserved by the transformation.
        converted[bound] = np.exp(converted[bound])
    return cleaned,converted

In [None]:
def getXY(foldNo,folds,inputs,labels):
    """Split features and label bounds into train and validation parts.

    Rows whose ``folds['fold']`` value equals ``foldNo`` form the validation
    part; all remaining rows form the training part. Rows are matched through
    the index values of the three frames.

    Returns
    -------
    dict
        ``'X'`` / ``'X_val'`` hold the feature rows, ``'y_lower'`` /
        ``'y_lower_val'`` the ``'min.log.lambda'`` column and ``'y_upper'`` /
        ``'y_upper_val'`` the ``'max.log.lambda'`` column of the training and
        validation part respectively.
    """
    fold_column  = folds['fold']
    held_out_ids = list(folds[fold_column == foldNo].index)
    training_ids = list(folds[fold_column != foldNo].index)

    def _rows(frame, ids):
        # Keep the rows of `frame` whose index value appears in `ids`.
        return frame[frame.index.isin(ids)]

    train_labels = _rows(labels, training_ids)
    val_labels   = _rows(labels, held_out_ids)

    return {
        'X': _rows(inputs, training_ids),
        'X_val': _rows(inputs, held_out_ids),
        'y_lower': train_labels['min.log.lambda'],
        'y_lower_val': val_labels['min.log.lambda'],
        'y_upper': train_labels['max.log.lambda'],
        'y_upper_val': val_labels['max.log.lambda'],
    }

In [None]:
def trainModelIter(X,X_val,y_lower,y_upper,y_lower_val,y_upper_val,params,num_round,distributionCol):
    """Train one model and return the round with the lowest validation metric.

    Parameters
    ----------
    X, X_val
        Training and validation features, passed to ``xgb.DMatrix``.
    y_lower, y_upper, y_lower_val, y_upper_val
        Lower/upper label bounds, attached to the matrices as
        ``label_lower_bound`` / ``label_upper_bound``.
    params : dict
        Parameters forwarded to ``xgb.train``.
    num_round : int
        Number of boosting rounds to run.
    distributionCol : str
        Key of the metric history to read from the ``'test'`` entry of the
        evaluation results. NOTE(review): the caller passes
        ``'<distribution>,<sigma>'`` while ``eval_metric`` is
        ``'aft-nloglik@<distribution>,<sigma>'`` -- confirm this matches the
        key the installed xgboost build records.

    Returns
    -------
    numpy integer
        1-based index of the boosting round whose validation metric is
        smallest (NaN entries are skipped).
    """
    evals_result = {}

    dtrain = xgb.DMatrix(X)
    dtrain.set_float_info("label_lower_bound",y_lower)
    dtrain.set_float_info("label_upper_bound",y_upper)

    dtest  = xgb.DMatrix(X_val)
    dtest.set_float_info("label_lower_bound",y_lower_val)
    dtest.set_float_info("label_upper_bound",y_upper_val)

    # Only the recorded metric history is needed; the booster itself is discarded.
    xgb.train(params,dtrain,num_boost_round=num_round,evals=[(dtrain,"train"),(dtest,"test")],evals_result=evals_result,verbose_eval=False)

    val_error = evals_result['test'][distributionCol]
    # Number the rounds 1..len(val_error) from the history itself rather than
    # a hard-coded 5000, so any value of num_round works.
    errors = pd.Series(val_error,index=np.arange(1,len(val_error)+1))
    return errors.idxmin(skipna=True)

In [None]:
#for fold in range(2,3):
def get_result(data_name):
    """Train every fold/distribution combination and store the results as JSON.

    For each fold found in the data set's fold table and each of the noise
    distributions ``normal``, ``logistic`` and ``extreme`` a model is trained
    through ``trainModelIter``. The hyper-parameters and the best boosting
    round are written to
    ``../../../../../result/<data_name>/xgboost/fold<fold>_<distribution>_param_0.json``;
    the wall-clock time of every combination is collected and written to
    ``run_dis_time_0_param.json`` in the same directory.

    The hyper-parameters are read from the module-level names ``eta``,
    ``max_depth``, ``min_child_weight``, ``reg_alpha``, ``reg_lambda`` and
    ``num_round``, which must be defined before this function is called.

    Parameters
    ----------
    data_name : str
        Name of the data set, forwarded to ``get_data`` and used in the
        output path.
    """
    inputs,labels,folds = get_data(data_name)

    # Scale parameter used for each noise distribution.
    sigma_by_distribution = {'normal': 10, 'logistic': 1, 'extreme': 10}

    # Create the output directory up front so a missing directory cannot
    # make the run fail only after the models have been trained.
    result_dir = "../../../../../result/"+data_name+"/xgboost"
    os.makedirs(result_dir, exist_ok=True)

    run_time = {}
    for fold in np.unique(folds['fold'].values):
        res = getXY(fold,folds,inputs,labels)
        X            = res['X']
        X_val        = res['X_val']
        y_lower      = res['y_lower']
        y_lower_val  = res['y_lower_val']
        y_upper      = res['y_upper']
        y_upper_val  = res['y_upper_val']
        for distribution, sigma in sigma_by_distribution.items():
            # Timed per distribution so that each run_time entry covers only
            # its own training run instead of accumulating within the fold.
            start_time         = time.time()
            distribution_sigma = distribution+ ',' + str(sigma)
            eval_metric        = 'aft-nloglik@'+distribution_sigma
            base_score         = 0.5
            params   = {
                        'eta':eta,
                        'max_depth':int(max_depth),
                        'min_child_weight':min_child_weight,
                        'subsample':0.7,
                        'reg_alpha':reg_alpha,
                        'reg_lambda':reg_lambda,
                        'aft_noise_distribution' : distribution,
                        'aft_sigma': sigma,
                        'eval_metric':eval_metric,
                        'base_score':base_score,
                        'objective':"aft:survival",
                        'random_state':1,
                        'verbosity': 0
                        }
            best_round    = trainModelIter(X,X_val,y_lower,y_upper,y_lower_val,y_upper_val,params,num_round,distribution_sigma)
            json_filename = result_dir+"/fold"+str(fold)+'_'+distribution+'_param_0.json'
            result_json   = {
                            'eta':eta,
                            'max_depth':max_depth,
                            'min_child_weight':min_child_weight,
                            'subsample':0.7,
                            'reg_alpha':reg_alpha,
                            'reg_lambda':reg_lambda,
                            'distribution' : distribution,
                            'sigma': sigma,
                            # int() turns the numpy integer into a JSON-serialisable value.
                            'num_round':int(best_round)
                            }
            with open(json_filename, "w") as write_file:
                json.dump(result_json, write_file)
            key             = str(fold)+"_"+distribution
            run_time[key]   = time.time() - start_time
    run_filename = result_dir+"/run_dis_time_0_param.json"
    with open(run_filename, "w") as write_file:
        json.dump(run_time, write_file)

In [None]:
def get_data(data_name):
    """Load one data set and return its cleaned tables.

    The tables are fetched with ``data_import``; the inputs and labels are
    then passed through ``data_massage``. The fold table is returned as
    loaded.

    Returns
    -------
    tuple
        ``(inputs, labels, folds)``.
    """
    raw = data_import(data_name)
    clean_inputs, clean_labels = data_massage(raw['inputs'], raw['labels'])
    return clean_inputs, clean_labels, raw['folds']

In [None]:
# Run configuration. get_result reads the hyper-parameters below as
# module-level names, so this cell has to run before get_result is called.

# Not referenced by the functions visible in this notebook.
seed = 1

# Number of boosting rounds handed to trainModelIter.
num_round = 5000

# Booster hyper-parameters copied into the xgboost parameter dictionary.
eta = 0.01
max_depth = 4
min_child_weight = 0.1
reg_alpha = 0.005
reg_lambda = 0.5

In [None]:
# Names of the data sets that can be passed to get_result.
data_name_domain = [
    'ATAC_JV_adipose',
    'CTCF_TDH_ENCODE',
    'H3K27ac-H3K4me3_TDHAM_BP',
    'H3K27ac_TDH_some',
    'H3K36me3_AM_immune',
    'H3K27me3_RL_cancer',
    'H3K27me3_TDH_some',
    'H3K36me3_TDH_ENCODE',
    'H3K36me3_TDH_immune',
    'H3K36me3_TDH_other',
]

In [None]:
# Data set processed by this run: entry 9 of data_name_domain ('H3K36me3_TDH_other').
data_name = data_name_domain[9]

In [None]:
# Train all fold/distribution combinations for the selected data set and
# write the per-combination JSON result files and the run-time summary.
get_result(data_name)