In [2]:
library(caret)
library(xgboost)
#library(MlBayesOpt)
library(Matrix)
library(rBayesianOptimization)
#https://gitlab.com/avinashbarnwal/elokaggle/blob/master/code/304_hyperparameter_optuna.ipynb
#https://gitlab.com/avinashbarnwal/elokaggle/blob/master/code/302_LGBM_BO_hyperpara.ipynb
#https://cran.r-project.org/web/packages/MlBayesOpt/vignettes/MlBayesOpt.html

In [3]:
data_import =function(dataname){
  filename = paste('https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/neuroblastoma-data-master/data/',dataname,'/',sep="")
  inputFileName = paste(filename,'inputs.csv',sep="")
  labelFileName = paste(filename,'outputs.csv',sep="")
  foldsFileName = paste(filename,'cv/equal_labels/folds.csv',sep="")
  inputs        = read.table(inputFileName,sep=",",header=T,stringsAsFactors = F,row.names=1)
  labels        = read.table(labelFileName,sep=",",header=T,stringsAsFactors = F,row.names=1)
  folds         = read.table(foldsFileName,sep=",",header=T,stringsAsFactors = F,row.names=1)
  res           = list()
  res$inputs    = inputs
  res$labels    = labels
  res$folds     = folds
  return(res)
}

In [4]:
data_massage = function(inputs,labels){
    rownamesInput = rownames(inputs)
    inputs        = do.call(data.frame,lapply(inputs, function(x) replace(x, is.infinite(x),NA)))
    naColumns     = colnames(inputs)[colSums(is.na(inputs))>0]
    noVarCol      = getNonVarCols(inputs)
    removeCols    = c(naColumns,noVarCol)
    inputs        = inputs[ , !(colnames(inputs) %in% removeCols)]
    rownames(inputs) = rownamesInput
    labels$min.log.lambda = unlist(lapply(labels$min.log.lambda,exp))
    labels$max.log.lambda = unlist(lapply(labels$max.log.lambda,exp))
    res        = list()
    res$inputs = inputs
    res$labels = labels
    return(res)
}

In [5]:
getXY<-function(foldNo,folds,inputs,labels){
    test.id       = rownames(subset(folds,fold==foldNo))
    train.id      = rownames(subset(folds,fold!=foldNo))
    X             = subset(inputs,rownames(inputs) %in% train.id)
    X             = as.matrix(X)
    X.val         = subset(inputs,rownames(inputs) %in% test.id)
    X.val         = as.matrix(X.val)
    y.label       = subset(labels,rownames(labels) %in% train.id)
    y.label.test  = subset(labels,rownames(labels) %in% test.id)
    y.lower       = as.matrix(y.label$min.log.lambda)
    y.upper       = as.matrix(y.label$max.log.lambda)
    y.lower.val   = as.matrix(y.label.test$min.log.lambda)
    y.upper.val   = as.matrix(y.label.test$max.log.lambda)
    res           = list()
    res$X         = X
    res$X.val     = X.val
    res$y.lower      = y.lower
    res$y.lower.val  = y.lower.val
    res$y.upper      = y.upper
    res$y.upper.val  = y.upper.val
    return(res)
}

In [6]:
getNonVarCols<-function(data){
    var_columns    = apply(inputs,2,var)
    resCol         = names(var_columns[var_columns==0.0])
    return(resCol)
}

In [7]:
# Set Parameters
dataNameRange       = c('ATAC_JV_adipose','CTCF_TDH_ENCODE','H3K27ac-H3K4me3_TDHAM_BP','H3K27ac_TDH_some','H3K36me3_AM_immune')
sigma_range         = c(1,2,5,10,100)
distribution_range  = c('normal','logistic','extreme')
learning_rate       = 0.1
num_round           = 200

In [8]:
getaccuracy=function(pred,y_lower,y_higher){
    res = (pred>=y_lower & pred<=y_higher)
    return(res)
}

In [9]:
getParam = function(sigma,distribution,learning_rate){
  eval_metric = paste("aft-nloglik@",distribution,",",sigma,sep="") 
  param       = list(learning_rate=learning_rate, aft_noise_distribution=distribution, 
                    nthread = 4, verbosity=0, aft_sigma= sigma,
                    eval_metric  = eval_metric,
                    objective  = "aft:survival")
  return(param)
}

In [10]:
trainModel = function(foldNo,X,X_val,y_lower,y_lower_val,y_upper,y_upper_val,param,num_round){
  
  dtrain = xgb.DMatrix(X)
  setinfo(dtrain,'label_lower_bound', y_lower)
  setinfo(dtrain,'label_upper_bound', y_upper)
  
  dtest = xgb.DMatrix(X_val)
  setinfo(dtest,'label_lower_bound', y_lower_val)
  setinfo(dtest,'label_upper_bound', y_upper_val)
  
  watchlist = list(eval = dtest, train = dtrain)
  bst       = xgb.train(param, dtrain, num_round, watchlist,verbose = 0)

  return(bst)
}

In [11]:
xgb_cv_bayes = function(max_depth, min_child_weight, subsample, subsample_freq, 
                        colsample_bytree, reg_alpha, reg_lambda, nrounds, 
                        learning_rate, min_data_in_leaf, sigma, 
                        distribution) {
    
    eval_metric = paste("aft-nloglik@",distribution,",",sigma,sep="") 
    cv = xgb.cv(params = list(booster = "gbtree",
                              max_depth = max_depth,
                              min_child_weight = min_child_weight,
                              subsample = subsample,
                              subsample_freq = subsample_freq,
                              colsample_bytree = colsample_bytree,
                              reg_alpha = reg_alpha,
                              reg_lambda = reg_lambda,
                              nrounds = nrounds,
                              learning_rate = learning_rate,
                              min_data_in_leaf = min_data_in_leaf,
                              aft_sigma = sigma,
                              aft_noise_distribution = distribution,
                              objective = eval_metric,
                              nthread = 4,
                              eval_metric = eval_metric),
               data = dtrain,
               folds = cv_folds, prediction = TRUE, showsd = TRUE,
               early_stopping_rounds = 5, maximize = TRUE, verbose = 0)
    
    print(cv$evaluation_log)
#     Score = cv$evaluation_log$test_auc_mean[cv$best_iteration]
    list(Pred = cv$pred)
    }

In [12]:
 bounds = list(num_leaves = c(3L, 1000L),
               max_depth =  c(1L, 50L),
               min_child_weight =  c(1, 60), 
               subsample =  c(0.1, 1),
               subsample_freq = c(1, 100), 
               colsample_bytree =  c(0.0001, 1),
               reg_alpha=c(0.0001, 10),
               reg_lambda = c(1, 40),
               nrounds    = c(50, 2000),
               learning_rate=  c(0.0001, 1),
               min_data_in_leaf =c(1, 50),
               sigma = c(1,100),
               distribution = c('normal','logistic','extreme'))

In [14]:
for(i in 1:1){
    
    res                 = data_import(dataNameRange[1])
    inputs              = res$inputs
    labels              = res$labels
    folds               = res$folds
    resDataMassage      = data_massage(inputs,labels)
    inputs              = resDataMassage$inputs
    labels              = resDataMassage$labels
    fold_iter           = unique(folds$fold)
    accuracy_fold       = numeric(length(fold_iter))

    res                 = getXY(fold_iter[i],folds,inputs,labels)
    X                   = res$X
    X.val               = res$X.val
    y.lower             = res$y.lower
    y.lower.val         = res$y.lower.val
    y.upper             = res$y.upper
    y.upper.val         = res$y.upper.val
    train.folds         = cut(seq(1,nrow(X)),breaks=5,labels=FALSE)
    res                 = list()
    cv_folds            = KFold(y.upper, nfolds = 5,
                                stratified = FALSE, seed = 0)
    dtrain = xgb.DMatrix(X)
    setinfo(dtrain,'label_lower_bound', y.lower)
    setinfo(dtrain,'label_upper_bound', y.upper)
    
    opt_res = BayesianOptimization(xgb_cv_bayes,
                                bounds = bounds,
                                init_grid_dt = NULL, init_points = 10, n_iter = 20,
                                acq = "ucb", kappa = 2.576, eps = 0.0,
                                verbose = TRUE)
}

ERROR: Error in {: task 1 failed - "invalid arguments"
