In [19]:
library(mboost)
library(survival)
library(penaltyLearning)
library(Hmisc)
library(caret)

In [20]:
# Download one benchmark data set from the GSOC-2019 AFT test-data repository.
#
# Args:
#   dataname: name of the data-set directory (e.g. 'ATAC_JV_adipose').
#
# Returns:
#   A list of three data frames, each read with a header row and with the
#   first CSV column used as row names:
#     inputs - <dataname>/inputs.csv
#     labels - <dataname>/outputs.csv
#     folds  - <dataname>/cv/equal_labels/folds.csv
data_import <- function(dataname) {
  base_url <- paste0('https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/neuroblastoma-data-master/data/', dataname, '/')

  # All three files share the same layout, so read them through one helper.
  # TRUE/FALSE are spelled out because T and F are ordinary, reassignable
  # variables.
  read_remote_csv <- function(relative_path) {
    read.table(paste0(base_url, relative_path), sep = ",", header = TRUE,
               stringsAsFactors = FALSE, row.names = 1)
  }

  list(
    inputs = read_remote_csv('inputs.csv'),
    labels = read_remote_csv('outputs.csv'),
    folds  = read_remote_csv('cv/equal_labels/folds.csv')
  )
}

In [21]:
# Clean the feature table and exponentiate the label bounds.
#
# Args:
#   inputs: data frame of features, one row per observation.
#   labels: data frame with columns min.log.lambda and max.log.lambda.
#
# Returns:
#   A list with
#     inputs - the feature data frame with infinite values turned into NA and
#              every column that contains an NA, or that getNonVarCols()
#              reports, removed; the original row names are kept.
#     labels - the label data frame with exp() applied to both bound columns
#              (the column names are left unchanged).
data_massage <- function(inputs, labels) {
    row_ids <- rownames(inputs)

    # Infinite feature values are treated as missing so the NA filter below
    # also removes the columns that contain them.
    inputs <- do.call(data.frame, lapply(inputs, function(x) replace(x, is.infinite(x), NA)))

    na_cols       <- colnames(inputs)[colSums(is.na(inputs)) > 0]
    constant_cols <- getNonVarCols(inputs)
    keep          <- !(colnames(inputs) %in% c(na_cols, constant_cols))

    # drop = FALSE keeps a data frame even when a single column survives;
    # without it the result collapses to a vector and the row-name assignment
    # below fails.
    inputs <- inputs[, keep, drop = FALSE]
    rownames(inputs) <- row_ids

    # exp() is vectorised; calling it directly also keeps zero-row columns,
    # which unlist(lapply(...)) would turn into NULL and thereby delete.
    labels$min.log.lambda <- exp(labels$min.log.lambda)
    labels$max.log.lambda <- exp(labels$max.log.lambda)

    list(inputs = inputs, labels = labels)
}

In [22]:
# Split features and label bounds into a training part and a held-out part
# for one cross-validation fold.
#
# Args:
#   foldNo: fold identifier to hold out.
#   folds:  data frame with a `fold` column; row names identify observations.
#   inputs: feature table whose row names identify observations.
#   labels: data frame with min.log.lambda / max.log.lambda columns and the
#           same kind of row names.
#
# Returns:
#   A list with X, X.val (feature matrices) and y.lower, y.lower.val,
#   y.upper, y.upper.val (one-column matrices); the ".val" entries belong to
#   the held-out fold, the others to all remaining folds.
getXY <- function(foldNo, folds, inputs, labels) {
    fold_of_row  <- folds[["fold"]]
    # which() ignores NA comparisons, so rows with a missing fold id belong
    # to neither part.
    held_out_ids <- rownames(folds)[which(fold_of_row == foldNo)]
    fit_ids      <- rownames(folds)[which(fold_of_row != foldNo)]

    # Keep the rows of `tbl` whose row name is listed in `ids`.
    rows_named <- function(tbl, ids) {
        tbl[rownames(tbl) %in% ids, , drop = FALSE]
    }

    fit_labels      <- rows_named(labels, fit_ids)
    held_out_labels <- rows_named(labels, held_out_ids)

    list(
        X           = as.matrix(rows_named(inputs, fit_ids)),
        X.val       = as.matrix(rows_named(inputs, held_out_ids)),
        y.lower     = as.matrix(fit_labels$min.log.lambda),
        y.lower.val = as.matrix(held_out_labels$min.log.lambda),
        y.upper     = as.matrix(fit_labels$max.log.lambda),
        y.upper.val = as.matrix(held_out_labels$max.log.lambda)
    )
}

In [23]:
# Names of the columns of `data` whose sample variance is exactly zero.
#
# Args:
#   data: data frame (or matrix) of numeric columns.
#
# Returns:
#   Character vector of column names; character(0) when no column qualifies.
#   Columns whose variance is NA/NaN (missing or infinite values, fewer than
#   two rows) are not reported.
getNonVarCols <- function(data) {
    # Work on the argument itself; the previous body read a global `inputs`
    # and silently ignored whatever was passed in.
    data <- as.data.frame(data)
    if (ncol(data) == 0) {
        return(character(0))
    }

    # Column-wise variance without coercing the whole frame to a matrix.
    col_var <- vapply(data, var, numeric(1))

    # The explicit NA guard keeps NA entries out of the logical index, which
    # would otherwise yield NA column names.
    names(col_var)[!is.na(col_var) & col_var == 0]
}

In [24]:
# Experiment configuration ----
# Names of the benchmark data sets (directory names in the data repository).
dataNameRange <- c(
  'ATAC_JV_adipose',
  'CTCF_TDH_ENCODE',
  'H3K27ac-H3K4me3_TDHAM_BP',
  'H3K27ac_TDH_some',
  'H3K36me3_AM_immune'
)
# Candidate scale values and error distributions.
sigma_range        <- c(1, 2, 5, 10, 100)
distribution_range <- c('normal', 'logistic', 'extreme')
# Boosting settings.
learning_rate <- 0.1
num_round     <- 200

In [41]:
# Fetch the first configured data set and unpack its three tables.
res    <- data_import(dataNameRange[1])
inputs <- res[["inputs"]]
labels <- res[["labels"]]
folds  <- res[["folds"]]

# Clean the features and exponentiate the label bounds; the cleaned versions
# replace the raw tables.
resDataMassage <- data_massage(inputs, labels)
inputs         <- resDataMassage[["inputs"]]
labels         <- resDataMassage[["labels"]]

# One accuracy slot per distinct fold id, plus a container for the fitted
# coefficients.
fold_iter     <- unique(folds[["fold"]])
accuracy_fold <- numeric(length(fold_iter))
coef_model    <- list()

In [26]:
# Element-wise test of whether a prediction lies inside its closed label
# interval [y_lower, y_higher].
#
# Args:
#   pred:     predicted value(s).
#   y_lower:  lower bound(s) of the target interval.
#   y_higher: upper bound(s) of the target interval.
#
# Returns:
#   Logical value(s): TRUE where y_lower <= pred <= y_higher.
getaccuracy <- function(pred, y_lower, y_higher) {
    at_or_above_lower <- pred >= y_lower
    at_or_below_upper <- pred <= y_higher
    at_or_above_lower & at_or_below_upper
}

In [50]:
# Cross-validation: for every fold, fit a boosted log-normal AFT model on the
# remaining folds and record (a) its coefficients and (b) the share of
# held-out observations whose prediction falls inside the label interval.
#
# NOTE(review): the notebook output shows repeated "NA/Inf replaced by maximum
# positive value" warnings and identical predictions for every row, which
# suggests the fit is degenerate. Confirm that mboost's Lognormal() family
# supports interval-censored Surv objects and that a lower bound of
# exp(-Inf) = 0 is acceptable to it.
#
# seq_along() yields no iterations for an empty fold list, whereas
# 1:length(x) would run over c(1, 0).
for (i in seq_along(fold_iter)) {
    res                 <- getXY(fold_iter[i], folds, inputs, labels)
    X                   <- res$X
    X.val               <- res$X.val
    y.lower             <- res$y.lower
    y.lower.val         <- res$y.lower.val
    y.upper             <- res$y.upper
    y.upper.val         <- res$y.upper.val
    # Not used below; kept because code outside this view may rely on it.
    train.folds         <- cut(seq(1, nrow(X)), breaks = 5, labels = FALSE)
    res                 <- list()
    my.surv             <- Surv(y.lower, y.upper, type = 'interval2')
    formula             <- as.formula(paste("my.surv ~", paste(colnames(X), collapse = "+")))
    trn.data            <- data.frame(X, y.lower, y.upper)
    glm <- glmboost(formula, data = trn.data, family = Lognormal(), control = boost_control(mstop = 200, nu = 0.005))
    # `[[i]]` stores the whole coefficient vector; `[i]` kept only its first
    # element and warned about the length mismatch.
    coef_model[[i]]     <- coef(glm)
    tst.data            <- data.frame(X.val)
    pred.y.val          <- predict(glm, tst.data)
    print(y.lower.val)
    print(y.upper.val)
    # predict() returns the linear predictor, which for the log-normal AFT
    # family is on the log scale, while the bounds were exponentiated in
    # data_massage(); bring the prediction onto the same scale before
    # comparing. getaccuracy() is vectorised, so no mapply() is needed, and
    # mean() of the logical hits equals sum / length.
    accuracy_fold[i]    <- mean(getaccuracy(exp(as.vector(pred.y.val)),
                                            as.vector(y.lower.val),
                                            as.vector(y.upper.val)))
}

“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum posi

              [,1]
  [1,]   5391.2431
  [2,]   2539.7658
  [3,]  26851.0087
  [4,]   4448.9767
  [5,]   1008.7673
  [6,]  28865.2883
  [7,]   8429.4461
  [8,]   1712.7617
  [9,]  49294.5158
 [10,]  31789.7000
 [11,]   2356.4229
 [12,]  20202.0726
 [13,]   8653.8816
 [14,]   2557.4705
 [15,]   6557.5343
 [16,]   6907.4762
 [17,]   2783.4805
 [18,]   1691.7548
 [19,]   5278.8039
 [20,]  10705.7944
 [21,]   4568.0735
 [22,]   7465.0402
 [23,]   7076.0861
 [24,]  10548.2664
 [25,]   6446.5770
 [26,]   8566.3601
 [27,] 350970.3197
 [28,]   9027.5235
 [29,]   4809.3426
 [30,]   4021.7310
 [31,]   6081.7795
 [32,]  10645.6168
 [33,]   5230.6276
 [34,]   1697.0600
 [35,]   9379.4134
 [36,]  30470.4730
 [37,]   3751.4371
 [38,]   2175.3778
 [39,]   5597.0912
 [40,]  15326.5661
 [41,]  14755.4797
 [42,]   2824.8590
 [43,]  10278.3439
 [44,]  19035.0847
 [45,]   5847.3962
 [46,]   2502.0937
 [47,]  34375.6107
 [48,]   5438.6377
 [49,]   3530.3427
 [50,]  40604.6476
 [51,]  14417.6488
 [52,]   772

“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum posi

              [,1]
  [1,]  23835.1587
  [2,]    965.4733
  [3,]  16874.5493
  [4,]  17320.8577
  [5,]   1269.1884
  [6,]   9188.9827
  [7,]  66532.5135
  [8,]   1696.7453
  [9,]   8190.1951
 [10,]  22869.6310
 [11,]   4960.3371
 [12,]  14900.0087
 [13,]  20618.1400
 [14,]   3445.7511
 [15,]  10996.5254
 [16,]   4943.5606
 [17,]  16084.4017
 [18,]   4376.2874
 [19,]   8253.3943
 [20,]   3589.8235
 [21,]  24865.4502
 [22,]    246.9820
 [23,]   5254.7699
 [24,]   4730.8864
 [25,]  22585.9436
 [26,] 808282.0180
 [27,]  24934.1613
 [28,]   8801.9546
 [29,]  16083.2600
 [30,]   5059.4493
 [31,]   6293.3374
 [32,]   1926.3584
 [33,]  34106.1451
 [34,]   1174.3358
 [35,]  33364.0031
 [36,]  11819.4258
 [37,]  27891.1141
 [38,]   1310.2815
 [39,]  13165.2822
 [40,]   7510.7722
 [41,]  27015.2113
 [42,]   6294.6396
 [43,]  47869.4762
 [44,]  12530.7793
 [45,]  13909.0179
 [46,]   1940.0880
 [47,]   4475.1343
 [48,]  26184.6746
 [49,]   1692.4729
 [50,]   4395.2695
 [51,]  48327.6563
 [52,]   304

“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum posi

               [,1]
  [1,]    3620.3112
  [2,]    1742.9890
  [3,]    2274.4273
  [4,]    2608.3092
  [5,]    4446.7546
  [6,]    1707.6809
  [7,]    1964.6646
  [8,]    2926.7408
  [9,]    4369.9434
 [10,]    2159.3919
 [11,]    5699.4716
 [12,]    2495.0978
 [13,]    2023.6485
 [14,]    2321.2898
 [15,]    2453.2491
 [16,]    5926.3992
 [17,]    6353.3996
 [18,]    2976.0229
 [19,]    4572.6629
 [20,]    2238.3314
 [21,]    3316.2531
 [22,]    1854.5590
 [23,]    3979.5402
 [24,]    2577.8104
 [25,] 4410798.9838
 [26,]    3106.0959
 [27,]  971387.2375
 [28,]    4277.7165
 [29,]    8426.7067
 [30,]    2005.1586
 [31,]    4976.3068
 [32,]    2455.5898
 [33,]    2221.3472
 [34,]    4942.8503
 [35,]   22634.3132
 [36,]    4514.1382
 [37,]    2935.2082
 [38,]    2281.3083
 [39,]    4350.0477
 [40,]    7484.7516
 [41,]   17157.1586
 [42,]    1957.6548
 [43,]    5742.1421
 [44,]    3793.5497
 [45,]    2080.4353
 [46,]    1507.3660
 [47,]   27269.9977
 [48,]    4201.7122
 [49,]    2250.4869


“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum positive value”
“NA/Inf replaced by maximum posi

             [,1]
 [1,]   1851.6243
 [2,]   5815.0514
 [3,]   1728.2432
 [4,]   4362.1733
 [5,]   1972.7023
 [6,]   7027.6479
 [7,]   1143.4934
 [8,]   5753.4128
 [9,]   4627.2812
[10,]    853.5970
[11,]   2254.1107
[12,]   3911.8460
[13,]   1414.6254
[14,]   4114.3944
[15,]   3047.1845
[16,] 674129.7398
[17,]  14303.4850
[18,]   5398.4726
[19,]   2632.7089
[20,]   3567.9991
[21,]   5396.7533
[22,]   1397.9884
[23,]  10995.5308
[24,]   4873.7242
[25,]   1875.1230
[26,]   7245.4484
[27,]   4092.7882
[28,]   1424.7450
[29,]   6713.3998
[30,]   5992.0096
[31,]   2606.2170
[32,]   5064.8658
[33,]   2506.0091
[34,]   4939.6687
[35,]   3985.4505
[36,]   6761.5041
[37,]   1882.8807
[38,]   3744.1338
[39,]    549.1429
[40,]   4757.0784
[41,]    678.4340
[42,]   1109.1693
[43,]    720.7120
[44,]   3140.8973
[45,]    778.1472
[46,]   4397.9517
[47,]    833.8130
[48,]   4066.7932
[49,]   1660.9558
[50,]    575.3980
[51,]   3483.4540
[52,]   2074.8837
[53,]    967.0517
[54,]  15109.8388
[55,]   58

In [46]:
# Display the held-out predictions left in `pred.y.val` by the most recent
# run of the cross-validation loop.
pred.y.val

0,1
ATAC_JV_adipose/samples/AC1/MSC77/problems/chr1:206482221-223747846,-245.2289
ATAC_JV_adipose/samples/AC1/MSC77/problems/chr3:60000-66170270,-245.2289
ATAC_JV_adipose/samples/AC1/MSC80/problems/chr1:206482221-223747846,-245.2289
ATAC_JV_adipose/samples/AC1/MSC83/problems/chr3:60000-66170270,-245.2289
ATAC_JV_adipose/samples/AC1/MSC91/problems/chr1:206482221-223747846,-245.2289
ATAC_JV_adipose/samples/AC1/MSC91/problems/chr3:60000-66170270,-245.2289
ATAC_JV_adipose/samples/AC2/MSC70/problems/chr1:206482221-223747846,-245.2289
ATAC_JV_adipose/samples/AC2/MSC70/problems/chr22:20700000-50364777,-245.2289
ATAC_JV_adipose/samples/AC2/MSC70/problems/chr3:60000-66170270,-245.2289
ATAC_JV_adipose/samples/AC2/MSC72/problems/chr1:206482221-223747846,-245.2289
