In [2]:
library(mboost)
library(survival)
library(penaltyLearning)
library(Hmisc)
library(caret)
library(rjson)

In [3]:
# Download one data set (features, interval labels and fold assignment)
# from the GSOC-2019 GitHub repository.
#
# dataname: name of the data set directory under .../data/.
# Returns a list with elements `inputs`, `labels` and `folds`, each a
# data.frame whose row names come from the first column of its CSV file.
data_import <- function(dataname) {
  base_url <- paste0('https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/neuroblastoma-data-master/data/', dataname, '/')

  # All three files share the same layout, so read them the same way.
  read_remote_csv <- function(relative_path) {
    read.table(
      paste0(base_url, relative_path),
      sep = ",",
      header = TRUE,
      stringsAsFactors = FALSE,
      row.names = 1
    )
  }

  list(
    inputs = read_remote_csv('inputs.csv'),
    labels = read_remote_csv('outputs.csv'),
    folds  = read_remote_csv('cv/equal_labels/folds.csv')
  )
}

In [4]:
# Clean the feature table and transform the interval limits.
#
# inputs: data.frame of features, one row per observation.
# labels: data.frame with columns `min.log.lambda` and `max.log.lambda`.
#
# Steps:
#   * infinite feature values are replaced by NA;
#   * columns containing any NA and columns reported by getNonVarCols()
#     are removed;
#   * exp() is applied to both limit columns of `labels` (the column names
#     are left unchanged).
#
# Returns a list with the cleaned `inputs` (original row names restored)
# and the transformed `labels`.
data_massage <- function(inputs, labels) {
    rownamesInput <- rownames(inputs)
    inputs        <- do.call(data.frame, lapply(inputs, function(x) replace(x, is.infinite(x), NA)))
    naColumns     <- colnames(inputs)[colSums(is.na(inputs)) > 0]
    noVarCol      <- getNonVarCols(inputs)
    keepCols      <- !(colnames(inputs) %in% c(naColumns, noVarCol))
    # drop = FALSE keeps a data.frame even when a single column survives;
    # otherwise the result collapses to a vector and the row names cannot
    # be restored below.
    inputs           <- inputs[, keepCols, drop = FALSE]
    rownames(inputs) <- rownamesInput
    # exp() is vectorized, no element-wise lapply() needed.
    labels$min.log.lambda <- exp(labels$min.log.lambda)
    labels$max.log.lambda <- exp(labels$max.log.lambda)
    list(inputs = inputs, labels = labels)
}

In [5]:
# Split features and interval limits into a training and a validation part
# for one cross-validation fold.
#
# foldNo: fold used for validation; every other fold is used for training.
# folds:  data.frame with a `fold` column; row names are observation ids.
# inputs: feature table; row names are observation ids.
# labels: data.frame with `min.log.lambda` and `max.log.lambda` columns;
#         row names are observation ids.
#
# Returns a list with
#   X, X.val             feature matrices (rows in the order of `inputs`),
#   y.lower, y.upper     one-column matrices of the training limits,
#   y.lower.val, y.upper.val  one-column matrices of the validation limits.
# The limit matrices are aligned row by row with the corresponding feature
# matrix by observation id, even if `labels` is ordered differently from
# `inputs`. An error is raised when a selected observation has no label.
getXY <- function(foldNo, folds, inputs, labels) {
    # which() drops rows whose fold is NA, as subset() would.
    is.test   <- folds$fold == foldNo
    test.id   <- rownames(folds)[which(is.test)]
    train.id  <- rownames(folds)[which(!is.test)]
    input.id  <- rownames(inputs)
    label.id  <- rownames(labels)

    # Extract the feature rows for `ids` together with the matching limits.
    pick <- function(ids) {
        keep      <- input.id %in% ids
        kept.id   <- input.id[keep]
        label.row <- match(kept.id, label.id)
        if (anyNA(label.row)) {
            stop("labels has no row for input id(s): ",
                 paste(kept.id[is.na(label.row)], collapse = ", "),
                 call. = FALSE)
        }
        list(X     = as.matrix(inputs[keep, , drop = FALSE]),
             lower = as.matrix(labels$min.log.lambda[label.row]),
             upper = as.matrix(labels$max.log.lambda[label.row]))
    }

    train <- pick(train.id)
    val   <- pick(test.id)

    list(X           = train$X,
         X.val       = val$X,
         y.lower     = train$lower,
         y.lower.val = val$lower,
         y.upper     = train$upper,
         y.upper.val = val$upper)
}

In [6]:
# Names of the columns of `data` whose sample variance is exactly zero.
#
# data: data.frame or matrix with numeric columns.
# Returns a character vector of column names (empty when no column is
# constant). Columns whose variance is NA (e.g. because they contain NA)
# are not reported.
getNonVarCols <- function(data) {
    # Use the argument, not a global object that happens to be in scope.
    var_columns <- apply(data, 2, var)
    # which() skips NA variances instead of producing NA names.
    names(var_columns)[which(var_columns == 0)]
}

In [7]:
# Set Parameters
# Names of the data set directories that can be passed to data_import().
dataNameRange <- c(
  "ATAC_JV_adipose",
  "CTCF_TDH_ENCODE",
  "H3K27ac-H3K4me3_TDHAM_BP",
  "H3K27ac_TDH_some",
  "H3K36me3_AM_immune"
)
# Candidate scale values and error distributions.
sigma_range        <- c(1, 2, 5, 10, 100)
distribution_range <- c("normal", "logistic", "extreme")
# Boosting settings.
learning_rate <- 0.1
num_round     <- 200

In [8]:
# Download the first data set listed in dataNameRange.
res    <- data_import(dataNameRange[1])
inputs <- res[["inputs"]]
labels <- res[["labels"]]
folds  <- res[["folds"]]

# Replace the raw tables by their cleaned / transformed versions.
resDataMassage <- data_massage(inputs, labels)
inputs         <- resDataMassage[["inputs"]]
labels         <- resDataMassage[["labels"]]

# One accuracy slot per distinct fold id.
fold_iter     <- unique(folds[["fold"]])
accuracy_fold <- numeric(length(fold_iter))

In [9]:
# Element-wise test whether a prediction lies inside the closed interval
# [y_lower, y_higher].
#
# pred, y_lower, y_higher: numeric vectors (or matrices) of compatible size.
# Returns a logical vector: TRUE where y_lower <= pred <= y_higher.
getaccuracy <- function(pred, y_lower, y_higher) {
    not_below <- pred >= y_lower
    not_above <- pred <= y_higher
    not_below & not_above
}

In [10]:
# For every fold: fit an interval regression model on the remaining folds,
# predict the held-out fold, record the share of predictions that fall
# inside their target interval and save the predictions as CSV.
for (i in fold_iter) {
    print(i)
    res   <- getXY(i, folds, inputs, labels)
    X     <- res$X
    X.val <- res$X.val
    # `labels` holds exp() of the limits after data_massage(); take log()
    # to work on the log scale again.
    y.lower     <- log(res$y.lower)
    y.lower.val <- log(res$y.lower.val)
    y.upper     <- log(res$y.upper)
    y.upper.val <- log(res$y.upper.val)

    # Two-column target matrix: lower limit, upper limit.
    target.mat <- cbind(y.lower, y.upper)
    fit        <- IntervalRegressionCV(X, target.mat)
    pred.y.val <- predict(fit, X.val)

    pred_data           <- data.frame(pred.y.val, y.lower.val, y.upper.val)
    colnames(pred_data) <- c("predict", "y.lower", "y.upper")

    # getaccuracy() is vectorized, so no per-element mapply() is needed.
    accuracy_fold[i] <- sum(getaccuracy(pred.y.val, y.lower.val, y.upper.val)) / length(pred.y.val)

    # NOTE: paste() with its default sep = " " puts spaces around the fold
    # number in the file name. Kept unchanged because the comparison step
    # of this notebook reads the files back using the same expression.
    fileName <- paste('../../../../result/ATAC_JV_adipose/intervalCV/',i,".csv")
    write.table(pred_data, fileName, sep = ",", col.names = NA)
}

[1] 2


Loading required namespace: future.apply

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight



[1] 4


Loading required namespace: future.apply

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight



[1] 1


Loading required namespace: future.apply

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight



[1] 3


Loading required namespace: future.apply

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight

Loading required namespace: directlabels

install.packages("directlabels") for more informative labels on plot.weight



# Save Accuracy

In [11]:
# Serialise the per-fold accuracies as a JSON string.
jsonAccuracy <- toJSON(accuracy_fold)

In [12]:
# Store the JSON string next to the per-fold prediction files.
write(
  x    = jsonAccuracy,
  file = "../../../../result/ATAC_JV_adipose/intervalCV/accuracy.JSON"
)

# Compare Results

In [15]:
# For every fold: load the benchmark predictions (L1reg_linear_all) and the
# predictions written by this notebook, check that both cover the same
# observations, and save a scatter plot of old versus new predictions.
for (i in fold_iter) {
    # NOTE(review): the fold number is used both as the test fold and as the
    # randomTrainOrderings index -- confirm that this is intended.
    resFileName  <- paste0("https://raw.githubusercontent.com/tdhock/neuroblastoma-data/master/data/ATAC_JV_adipose/cv/equal_labels/testFolds/", i, "/randomTrainOrderings/", i, "/models/L1reg_linear_all/predictions.csv")
    out_res_fold <- read.table(resFileName, sep = ",", header = TRUE, stringsAsFactors = FALSE, row.names = 1)
    # drop = FALSE keeps a data.frame (and its row names) even if the file
    # has a single data column.
    out_res_fold <- out_res_fold[order(row.names(out_res_fold)), , drop = FALSE]

    # Same file name expression (including the spaces inserted by paste())
    # as used when the predictions were written.
    fileName    <- paste('../../../../result/ATAC_JV_adipose/intervalCV/',i,".csv")
    in_res_fold <- read.table(fileName, sep = ",", header = TRUE, stringsAsFactors = FALSE, row.names = 1)
    in_res_fold <- in_res_fold[order(row.names(in_res_fold)), , drop = FALSE]

    # Both tables should list exactly the same observation ids.
    seq_in  <- rownames(in_res_fold)
    seq_out <- rownames(out_res_fold)
    print(identical(seq_in, seq_out))

    # The benchmark prediction is taken from the last column of its file.
    pred_col     <- colnames(out_res_fold)[ncol(out_res_fold)]
    out_res_fold <- data.frame(pred.old = out_res_fold[[pred_col]], row.names = seq_out)
    in_res_fold  <- data.frame(pred.new = in_res_fold[["predict"]], row.names = seq_in)
    compared_data <- merge(x = in_res_fold, y = out_res_fold, by = "row.names", all = TRUE)

    plot_name <- paste0('../../../../result/ATAC_JV_adipose/intervalCV/compare_benchmark_', i, '.png')
    png(filename = plot_name)
    # x: benchmark prediction, y: prediction from this notebook.
    # The device is closed even when plotting fails.
    tryCatch(
        plot(compared_data$pred.old, compared_data$pred.new,
             xlab = "Old Prediction", ylab = "New Prediction"),
        finally = dev.off()
    )
}

[1] TRUE
[1] TRUE
[1] TRUE
[1] TRUE
