In [1]:
library(survival)
library(penaltyLearning)
library(rjson)

In [2]:
# Download the three CSV tables that make up one dataset.
#
# dataname: name of the dataset folder below the remote data directory.
#
# Returns a list with the data frames `inputs`, `labels` and `folds`; in each
# of them the first CSV column is used as the row names.
data_import <- function(dataname) {
  # Remote folder that holds every file of this dataset.
  dataset_url <- paste0("https://raw.githubusercontent.com/avinashbarnwal/GSOC-2019/master/AFT/test/data/", dataname, "/")

  # All three files are read with identical options, so share one reader.
  fetch_csv <- function(relative_path) {
    read.table(
      paste0(dataset_url, relative_path),
      sep = ",",
      header = TRUE,
      stringsAsFactors = FALSE,
      row.names = 1
    )
  }

  list(
    inputs = fetch_csv("inputs.csv"),
    labels = fetch_csv("outputs.csv"),
    folds  = fetch_csv("cv/equal_labels/folds.csv")
  )
}

In [3]:
# Clean the feature table before modelling.
#
# inputs: data frame of features, one row per sample (row names identify the
#         samples).
# labels: label table; it is passed through unchanged.
#
# Columns that contain NA or infinite values, and columns reported by
# getNonVarCols(), are removed from `inputs`.
#
# Returns a list with the cleaned `inputs` and the untouched `labels`.
data_massage <- function(inputs, labels) {
    # Keep the sample identifiers so they can be restored after the frame is
    # rebuilt column by column below.
    rownamesInput <- rownames(inputs)

    # Map +/-Inf to NA so that a single NA filter also catches infinite values.
    inputs <- do.call(
        data.frame,
        lapply(inputs, function(x) replace(x, is.infinite(x), NA))
    )

    naColumns  <- colnames(inputs)[colSums(is.na(inputs)) > 0]
    noVarCol   <- getNonVarCols(inputs)
    removeCols <- c(naColumns, noVarCol)

    # drop = FALSE keeps a data frame even when only one column survives;
    # without it the result collapses to a plain vector and the row-name
    # assignment below fails.
    inputs <- inputs[, !(colnames(inputs) %in% removeCols), drop = FALSE]
    rownames(inputs) <- rownamesInput

    #labels$min.log.lambda = unlist(lapply(labels$min.log.lambda,exp))
    #labels$max.log.lambda = unlist(lapply(labels$max.log.lambda,exp))
    list(inputs = inputs, labels = labels)
}

In [4]:
# Split features and interval labels into a training part and a held-out part.
#
# foldNo: fold value whose rows form the held-out (validation) set.
# folds:  data frame with a `fold` column; its row names identify samples.
# inputs: feature table with sample identifiers as row names.
# labels: label table with sample identifiers as row names and the columns
#         `min.log.lambda` and `max.log.lambda`.
#
# Returns a list with the matrices X, X.val, y.lower, y.lower.val, y.upper and
# y.upper.val. Rows of `inputs` and `labels` are each taken in their own
# order, so both tables are expected to list the samples in the same order.
getXY <- function(foldNo, folds, inputs, labels) {
    held_out_ids <- rownames(subset(folds, fold == foldNo))
    fitting_ids  <- rownames(subset(folds, fold != foldNo))

    # Rows of `tbl` whose row name is one of `ids`, in the table's own order.
    rows_for <- function(tbl, ids) {
        tbl[rownames(tbl) %in% ids, , drop = FALSE]
    }

    fit_labels <- rows_for(labels, fitting_ids)
    val_labels <- rows_for(labels, held_out_ids)

    list(
        X           = as.matrix(rows_for(inputs, fitting_ids)),
        X.val       = as.matrix(rows_for(inputs, held_out_ids)),
        y.lower     = as.matrix(fit_labels$min.log.lambda),
        y.lower.val = as.matrix(val_labels$min.log.lambda),
        y.upper     = as.matrix(fit_labels$max.log.lambda),
        y.upper.val = as.matrix(val_labels$max.log.lambda)
    )
}

In [5]:
# Names of the columns whose sample variance is zero (constant columns).
#
# data: data frame or matrix of numeric columns.
# tol:  largest variance still treated as "no variation". The default 0
#       reproduces the exact zero-variance test.
#
# Returns a character vector of column names. Columns whose variance cannot
# be computed (NA, e.g. because the column contains missing values) are not
# reported; previously they leaked an NA entry into the result.
getNonVarCols <- function(data, tol = 0) {
    variances   <- apply(data, 2, var)
    # Guard against NA variances: indexing with an NA logical would otherwise
    # add an NA name to the result.
    is_constant <- !is.na(variances) & variances <= tol
    names(variances)[is_constant]
}

In [6]:
# Element-wise test of whether a prediction falls inside its target interval.
#
# pred:     predicted values.
# y_lower:  lower interval limits (inclusive).
# y_higher: upper interval limits (inclusive).
#
# Returns a logical value per element: TRUE when y_lower <= pred <= y_higher.
getaccuracy <- function(pred, y_lower, y_higher) {
    at_or_above_lower <- pred >= y_lower
    at_or_below_upper <- pred <= y_higher
    at_or_above_lower & at_or_below_upper
}

In [7]:
# Dataset folder names, in the order in which they are evaluated below.
data_name_range <- c(
  "ATAC_JV_adipose",
  "CTCF_TDH_ENCODE",
  "H3K27ac-H3K4me3_TDHAM_BP",
  "H3K27ac_TDH_some",
  "H3K36me3_AM_immune",
  "H3K27me3_RL_cancer",
  "H3K27me3_TDH_some",
  "H3K36me3_TDH_ENCODE",
  "H3K36me3_TDH_immune",
  "H3K36me3_TDH_other"
)
# Empty list reserved for run-time measurements.
run_time <- list()

In [8]:
# Cross-validate IntervalRegressionCV on one dataset and store the results.
#
# data_name: name of the dataset folder understood by data_import().
#
# For every distinct value in the `fold` column the model is fitted on the
# remaining folds and evaluated on the held-out one. Files written below
# 'intervalCV_result/<data_name>/':
#   <fold>.csv     predictions with the lower/upper target limits,
#   accuracy.JSON  share of predictions inside their interval, keyed by fold,
#   run_time.JSON  wall-clock time per fold in seconds, keyed by fold.
#
# Called for its side effects; returns NULL invisibly.
train_test_model <- function(data_name) {

    imported <- data_import(data_name)
    folds    <- imported$folds
    massaged <- data_massage(imported$inputs, imported$labels)
    inputs   <- massaged$inputs
    labels   <- massaged$labels
    fold_iter <- unique(folds$fold)

    # Create the output folder up front so the writes below cannot fail just
    # because it is missing.
    out_dir <- paste0("intervalCV_result/", data_name, "/")
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    # Both result lists are local and keyed by the fold value itself. Storing
    # at position i and naming positionally afterwards mislabels the entries
    # whenever the fold values are not exactly 1..k in that order.
    accuracy_fold <- list()
    run_time      <- list()

    for (i in fold_iter) {

        fold_key   <- as.character(i)
        start_time <- Sys.time()

        xy         <- getXY(i, folds, inputs, labels)
        target.mat <- cbind(xy$y.lower, xy$y.upper)
        fit        <- IntervalRegressionCV(xy$X, target.mat)
        pred.y.val <- predict(fit, xy$X.val)

        pred_data           <- data.frame(pred.y.val, xy$y.lower.val, xy$y.upper.val)
        colnames(pred_data) <- c("predict", "y.lower", "y.upper")

        # getaccuracy() is vectorised, so the mean of its result is the share
        # of predictions that fall inside their target interval.
        inside_interval <- getaccuracy(
            as.numeric(pred.y.val),
            as.numeric(xy$y.lower.val),
            as.numeric(xy$y.upper.val)
        )
        accuracy_fold[[fold_key]] <- mean(inside_interval)

        write.table(pred_data, paste0(out_dir, i, ".csv"), sep = ",", col.names = NA)

        end_time <- Sys.time()
        # Fix the unit explicitly: a plain difference of two times picks
        # secs/mins/hours automatically, which would mix units across folds.
        run_time[[fold_key]] <- as.numeric(
            difftime(end_time, start_time, units = "secs")
        )

    }

    write(toJSON(accuracy_fold), file = paste0(out_dir, "accuracy.JSON"))
    write(toJSON(run_time), file = paste0(out_dir, "run_time.JSON"))
    invisible(NULL)
}

In [9]:
# Evaluate the 1st dataset listed in data_name_range.
train_test_model(data_name = data_name_range[1])

Loading required namespace: future.apply

Loading required namespace: directlabels



In [10]:
# Evaluate the 2nd dataset listed in data_name_range.
train_test_model(data_name = data_name_range[2])

In [11]:
# Evaluate the 3rd dataset listed in data_name_range.
train_test_model(data_name = data_name_range[3])

In [12]:
# Evaluate the 4th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[4])

In [13]:
# Evaluate the 5th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[5])

In [14]:
# Evaluate the 6th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[6])

In [15]:
# Evaluate the 7th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[7])

In [16]:
# Evaluate the 8th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[8])

In [17]:
# Evaluate the 9th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[9])

In [18]:
# Evaluate the 10th dataset listed in data_name_range.
train_test_model(data_name = data_name_range[10])

# Save Accuracy

# Compare Results