In [1]:
# NOTE(review): setwd() changes the working directory for the whole session.
# Every path visible in this notebook is absolute, so confirm whether anything
# (e.g. fMeasure.R) actually depends on the working directory.
setwd("/data/Projects/ACC_NCDR/NCDR/BJMDATA/ACTION/NCHData/")
# Project helper script; presumably defines allROC_par() used in the modelling
# loop -- TODO confirm.
source(file = "/data/Projects/ACC_NCDR/NCDR/BJMDATA/ACTION/CODE/fMeasure.R")

library(glmnet)      # cv.glmnet
library(xgboost)     # xgboost, xgb.importance
#library(pROC)
library(foreach)
library(doParallel)  # registerDoParallel
library(PRROC)       # roc.curve

Loading required package: iterators
Loading required package: parallel
Loading required package: Matrix
Loaded glmnet 2.0-18



In [2]:
# Fold identifiers and feature-set variants that the modelling loop iterates
# over; both are kept as lists so `for` yields one scalar per iteration.
folds.all <- as.list(as.numeric(1:5))
modes <- as.list(c("mcnamara", "expanded"))

In [3]:
# Register a doParallel backend with 64 workers. cv.glmnet() is later called
# with parallel=TRUE, which needs a registered foreach backend.
registerDoParallel(cores=64)

In [4]:
# For every fold and feature set ("mode"), fit three classifiers of DCStatus on
# split A and score them on splits B and C:
#   LR  - plain logistic regression (glm)
#   GLM - cv.glmnet logistic regression, cross-validated on AUC
#   XGB - xgboost with a binary:logistic objective, plus a second xgboost model
#         refit on only the most important features of the first
# ROC summaries are printed; per-row predictions and the xgboost feature
# importances are written into the fold directory.
for (fold in folds.all) {
  for (mode in modes) {
    data_dir <- paste0(
      "/data/Projects/ACC_NCDR/NCDR/BJMDATA/ACTION/NCHData/multiple_imputed/fold_",
      fold
    )
    cat(paste0("Loading data for ", mode, " Model Fold ", fold, "\n"))

    # Columns that are never used as predictors: the outcome and the row id.
    id_cols <- c("DCStatus", "IDX")

    DataA <- read.csv(file = paste0(data_dir, "/trainA_", mode, ".csv"),
                      header = TRUE)
    labelsA <- DataA$DCStatus
    IDX.A <- DataA$IDX
    # drop = FALSE keeps a data frame even if a single predictor remains.
    trainA <- DataA[, !(colnames(DataA) %in% id_cols), drop = FALSE]

    DataB <- read.csv(file = paste0(data_dir, "/trainB_", mode, ".csv"),
                      header = TRUE)
    labelsB <- DataB$DCStatus
    IDX.B <- DataB$IDX
    trainB <- DataB[, !(colnames(DataB) %in% id_cols), drop = FALSE]

    DataC <- read.csv(file = paste0(data_dir, "/trainC_", mode, ".csv"),
                      header = TRUE)
    labelsC <- DataC$DCStatus
    IDX.C <- DataC$IDX
    trainC <- DataC[, !(colnames(DataC) %in% id_cols), drop = FALSE]
    # format() is required: cat() on a raw POSIXct prints the epoch seconds.
    cat("Data Loaded at", format(Sys.time()), "\n")

    # ---- LR model ----

    cat(paste0("LR ", mode, " Model Data B Fold ", fold, "\n"))
    lr.model <- glm(DCStatus ~ . - IDX, data = DataA, family = "binomial")
    lr.resp <- predict(lr.model, newdata = DataB, type = "response")
    # All scores go in scores.class0 and the labels are supplied as
    # weights.class0.
    lr.roc <- roc.curve(scores.class0 = lr.resp, weights.class0 = labelsB,
                        curve = TRUE)

    print(lr.roc)

    cat(paste0("LR ", mode, " Model Data C Fold ", fold, "\n"))
    # NOTE(review): allROC_par() is defined outside this file; its results are
    # not used or saved inside this loop, and only some calls take `[[1]]`.
    # Statement order is kept as-is in case it prints or has side effects.
    lr.f <- allROC_par(as.numeric(lr.resp), as.numeric(labelsB))[[1]]
    lr.respC <- predict(lr.model, newdata = DataC, type = "response")
    lr.roc.C <- roc.curve(scores.class0 = lr.respC, weights.class0 = labelsC,
                          curve = TRUE)
    lr.fC <- allROC_par(as.numeric(lr.respC), as.numeric(labelsC))
    print(lr.roc.C)

    # ---- GLM model ----

    cat(paste0("GLM ", mode, " Model Data B Fold ", fold, "\n"))
    glm.model <- cv.glmnet(x = data.matrix(trainA), y = as.factor(labelsA),
                           family = "binomial", type.measure = "auc",
                           parallel = TRUE)
    # predict() on a cv.glmnet fit returns a one-column matrix.
    glm.resp <- predict(glm.model, data.matrix(trainB), type = "response")
    glm.roc <- roc.curve(scores.class0 = glm.resp, weights.class0 = labelsB,
                         curve = TRUE)
    print(glm.roc)
    glm.f <- allROC_par(as.numeric(glm.resp), as.numeric(labelsB))[[1]]

    cat(paste0("GLM ", mode, " Model Data C Fold ", fold, "\n"))
    glm.respC <- predict(glm.model, data.matrix(trainC), type = "response")
    glm.roc.C <- roc.curve(scores.class0 = glm.respC, weights.class0 = labelsC,
                           curve = TRUE)
    glm.fC <- allROC_par(as.numeric(glm.respC), as.numeric(labelsC))
    print(glm.roc.C)

    # ---- XGB model ----

    cat(paste0("XGB ", mode, " Model Data B Fold ", fold, "\n"))
    # Data and label stay positional, exactly as in the original call.
    xgb.model <- xgboost(data.matrix(trainA), as.numeric(labelsA), verbose = 0,
                         nrounds = 500, eta = 0.1, max.depth = 5, nthread = 64,
                         objective = "binary:logistic",
                         save_period = NULL, save_name = NULL)
    xgb.resp <- predict(xgb.model, data.matrix(trainB))
    xgb.roc <- roc.curve(scores.class0 = xgb.resp, weights.class0 = labelsB,
                         curve = TRUE)
    print(xgb.roc)
    xgb.vars <- xgb.importance(feature_names = colnames(trainA),
                               model = xgb.model)
    xgb.f <- allROC_par(as.numeric(xgb.resp), as.numeric(labelsB))

    cat(paste0("XGB ", mode, " Model Data C Fold ", fold, "\n"))
    # Same call as for split B: the former `type = 'response'` is not an
    # xgboost predict option and has been dropped.
    xgb.respC <- predict(xgb.model, data.matrix(trainC))
    xgb.roc.C <- roc.curve(scores.class0 = xgb.respC, weights.class0 = labelsC,
                           curve = TRUE)
    xgb.fC <- allROC_par(as.numeric(xgb.respC), as.numeric(labelsC))[[1]]
    print(xgb.roc.C)

    # ---- XGB model refit on the most important features ----

    n_top <- 9
    # head() avoids NA names when the full model used fewer than n_top features.
    top_feat <- head(xgb.vars$Feature, n_top)
    # One shared, ordered column vector so training and scoring matrices are
    # guaranteed to line up column-for-column.
    top_cols <- colnames(trainA)[colnames(trainA) %in% top_feat]
    DataA_top <- DataA[, top_cols, drop = FALSE]
    DataC_top <- DataC[, top_cols, drop = FALSE]

    # The message previously said "Top 10" although 9 features are kept.
    cat(paste0("XGB ", mode, " Model Data C, Top ", n_top,
               " Features Only, Fold ", fold, "\n"))

    # NOTE(review): xgb.model, xgb.roc and xgb.f are deliberately left with
    # their original names, so from here on they refer to the reduced model
    # (the full-model importances in xgb.vars are unaffected).
    xgb.model <- xgboost(data.matrix(DataA_top), as.numeric(labelsA),
                         verbose = 0, nrounds = 250, eta = 0.1, max.depth = 4,
                         nthread = 64, objective = "binary:logistic",
                         save_period = NULL, save_name = NULL)
    xgb.resp9 <- predict(xgb.model, data.matrix(DataC_top))
    xgb.roc <- roc.curve(scores.class0 = xgb.resp9, weights.class0 = labelsC,
                         curve = TRUE)
    print(xgb.roc)
    top_xgb.vars <- xgb.importance(feature_names = colnames(DataC_top),
                                   model = xgb.model)
    xgb.f <- allROC_par(as.numeric(xgb.resp9), as.numeric(labelsC))

    # ---- Persist predictions ----

    # Each output is the scored split with one extra prediction column.
    lr_predsB.df <- DataB
    lr_predsB.df$lr <- lr.resp

    lr_predsC.df <- DataC
    lr_predsC.df$lr <- lr.respC

    # as.numeric() stores a plain vector instead of a one-column matrix.
    glm_predsB.df <- DataB
    glm_predsB.df$glm <- as.numeric(glm.resp)

    glm_predsC.df <- DataC
    glm_predsC.df$glm <- as.numeric(glm.respC)

    xgb_predsB.df <- DataB
    xgb_predsB.df$xgb <- xgb.resp

    xgb_predsC.df <- DataC
    xgb_predsC.df$xgb <- xgb.respC

    write.csv(lr_predsB.df,
              file = paste0(data_dir, "/lr_preds_trainB_", mode, ".csv"),
              row.names = FALSE)
    write.csv(lr_predsC.df,
              file = paste0(data_dir, "/lr_preds_trainC_", mode, ".csv"),
              row.names = FALSE)
    write.csv(glm_predsB.df,
              file = paste0(data_dir, "/glm_preds_trainB_", mode, ".csv"),
              row.names = FALSE)
    write.csv(glm_predsC.df,
              file = paste0(data_dir, "/glm_preds_trainC_", mode, ".csv"),
              row.names = FALSE)
    write.csv(xgb_predsB.df,
              file = paste0(data_dir, "/xgb_preds_trainB_", mode, ".csv"),
              row.names = FALSE)
    write.csv(xgb_predsC.df,
              file = paste0(data_dir, "/xgb_preds_trainC_", mode, ".csv"),
              row.names = FALSE)

    preds.df <- data.frame(xgb = xgb.respC, xgb_9 = xgb.resp9,
                           DCStatus = labelsC, IDX = IDX.C)
    # Fixed: the extension was missing its dot ("...<mode>RData"). Anything
    # that loads the old file name must be updated to "...<mode>.RData".
    save(preds.df, file = paste0(data_dir, "/preds_trainC_", mode, ".RData"))

    write.csv(xgb.vars,
              file = paste0(data_dir, "/importance_xgb_", mode, ".csv"),
              row.names = FALSE)
  }
}

Loading data for mcnamara Model Fold 1
Data Loaded at 1572557498 
LR mcnamara Model Data B Fold 1

  ROC curve

    Area under curve:
     0.8788571 

    Curve for scores from  0.0001497355  to  0.984533 
    ( can be plotted with plot(x) )

LR mcnamara Model Data C Fold 1

  ROC curve

    Area under curve:
     0.8797584 

    Curve for scores from  0.0001319108  to  0.9785889 
    ( can be plotted with plot(x) )

GLM mcnamara Model Data B Fold 1

  ROC curve

    Area under curve:
     0.8778732 

    Curve for scores from  0.0003164242  to  0.9740675 
    ( can be plotted with plot(x) )

GLM mcnamara Model Data C Fold 1

  ROC curve

    Area under curve:
     0.8787988 

    Curve for scores from  0.000355937  to  0.9727103 
    ( can be plotted with plot(x) )

XGB mcnamara Model Data B Fold 1

  ROC curve

    Area under curve:
     0.8869059 

    Curve for scores from  0.0001105995  to  0.983832 
    ( can be plotted with plot(x) )

XGB mcnamara Model Data C Fold 1

  ROC curv

“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8907786 

    Curve for scores from  0.0001057743  to  0.9957972 
    ( can be plotted with plot(x) )

LR expanded Model Data C Fold 1


“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.890791 

    Curve for scores from  8.79389e-05  to  0.9968864 
    ( can be plotted with plot(x) )

GLM expanded Model Data B Fold 1

  ROC curve

    Area under curve:
     0.8898705 

    Curve for scores from  0.0002429319  to  0.9905772 
    ( can be plotted with plot(x) )

GLM expanded Model Data C Fold 1

  ROC curve

    Area under curve:
     0.8896918 

    Curve for scores from  0.0002869832  to  0.9923793 
    ( can be plotted with plot(x) )

XGB expanded Model Data B Fold 1

  ROC curve

    Area under curve:
     0.8990791 

    Curve for scores from  3.68377e-05  to  0.9998673 
    ( can be plotted with plot(x) )

XGB expanded Model Data C Fold 1

  ROC curve

    Area under curve:
     0.8988485 

    Curve for scores from  7.207762e-05  to  0.9995818 
    ( can be plotted with plot(x) )

XGB expanded Model Data C, Top 10 Features Only, Fold 1

  ROC curve

    Area under curve:
     0.8867045 

    Curve for scores from  0.000

“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8900259 

    Curve for scores from  9.485306e-05  to  0.9965597 
    ( can be plotted with plot(x) )

LR expanded Model Data C Fold 2


“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8896695 

    Curve for scores from  8.106025e-05  to  0.9968727 
    ( can be plotted with plot(x) )

GLM expanded Model Data B Fold 2

  ROC curve

    Area under curve:
     0.8882911 

    Curve for scores from  0.0002758723  to  0.9902463 
    ( can be plotted with plot(x) )

GLM expanded Model Data C Fold 2

  ROC curve

    Area under curve:
     0.8886665 

    Curve for scores from  0.0003358818  to  0.9900098 
    ( can be plotted with plot(x) )

XGB expanded Model Data B Fold 2

  ROC curve

    Area under curve:
     0.899267 

    Curve for scores from  5.225312e-05  to  0.9997701 
    ( can be plotted with plot(x) )

XGB expanded Model Data C Fold 2

  ROC curve

    Area under curve:
     0.8982547 

    Curve for scores from  7.318027e-05  to  0.9997568 
    ( can be plotted with plot(x) )

XGB expanded Model Data C, Top 10 Features Only, Fold 2

  ROC curve

    Area under curve:
     0.8864133 

    Curve for scores from  0.0

“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8884935 

    Curve for scores from  1.972024e-05  to  0.9959637 
    ( can be plotted with plot(x) )

LR expanded Model Data C Fold 3


“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8913349 

    Curve for scores from  2.113794e-05  to  0.9950177 
    ( can be plotted with plot(x) )

GLM expanded Model Data B Fold 3

  ROC curve

    Area under curve:
     0.8864871 

    Curve for scores from  0.0003960605  to  0.9842152 
    ( can be plotted with plot(x) )

GLM expanded Model Data C Fold 3

  ROC curve

    Area under curve:
     0.889809 

    Curve for scores from  0.0003320773  to  0.9882503 
    ( can be plotted with plot(x) )

XGB expanded Model Data B Fold 3

  ROC curve

    Area under curve:
     0.8988272 

    Curve for scores from  6.852244e-05  to  0.9988104 
    ( can be plotted with plot(x) )

XGB expanded Model Data C Fold 3

  ROC curve

    Area under curve:
     0.8980235 

    Curve for scores from  2.888531e-05  to  0.9991851 
    ( can be plotted with plot(x) )

XGB expanded Model Data C, Top 10 Features Only, Fold 3

  ROC curve

    Area under curve:
     0.8867618 

    Curve for scores from  0.0

“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8926649 

    Curve for scores from  9.849248e-05  to  0.9960056 
    ( can be plotted with plot(x) )

LR expanded Model Data C Fold 4


“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8886465 

    Curve for scores from  0.0001448992  to  0.9970803 
    ( can be plotted with plot(x) )

GLM expanded Model Data B Fold 4

  ROC curve

    Area under curve:
     0.8913188 

    Curve for scores from  0.0002898518  to  0.9861692 
    ( can be plotted with plot(x) )

GLM expanded Model Data C Fold 4

  ROC curve

    Area under curve:
     0.8867147 

    Curve for scores from  0.0003671274  to  0.9934594 
    ( can be plotted with plot(x) )

XGB expanded Model Data B Fold 4

  ROC curve

    Area under curve:
     0.9015912 

    Curve for scores from  6.727035e-05  to  0.9995533 
    ( can be plotted with plot(x) )

XGB expanded Model Data C Fold 4

  ROC curve

    Area under curve:
     0.896325 

    Curve for scores from  6.86623e-05  to  0.999817 
    ( can be plotted with plot(x) )

XGB expanded Model Data C, Top 10 Features Only, Fold 4

  ROC curve

    Area under curve:
     0.8843279 

    Curve for scores from  0.000

“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8890195 

    Curve for scores from  4.913795e-05  to  0.9972655 
    ( can be plotted with plot(x) )

LR expanded Model Data C Fold 5


“prediction from a rank-deficient fit may be misleading”


  ROC curve

    Area under curve:
     0.8906347 

    Curve for scores from  7.825826e-05  to  0.992952 
    ( can be plotted with plot(x) )

GLM expanded Model Data B Fold 5

  ROC curve

    Area under curve:
     0.8878765 

    Curve for scores from  0.0003061904  to  0.99226 
    ( can be plotted with plot(x) )

GLM expanded Model Data C Fold 5

  ROC curve

    Area under curve:
     0.8890392 

    Curve for scores from  0.000303824  to  0.9853842 
    ( can be plotted with plot(x) )

XGB expanded Model Data B Fold 5

  ROC curve

    Area under curve:
     0.898468 

    Curve for scores from  3.338342e-05  to  0.9995847 
    ( can be plotted with plot(x) )

XGB expanded Model Data C Fold 5

  ROC curve

    Area under curve:
     0.9004192 

    Curve for scores from  4.045726e-05  to  0.9985934 
    ( can be plotted with plot(x) )

XGB expanded Model Data C, Top 10 Features Only, Fold 5

  ROC curve

    Area under curve:
     0.8871797 

    Curve for scores from  0.00013