In [2]:
library(tidyverse)
library(MASS)
library(pROC)
library(dummies)
# Install the pinned splitTools 0.3.1 source tarball, but only when the package
# is not already available, so re-running the cell does not reinstall it.
# A tarball URL has to be passed as `pkgs` with `repos = NULL`; when it is given
# as `repos`, R looks for a package index below that URL and finds nothing.
# NOTE(review): CRAN moves superseded tarballs under src/contrib/Archive/<pkg>/,
# so this URL may need updating - confirm it still resolves.
# If the install fails on a stale lock directory, add INSTALL_opts = '--no-lock'.
if (!requireNamespace("splitTools", quietly = TRUE)) {
    install.packages(
        "http://cran.us.r-project.org/src/contrib/splitTools_0.3.1.tar.gz",
        repos = NULL,
        type = "source"
    )
}
library(splitTools)
library(multiROC)
library(doParallel)
library(foreach)
library(doRNG)
library(rngtools)


Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select

Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var

“package ‘dummies’ was built under R version 3.6.3”dummies-1

In [None]:

# Class that is classified against all remaining samples (one-vs-rest).
class_type <- "Bile_Duct_Cancer"

# Feature table plus the sample -> sample_type lookup (space separated, no header row).
data <- readRDS("../../data/ATAC_predictions_train_20_predict_80/Full_data_ATAC_pred_lasso_formatted_standardized.rds")
sample_types <- read.table("../../data/sample_types.txt", header = FALSE, sep = " ")
colnames(sample_types) <- c("sample", "sample_type")

# Attach the label to every sample, then drop the identifier so it is not used
# as a predictor. dplyr::select is namespaced because MASS masks select().
data <- merge(data, sample_types, by="sample")
data <- data %>% dplyr::select(-sample)


if (class_type != "Healthy"){
    # Duodenal cancer samples are excluded from the cancer-type comparisons.
    data <- subset(data, sample_type != "Duodenal_Cancer")
    data$sample_type <- as.factor(data$sample_type)
    # Remove the now-unused factor level(s). droplevels() takes no level name:
    # its second argument (`except`) selects columns to leave untouched.
    data <- droplevels(data)
    }

# Keep the original multi-class labels before sample_type is collapsed to two classes.
observed  <- data$sample_type

# Collapse the label to two classes; "Cancer" is the level whose posterior
# probability is extracted by cross_validation().
if (class_type == "Healthy"){
    data <- data %>% mutate(sample_type = ifelse(sample_type == class_type, "Healthy", "Cancer"))
} else{
    data <- data %>% mutate(sample_type = ifelse(sample_type == class_type, "Cancer", "Other"))
}

data$sample_type <- as.factor(data$sample_type)
print("Sample_type levels")
levels(data$sample_type)

# Repeated k-fold cross-validation of an LDA classifier, with the repetitions
# run in parallel on a 2-worker PSOCK cluster.
#
# Arguments:
#   dataset          data frame with a factor column `sample_type` that has a
#                    level "Cancer"; all other columns are used as predictors
#                    (model formula `sample_type ~ .`).
#   k_inner_cv       number of folds within one repetition.
#   k_outer_cv       number of repetitions of the whole k-fold procedure.
#   class_type       string used in log messages and to name the prediction
#                    column "<class_type>_pred".
#   observed_labels  optional vector with one label per row of `dataset`, stored
#                    in the `observed` column of the result. When NULL (default)
#                    the object `observed` from the enclosing environment is
#                    used, which is what the function did before this argument
#                    existed.
#
# Returns a data frame with nrow(dataset) * k_outer_cv rows and the columns
# `observed`, `CV_rep` and "<class_type>_pred" (out-of-fold posterior
# probability of the level "Cancer").
# NOTE(review): the prediction column always holds P("Cancer"), also when
# class_type is "Healthy" - confirm that this is intended.
cross_validation <- function(dataset, k_inner_cv, k_outer_cv, class_type,
                             observed_labels = NULL){

    if (is.null(observed_labels)){
        observed_labels <- observed
    }
    # Fail before any model is fitted if the labels cannot be aligned with the rows.
    stopifnot(length(observed_labels) == nrow(dataset))

    cl <- makePSOCKcluster(2, outfile="")
    registerDoParallel(cl)
    # Shut the workers down even when an iteration fails; otherwise the
    # worker processes would be left running.
    on.exit({
        stopCluster(cl)
        registerDoSEQ()
    }, add = TRUE)

    return_tibble <- foreach(i = seq_len(k_outer_cv),
                                .inorder = TRUE,
                                .options.RNG = 1985,
                                .combine = "rbind",
                                .packages = c("splitTools", "MASS", "tidyverse")) %dorng% { # repeated Cross-validation loop

        message(class_type)
        message(paste0("CV repetition number: ", i))
        # Re-seeding with the repetition index makes the fold assignment depend
        # only on i.
        set.seed(i)
        # With its default settings create_folds() returns, per fold, the row
        # indices of the training (in-sample) part.
        folds <- create_folds(dataset$sample_type, k = k_inner_cv)
        message(folds)
        predicted <- rep(NA, nrow(dataset))

        for (fold in folds){
            message(paste0("CV inner loop, CV rep number: ", i))
            # Rows outside `fold` are held out; the label is removed before predicting.
            testdata  <- dataset[-fold,]
            testdata  <- testdata %>% dplyr::select(-sample_type)
            traindata <- dataset[fold,]

            fit       <- lda(sample_type ~ ., data=traindata)
            posterior <- predict(fit, testdata)$posterior
            # Each row is held out in exactly one fold, so after the loop every
            # entry of `predicted` has been filled once.
            predicted[-fold] <- posterior[, "Cancer"]
            }

        tibble("{class_type}_pred" := predicted)
    }

    # Repetitions are row-bound in order (.inorder = TRUE), so the labels and the
    # repetition index are repeated to line up with them.
    cbind(tibble(observed = rep(observed_labels, k_outer_cv),
                 CV_rep = rep(seq_len(k_outer_cv), each=nrow(dataset))),
          return_tibble)
}

# Two repetitions of 10-fold cross-validation for the selected class.
results <- cross_validation(
    dataset    = data,
    k_inner_cv = 10,
    k_outer_cv = 2,
    class_type = class_type
)

# Quick look at the first rows of the prediction table.
print("Results: ")
head(results)

# Disabled: write the predictions to the Snakemake output when run in the pipeline.
#saveRDS(results, file = snakemake@output[["predictions"]])

[1] "Sample_type levels"


In [6]:
# First six rows of the prediction table (repetition 1 comes first).
head(results, n = 6L)

observed,CV_rep,Bile_Duct_Cancer_pred
Gastric_cancer,1,0.0014995081
Gastric_cancer,1,0.0120043193
Gastric_cancer,1,0.9741233205
Gastric_cancer,1,0.0014776515
Ovarian_Cancer,1,0.0007374379
Ovarian_Cancer,1,0.0161924605


In [7]:
# First rows of the second cross-validation repetition.
results %>% dplyr::filter(CV_rep == 2) %>% head()

observed,CV_rep,Bile_Duct_Cancer_pred
Gastric_cancer,2,0.000910328
Gastric_cancer,2,0.0155596563
Gastric_cancer,2,0.7854492666
Gastric_cancer,2,0.0015604988
Ovarian_Cancer,2,0.0005565786
Ovarian_Cancer,2,0.0307183443
