In [1]:
library(tidyverse)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
── Attaching packages ──────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 3.1.1     ✔ purrr   0.3.4
✔ tibble  3.1.4     ✔ dplyr   1.0.7
✔ tidyr   1.1.3     ✔ stringr 1.4.0
✔ readr   1.3.1     ✔ forcats 0.4.0
── Conflicts ─────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


## Cancer type - models LDA, SVM_linear, SVM_radial, SVM_poly, Boosting

In [2]:
observed <- readRDS("../Classification_output/ATAC/Observed_cancer_types.rds")
length(observed)

In [3]:

MODELS_BINOMIAL = c("LDA", "SVM_linear", "SVM_radial", "Boosting", "SVM_poly")
CLASSES = c("Bile_Duct_Cancer", 
            "Breast_Cancer", 
            "Colorectal_Cancer", 
            "Gastric_cancer", 
            "Lung_Cancer", 
            "Ovarian_Cancer", 
            "Pancreatic_Cancer")

CV_REPS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
FOLDS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")

In [4]:
target_data <- "Methylation_and_ATAC"
data_type <- "Full_data"

for (model_binomial in MODELS_BINOMIAL){
    model_res <- tibble(cv_rep = rep((1:10), each = 229), 
                        observed = rep(observed, 10))
    for (class_type in CLASSES) {
        class_res <- tibble()
        for (cv_rep in CV_REPS) {
            cv_rep_res <- tibble()
            for (fold in FOLDS) {
                filename <- paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type, "/", model_binomial, "/", model_binomial, "_class_", class_type, "_CVrep_", cv_rep, "_fold_", fold, "_Predictions_", data_type, ".rds", sep = "")
                pred <- readRDS(filename)
                #if (model_binomial == "SVM_radial"){
                #    print(colnames(pred))
                #    print(head(pred))
                #    colnames(pred) <- c("dataset_row_no", class_type, "Other", "label_pred")
                #}
                cv_rep_res <- rbind(cv_rep_res, pred)

            }
            cv_rep_res <- cv_rep_res %>% arrange(dataset_row_no)
            cv_rep_res <- cv_rep_res %>% mutate(cv_rep = rep(cv_rep, nrow(cv_rep_res)))
            class_res  <- rbind(class_res, cv_rep_res)
        }
        #print(dim(class_res))
        class_probs <- pull(class_res, class_type)
        model_res <- model_res %>% mutate("{class_type}" := class_probs)
    }
    #print(head(model_res))
    saveRDS(model_res, paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type,"/Combined_predictions/", model_binomial, "_Predictions_", data_type, ".rds", sep = ""))
}

In [5]:
head(model_res)

cv_rep,observed,Bile_Duct_Cancer,Breast_Cancer,Colorectal_Cancer,Gastric_cancer,Lung_Cancer,Ovarian_Cancer,Pancreatic_Cancer
1,Gastric_cancer,0.1054676,0.336855438,0.05490051,0.156895288,0.085348332,0.09279686,0.09005742
1,Gastric_cancer,0.10748225,0.190688609,0.13357508,0.110186625,0.169595003,0.07728019,0.07278399
1,Gastric_cancer,0.18027939,0.002365903,0.05053918,0.002996518,0.001867656,0.08530689,0.82988726
1,Gastric_cancer,0.08006031,0.180495066,0.05177486,0.138829238,0.227186851,0.09207699,0.05238511
1,Ovarian_Cancer,0.02323692,0.252845797,0.09045327,0.09177912,0.248695276,0.132927,0.06183838
1,Ovarian_Cancer,0.14095125,0.176329312,0.03656419,0.047661443,0.111733695,0.19382866,0.08283457


In [6]:
tail(model_res)
dim(model_res)

cv_rep,observed,Bile_Duct_Cancer,Breast_Cancer,Colorectal_Cancer,Gastric_cancer,Lung_Cancer,Ovarian_Cancer,Pancreatic_Cancer
10,Bile_Duct_Cancer,0.447197599,0.103806617,0.1489364,0.090641851,0.03855808,0.07963562,0.170456417
10,Colorectal_Cancer,0.117254848,0.004210261,0.940035,0.033845255,0.25370728,0.13353448,0.065367303
10,Colorectal_Cancer,0.002157283,0.0173301,0.9002567,0.001429981,0.05723981,0.50887928,0.102751343
10,Colorectal_Cancer,0.04742416,0.072894994,0.8145805,0.016645807,0.0722897,0.21483421,0.099020614
10,Colorectal_Cancer,0.243156488,0.076824003,0.1189189,0.038639829,0.07309044,0.0510282,0.102663956
10,Colorectal_Cancer,0.003865944,0.024367084,0.9987341,0.021177745,0.42682567,0.45671728,0.003143653


## Case-control - models LDA, SVM_linear, SVM_radial, SVM_poly, Boosting

In [7]:
data <- readRDS("../data/ATAC_predictions_train_20_predict_80/Full_data_ATAC_pred_lasso_formatted_standardized.rds")
sample_types <- read.table("../data/sample_types.txt", header = F, sep = " ")
colnames(sample_types) <- c("sample", "sample_type")

data <- merge(data, sample_types, by="sample")
data <- data %>% select(-sample)
observed_case_control <- data %>% select(sample_type)

observed_case_control <- observed_case_control %>% mutate(sample_type = ifelse(sample_type == "Healthy", "Healthy", "Cancer"))
print(head(observed_case_control))
dim(observed_case_control)

  sample_type
1      Cancer
2      Cancer
3      Cancer
4      Cancer
5      Cancer
6      Cancer


In [8]:
MODELS_BINOMIAL = c("Boosting", "LDA", "SVM_linear", "SVM_radial", "SVM_poly")
CLASSES <- c("Healthy")
CV_REPS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
FOLDS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
target_data <- "Methylation_and_ATAC"
data_type <- "Full_data"

for (model_binomial in MODELS_BINOMIAL){
    model_res <- tibble(cv_rep = rep((1:10), each = 424), 
                        observed = rep(observed_case_control$sample_type, 10))
    for (class_type in CLASSES) {
        class_res <- tibble()
        for (cv_rep in CV_REPS) {
            cv_rep_res <- tibble()
            for (fold in FOLDS) {
                filename <- paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type, "/", model_binomial, "/", model_binomial, "_class_", class_type, "_CVrep_", cv_rep, "_fold_", fold, "_Predictions_", data_type, ".rds", sep = "")
                pred <- readRDS(filename)
                #if (model_binomial == "SVM_radial"){
                #    colnames(pred) <- c("dataset_row_no", "Cancer", "Healthy", "label_pred")
                #}
                cv_rep_res <- rbind(cv_rep_res, pred)

            }
            cv_rep_res <- cv_rep_res %>% arrange(dataset_row_no)
            cv_rep_res <- cv_rep_res %>% mutate(cv_rep = rep(cv_rep, nrow(cv_rep_res)))
            class_res  <- rbind(class_res, cv_rep_res)
        }
        print(dim(class_res))
        #print(class_res)
        class_probs <- pull(class_res, "Cancer")
        model_res <- model_res %>% mutate(Cancer = class_probs)
    }
    print(head(model_res))
    saveRDS(model_res, paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type,"/Combined_predictions/", model_binomial, "_Predictions_", data_type, "_Healthy.rds", sep = ""))
}

[1] 4240    5
# A tibble: 6 × 3
  cv_rep observed Cancer
   <int> <chr>     <dbl>
1      1 Cancer   0.0919
2      1 Cancer   0.618 
3      1 Cancer   0.975 
4      1 Cancer   0.291 
5      1 Cancer   0.785 
6      1 Cancer   0.964 
[1] 4240    5
# A tibble: 6 × 3
  cv_rep observed Cancer
   <int> <chr>     <dbl>
1      1 Cancer    0.443
2      1 Cancer    0.387
3      1 Cancer    0.973
4      1 Cancer    0.319
5      1 Cancer    0.546
6      1 Cancer    0.856
[1] 4240    5
# A tibble: 6 × 3
  cv_rep observed Cancer
   <int> <chr>     <dbl>
1      1 Cancer    0.424
2      1 Cancer    0.590
3      1 Cancer    0.999
4      1 Cancer    0.149
5      1 Cancer    0.313
6      1 Cancer    0.983
[1] 4240    5
# A tibble: 6 × 3
  cv_rep observed Cancer
   <int> <chr>     <dbl>
1      1 Cancer    0.261
2      1 Cancer    0.529
3      1 Cancer    0.997
4      1 Cancer    0.168
5      1 Cancer    0.315
6      1 Cancer    0.908
[1] 4240    5
# A tibble: 6 × 3
  cv_rep observed Cancer
   <int> <chr> 

## Cancer type - model Lasso

In [9]:
MODELS_BINOMIAL = c("Lasso")
CLASSES = c("Bile_Duct_Cancer", 
            "Breast_Cancer", 
            "Colorectal_Cancer", 
            "Gastric_cancer", 
            "Lung_Cancer", 
            "Ovarian_Cancer", 
            "Pancreatic_Cancer")

CV_REPS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
FOLDS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
target_data <- "Methylation_and_ATAC"
data_type <- "Full_data"

for (model_binomial in MODELS_BINOMIAL){
    model_res <- tibble(cv_rep = rep((1:10), each = 229), 
                        observed = rep(observed, 10))
    for (class_type in CLASSES) {
        class_res <- tibble()
        for (cv_rep in CV_REPS) {
            cv_rep_res <- tibble()
            #print("CVREP")
            #print(cv_rep)
            for (fold in FOLDS) {
                #print("FOLD")
                #print(fold)
                filename <- paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type, "/", model_binomial, "/", model_binomial, "_class_", class_type, "_CVrep_", cv_rep, "_fold_", fold, "_Predictions_", data_type, ".rds", sep = "")
                pred <- readRDS(filename)
                #print(pred)
                if (pred$second_class_prob[1] > 0.5){
                    if (pred$label_pred[1] == "Other"){
                        colnames(pred) <- c("row_predicted", "Other", "label_pred")
                        pred <- pred %>% mutate(class_type = 1 - Other) %>% select(-Other)
                        colnames(pred) <- c("row_predicted", "label_pred", class_type)
                    } else {
                        colnames(pred) <- c("row_predicted", class_type, "label_pred")
                        print("HERE")
                    }
                } else if (pred$second_class_prob[1] < 0.5){
                    if (pred$label_pred[1] == "Other"){
                        colnames(pred) <- c("row_predicted", class_type, "label_pred")
                    } else {
                        colnames(pred) <- c("row_predicted", "Other", "label_pred")
                        pred <- pred %>% mutate(class_type := 1 - Other) %>% select(-Other)
                        colnames(pred) <- c("row_predicted", "label_pred", class_type)
                        print("HERE")
                    }
                } #else {
                   # colnames(pred) <- c("row_predicted", class_type, "label_pred")
                #}
                #print(colnames(cv_rep_res))
                #print(colnames(pred))
                #print(pred[1:20, ])
                cv_rep_res <- rbind(cv_rep_res, pred)

            }
            cv_rep_res <- cv_rep_res %>% arrange(row_predicted)
            cv_rep_res <- cv_rep_res %>% mutate(cv_rep = rep(cv_rep, nrow(cv_rep_res)))
            class_res  <- rbind(class_res, cv_rep_res)
        }
        print(head(class_res))
        print(dim(class_res))
        class_probs <- pull(class_res, class_type)
        model_res <- model_res %>% mutate(class_type := class_probs)
        print(head(model_res))
        cols <- head(colnames(model_res),-1)
        colnames(model_res) <- c(cols, class_type)
    }
    print(head(model_res))
    saveRDS(model_res, paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type,"/Combined_predictions/", model_binomial, "_Predictions_", data_type, ".rds", sep = ""))
}

# A tibble: 6 × 4
  row_predicted label_pred Bile_Duct_Cancer cv_rep
          <int> <chr>                 <dbl> <chr> 
1             1 Other                0.0800 1     
2             2 Other                0.0729 1     
3             3 Other                0.229  1     
4             4 Other                0.0434 1     
5             5 Other                0.0663 1     
6             6 Other                0.121  1     
[1] 2290    4
# A tibble: 6 × 3
  cv_rep observed       class_type
   <int> <fct>               <dbl>
1      1 Gastric_cancer     0.0800
2      1 Gastric_cancer     0.0729
3      1 Gastric_cancer     0.229 
4      1 Gastric_cancer     0.0434
5      1 Ovarian_Cancer     0.0663
6      1 Ovarian_Cancer     0.121 
# A tibble: 6 × 4
  row_predicted label_pred Breast_Cancer cv_rep
          <int> <chr>              <dbl> <chr> 
1             1 Other             0.277  1     
2             2 Other             0.182  1     
3             3 Other             0.0119 1     
4   

## Case-control - model Lasso

In [10]:
MODELS_BINOMIAL = c("Lasso")
CLASSES <- c("Healthy")

CV_REPS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")
FOLDS = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10")

target_data <- "Methylation_and_ATAC"
data_type <- "Full_data"

for (model_binomial in MODELS_BINOMIAL){
    model_res <- tibble(cv_rep = rep((1:10), each = 424), 
                        observed = rep(observed_case_control$sample_type, 10))
    for (class_type in CLASSES) {
        class_res <- tibble()
        for (cv_rep in CV_REPS) {
            cv_rep_res <- tibble()
            for (fold in FOLDS) {
                filename <- paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type, "/", model_binomial, "/", model_binomial, "_class_", class_type, "_CVrep_", cv_rep, "_fold_", fold, "_Predictions_", data_type, ".rds", sep = "")
                pred <- readRDS(filename)
                if (pred$second_class_prob[1] > 0.5){
                    if (pred$label_pred[1] == "Cancer"){
                        colnames(pred) <- c("row_predicted", "Cancer", "label_pred")
                        
                    } else {
                        colnames(pred) <- c("row_predicted", "Healthy", "label_pred")
                        pred <- pred %>% mutate(Cancer = 1 - Healthy) %>% select(-Healthy)
                        print("HERE")
                    }
                } else if (pred$second_class_prob[1] < 0.5){
                    if (pred$label_pred[1] == "Cancer"){
                        colnames(pred) <- c("row_predicted", "Healthy", "label_pred")
                        pred <- pred %>% mutate(Cancer = 1 - Healthy) %>% select(-Healthy)
                        print("HERE")
                    } else {
                        colnames(pred) <- c("row_predicted", "Cancer", "label_pred")
                    }
                }
                cv_rep_res <- rbind(cv_rep_res, pred)

            }
            cv_rep_res <- cv_rep_res %>% arrange(row_predicted)
            cv_rep_res <- cv_rep_res %>% mutate(cv_rep = rep(cv_rep, nrow(cv_rep_res)))
            class_res  <- rbind(class_res, cv_rep_res)
        }
        print(dim(class_res))
        class_probs <- pull(class_res, "Cancer")
        model_res <- model_res %>% mutate(Cancer = class_probs)
    }
    print(head(model_res))
    saveRDS(model_res, paste("../Classification_output/", target_data, "/NEW_CORRECT_PRED/Binomial_models_output_chunked/", data_type,"/Combined_predictions/", model_binomial, "_Predictions_", data_type, "_Healthy.rds", sep = ""))
}

[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"
[1] "HERE"