In [None]:
library(tidyverse)
library(RColorBrewer)
library(cowplot)

In [None]:
# ---------------------------------------------------------------------------
# RUN CONFIGURATION
# Edit the four values below to choose which pipeline output is read and
# where the results are written. Alternative values the author has used are
# listed next to each setting.
# ---------------------------------------------------------------------------

# Sub-folder holding the target big/small ratio CSVs of each cfDNA sample.
# Alternatives: ci5_cs1e9, all_germline_filtered_bams_tumor_ci5_cs1e9,
#               de_novo_germline_reference
input_folder_target_ratios <- "all_germline_filtered_bams_tumor_ci5_cs1e9"

# File-name stem of the ratio CSVs (read as big_<stem>.csv / small_<stem>.csv).
# Alternatives: ratio_singletons_excluded_tumor_4_cs, ratio
filetype_target_ratios <- "ratio"

# Sub-folder holding the empirical-distribution ratios of each patient.
# Alternatives: ratio_emp_dist_analysis_ci4_cs1e9,
#               ratio_emp_dist_analysis_all_germline_filtered_tumor_bams_ci5_cs1e9,
#               ratio_emp_dist_analysis_de_novo_germline_reference
input_folder_ratios_dist <- "ratio_emp_dist_analysis_all_germline_filtered_tumor_bams_ci5_cs1e9"

# Sub-folder of ../plotting_results/ that receives figures and tables.
# Alternatives: ci5_cs1e9, union_germline_filtered_tumor_bams_ci5_cs1e9,
#               de_novo_germline_reference
output_folder <- "union_germline_filtered_tumor_bams_ci5_cs1e9"

In [None]:
# Load the parameter table of the empirical-distribution analysis (phase I),
# then show its dimensions and first rows.
paramspace_emp <- read.csv(
  file = "../data/metadata/paramspace_empirical_dist_analysis_phaseI.csv",
  sep = ","
)
dim(paramspace_emp)
head(paramspace_emp)

In [None]:
# Load the cfDNA parameter table (phase I) and rename `pt_id` to `patient_id`,
# the column name used by the rest of the notebook.
paramspace_cfDNA <- read.csv(
  file = "../data/metadata/paramspace_cfDNA_phaseI.csv",
  sep = ","
) %>%
  rename(patient_id = pt_id)
dim(paramspace_cfDNA)

In [None]:
# Attach the target big/small ratio and its confidence bounds to every cfDNA
# sample. For each row, the files
#   ../data/<patient_id>/<cfDNA_folder>/<input_folder_target_ratios>/big_<filetype>.csv
#   ../data/<patient_id>/<cfDNA_folder>/<input_folder_target_ratios>/small_<filetype>.csv
# are read and their `ratio`, `lower_CI` and `upper_CI` values are copied into
# the columns <size>_ratio, <size>_ratio_CI_lower and <size>_ratio_CI_upper.
# NOTE(review): each CSV is presumably a single-row table — confirm.

# Create the result columns up front (big_* first, then small_*), sized to the
# table so that this also works when the table has no rows.
for (size in c("big", "small")) {
    for (suffix in c("ratio", "ratio_CI_lower", "ratio_CI_upper")) {
        paramspace_cfDNA[[paste0(size, "_", suffix)]] <- rep(NA_real_, nrow(paramspace_cfDNA))
    }
}

# seq_len() instead of 1:nrow(): 1:0 would iterate over c(1, 0) on an empty table.
for (row in seq_len(nrow(paramspace_cfDNA))) {
    pt <- paramspace_cfDNA[row, "patient_id"]
    fd <- paramspace_cfDNA[row, "cfDNA_folder"]
    ratio_dir <- paste0("../data/", pt, "/", fd, "/", input_folder_target_ratios, "/")

    for (size in c("big", "small")) {
        ratio_tbl <- read.csv(paste0(ratio_dir, size, "_", filetype_target_ratios, ".csv"))
        paramspace_cfDNA[row, paste0(size, "_ratio")] <- ratio_tbl$ratio
        paramspace_cfDNA[row, paste0(size, "_ratio_CI_lower")] <- ratio_tbl$lower_CI
        paramspace_cfDNA[row, paste0(size, "_ratio_CI_upper")] <- ratio_tbl$upper_CI
    }
}

In [None]:
# Derive `sampleID` as the text before the first "_" of `cfDNA_folder`.
# The folder name is split into at most three chunks; chunks 2 and 3 are
# extracted as well but dropped again straight away, so only `sampleID` is kept.
# NOTE(review): extracting chunks 2 and 3 presumably makes this fail for folder
# names with fewer than two underscores — confirm whether that is intended.
paramspace_cfDNA <- paramspace_cfDNA %>%
  mutate(
    tmp_chunks = stringr::str_split(cfDNA_folder, stringr::fixed("_"), n = 3),
    sampleID = map_chr(tmp_chunks, 1),
    sub_value = map_chr(tmp_chunks, 2),
    sub_val = map_chr(tmp_chunks, 3)
  ) %>%
  select(-c(tmp_chunks, sub_value, sub_val))

head(paramspace_cfDNA)
dim(paramspace_cfDNA)

In [None]:
# Load the clinical table with one timepoint entry per sample.
sample_timepoint <- read.csv(
  file = "../data/metadata/clin_data/sample_timepoint_formatted.csv"
)

In [None]:
# Keep only phase-I samples and the two columns needed for the join below.
sample_timepoint_days <- sample_timepoint %>%
  filter(phase == "phaseI") %>%
  select(sampleID, sample_timepoint_days_since_OP)

# Sanity check: list the samples whose day offset is missing.
print(filter(sample_timepoint_days, is.na(sample_timepoint_days_since_OP)))
dim(sample_timepoint_days)
head(sample_timepoint_days)

In [None]:
# Add `sample_timepoint_days_since_OP` to every cfDNA sample; `sampleID` is
# coerced to character first so that it can serve as the join key.
paramspace_cfDNA <- paramspace_cfDNA %>%
  mutate(sampleID = as.character(sampleID)) %>%
  left_join(sample_timepoint_days, by = "sampleID")
head(paramspace_cfDNA)
dim(paramspace_cfDNA)

In [None]:
# Palette generator: getPalette(k) interpolates k colours from the 9-class
# ColorBrewer "Blues" scale.
getPalette <- colorRampPalette(brewer.pal(n = 9, name = "Blues"))

In [None]:
# Display the first rows of the empirical-distribution parameter table.
head(paramspace_emp)

In [None]:
# Load the correlation table and keep a two-column lookup from cfDNA sample
# folder to relapse label, renamed to match the columns of `paramspace_cfDNA`.
Correlations <- read.csv(file = "../data/ci5_cs1e9_correlation.csv")
relapse_label <- Correlations %>%
  select(cfDNA_folder = cfDNA_sample, Relapse_label = Relapse)

In [None]:
# Display the table dimensions before the relapse-label join.
dim(paramspace_cfDNA)

In [None]:
# Attach the relapse label of each cfDNA sample, matched on `cfDNA_folder`.
paramspace_cfDNA <- paramspace_cfDNA %>%
  left_join(relapse_label, by = "cfDNA_folder")
head(paramspace_cfDNA)

In [None]:
# Per-patient empirical-distribution analysis.
#
# For every patient in `paramspace_emp`:
#   1. read the big/small ratios listed for that patient in `paramspace_emp`
#      (the empirical reference distribution),
#   2. compute, for each of the patient's own cfDNA samples, the empirical
#      p-value = fraction of reference ratios >= the sample's ratio,
#   3. draw the small-ratio histogram with one vertical line per cfDNA sample.
#
# Results: `p_vals_all` (all patients' cfDNA rows with `p_value` and
# `p_value_small_r`), `plots_small` and `plots_grid_small` (one plot per
# patient). `plots` and `plots_grid` stay NULL while the big-ratio plots
# below remain commented out.

plots <- NULL
plots_grid <- NULL
plots_small <- NULL
plots_grid_small <- NULL
# Per-patient tables are collected here and bound once after the loop instead
# of growing `p_vals_all` with rbind() on every iteration.
p_vals_per_patient <- list()

# Fraction of `reference` values that are >= each value of `observed`.
empirical_p_value <- function(observed, reference) {
    vapply(
        observed,
        function(r) sum(reference >= r) / length(reference),
        numeric(1)
    )
}

for (pt in unique(paramspace_emp$patient_id)) {
    paramspace_emp_pt <- paramspace_emp %>% filter(patient_id == pt)
    paramspace_cfDNA_pt <- paramspace_cfDNA %>% filter(patient_id == pt)

    paramspace_emp_pt["big_ratio"] <- NA
    paramspace_emp_pt["small_ratio"] <- NA

    emp_dir <- paste0("../data/", pt, "/", input_folder_ratios_dist)
    for (row in seq_len(nrow(paramspace_emp_pt))) {
        other_pt <- paramspace_emp_pt[row, "other_patient_id"]
        other_pt_cfDNA_folder <- paramspace_emp_pt[row, "other_patient_cfDNA_folder"]
        index <- paramspace_emp_pt[row, "X"]
        # Shared tail of both file names: ratio_pt_<other>_cfDNAsample_<folder>_<index>.csv
        file_tail <- paste0("ratio_pt_", other_pt, "_cfDNAsample_", other_pt_cfDNA_folder, "_", index, ".csv")
        big_r <- read.csv(paste0(emp_dir, "/ratios/big_", file_tail))
        small_r <- read.csv(paste0(emp_dir, "/small_ratios/small_", file_tail))
        paramspace_emp_pt[row, "big_ratio"] <- big_r$ratio
        paramspace_emp_pt[row, "small_ratio"] <- small_r$ratio
    }

    # sort() drops NA values by default, so the p-value denominators below
    # count only the non-missing reference ratios.
    ratios_other <- sort(paramspace_emp_pt$big_ratio, decreasing = FALSE)
    ratios_other_small <- sort(paramspace_emp_pt$small_ratio, decreasing = FALSE)

    # Computed over the whole column at once; unlike a 1:nrow() loop this adds
    # no phantom NA row when the patient has no cfDNA samples.
    paramspace_cfDNA_pt$p_value <- empirical_p_value(paramspace_cfDNA_pt$big_ratio, ratios_other)
    paramspace_cfDNA_pt$p_value_small_r <- empirical_p_value(paramspace_cfDNA_pt$small_ratio, ratios_other_small)

    # Stored before the factor column for plotting is added below.
    p_vals_per_patient[[length(p_vals_per_patient) + 1]] <- paramspace_cfDNA_pt

    # NOTE(review): no column named exactly `sample_timepoint` is created in the
    # visible cells; if the metadata CSV does not provide one, `$` on a
    # data.frame partial-matches `sample_timepoint_days_since_OP` — confirm.
    paramspace_cfDNA_pt$sample_timepoint <- as.factor(paramspace_cfDNA_pt$sample_timepoint)
    colourCount <- length(unique(paramspace_cfDNA_pt$sample_timepoint))

    #distribution_plot <- ggplot() + 
    #                          geom_histogram(data = paramspace_emp_pt, aes(x = big_ratio), fill = "grey81", color = "grey50", bins = 70) + 
    #                          geom_vline(data = paramspace_cfDNA_pt, aes(xintercept = big_ratio, color = sample_timepoint), size = 0.5, alpha = 1) + 
    #                          theme_minimal()  + 
    #                          scale_color_manual(values = getPalette(colourCount)) + 
    #                          labs(x = "Big ratio",
    #                               y = "count", 
    #                               title = paste("Big ratio distribution", sep =  ""),
    #                               subtitle = paste("Patient ", pt, " (", paramspace_cfDNA_pt[1, "Relapse_label"], ")", "\nPipeline ", input_folder_target_ratios, sep =  ""))
    #
    #distribution_plot_cowplot <- ggplot() + 
    #                          geom_histogram(data = paramspace_emp_pt, aes(x = big_ratio), fill = "grey81", color = "grey50", bins = 70) + 
    #                          geom_vline(data = paramspace_cfDNA_pt, aes(xintercept = big_ratio, color = sample_timepoint), size = 0.5, alpha = 1) + 
    #                          theme_minimal()  + 
    #                          scale_color_manual(values = getPalette(colourCount)) + 
    #                          labs(x = "Big ratio",
    #                               y = "count", 
    #                               title = paste(pt, " (", paramspace_cfDNA_pt[1, "Relapse_label"], ")", sep =  "")) +
    #                          theme(legend.position = "none")

    #ggsave(
    #  paste("../plotting_results/", output_folder, "/empirical_dist_analysis/big_ratios/Patient_", pt, "_Big_ratio_distribution.png", sep = ""),
    #  distribution_plot, dpi = "print")

    ######################################################################################################################################
    ######################################################################################################################################
    ######################################################################################################################################

    # Relapse label shown in the titles, taken from the patient's first cfDNA row.
    pt_relapse <- paramspace_cfDNA_pt[1, "Relapse_label"]

    # Layers shared by the stand-alone and the grid version of the figure:
    # grey histogram of the reference ratios plus one line per cfDNA sample,
    # coloured by sample timepoint.
    small_ratio_base <- ggplot() +
        geom_histogram(data = paramspace_emp_pt, aes(x = small_ratio), fill = "grey71", color = "grey50", bins = 70) +
        geom_vline(data = paramspace_cfDNA_pt, aes(xintercept = small_ratio, color = sample_timepoint), size = 1.2, alpha = 1) +
        theme_minimal() +
        scale_color_manual(values = getPalette(colourCount))

    # Stand-alone figure: full title and subtitle, large text, empty axis titles.
    distribution_plot_small <- small_ratio_base +
        labs(x = "",
             y = "",
             title = "Small ratio distribution",
             subtitle = paste0("Patient ", pt, " (", pt_relapse, ")", "\nPipeline ", input_folder_target_ratios)) +
        theme(legend.position = "none",
              text = element_text(size = 20),
              plot.title = element_text(size = 22))

    # Grid panel: compact title (patient and relapse label only), smaller text.
    distribution_plot_cowplot_small <- small_ratio_base +
        labs(x = "",
             y = "",
             title = paste0(pt, " (", pt_relapse, ")")) +
        theme(legend.position = "none",
              plot.title = element_text(size = 18),
              axis.text.x = element_text(size = 12),
              axis.text.y = element_text(size = 12))

    #ggsave(
    #  paste("../plotting_results/", output_folder, "/empirical_dist_analysis/small_ratios/Patient_", pt, "_Small_ratio_distribution.png", sep = ""),
    #  distribution_plot_small, dpi = "print")


    #plots[[pt]] <- distribution_plot
    #plots_grid[[pt]] <- distribution_plot_cowplot

    plots_small[[pt]] <- distribution_plot_small
    plots_grid_small[[pt]] <- distribution_plot_cowplot_small
}

# Bind all patients at once; yields NULL when there were no patients, which is
# the value `p_vals_all` would also have had with incremental rbind().
p_vals_all <- do.call(rbind, p_vals_per_patient)

In [None]:
# Set the figure size options, then display the combined big-ratio figure.
# NOTE(review): `combined_plot` is not assigned in any cell shown here, and the
# big-ratio plots in the per-patient loop are commented out (so `plots_grid`
# stays NULL) — confirm where this object is meant to come from.
options(repr.plot.width=12, repr.plot.height=8)
combined_plot

In [None]:
# Write the combined big-ratio figure to the big_ratios output folder.
# NOTE(review): `combined_plot` is not assigned in any cell shown here —
# confirm that it exists before running this cell.
ggsave(
  filename = paste0("../plotting_results/", output_folder, "/empirical_dist_analysis/big_ratios/Combined_plots_Big_ratio_distribution.png"),
  plot = combined_plot,
  width = 15,
  height = 8,
  dpi = "print"
)

In [None]:
# Build and display the combined small-ratio figure: one panel per patient,
# taken from the compact plots collected in `plots_grid_small`.
options(repr.plot.width = 11, repr.plot.height = 12)

# `combined_plot_small` was displayed and saved without ever being assigned in
# the visible cells; it is assembled here so that the notebook runs top to
# bottom. Empty (NULL) slots are dropped first; they can appear when the
# patient ids used as list indices are numeric.
# NOTE(review): grid layout uses plot_grid() defaults — adjust ncol/nrow if the
# original figure used a specific arrangement.
combined_plot_small <- plot_grid(
  plotlist = Filter(Negate(is.null), plots_grid_small)
)
combined_plot_small

In [None]:
# Write the combined small-ratio figure to the small_ratios output folder.
ggsave(
  filename = paste0("../plotting_results/", output_folder, "/empirical_dist_analysis/small_ratios/Combined_plots_Small_ratio_distribution.png"),
  plot = combined_plot_small,
  width = 15,
  height = 8,
  dpi = "print"
)

## Collect and export p-values

In [None]:
# Display the dimensions of the full cfDNA table, for comparison with `p_vals_all`.
dim(paramspace_cfDNA)

In [None]:
# Display the dimensions of the per-sample p-value table built in the loop.
dim(p_vals_all)

In [None]:
# Reduce the p-value table to the columns that are exported, in output order.
p_vals_all <- p_vals_all %>%
  select(
    patient_id,
    cfDNA_folder,
    unique_kmers_folder,
    big_ratio,
    small_ratio,
    sampleID,
    sample_timepoint_days_since_OP,
    Relapse_label,
    p_value,
    p_value_small_r
  )
head(p_vals_all)

In [None]:
# Export the p-value table as CSV, without row names.
write.csv(
  p_vals_all,
  file = paste0("../plotting_results/", output_folder, "/empirical_dist_analysis/p_values.csv"),
  row.names = FALSE
)