In [None]:
library(tidyverse)
library(RColorBrewer)
library(grid)
library(cowplot)

In [None]:
# Sub-folder inside each cfDNA sample folder that the ratio files are read from.
# Other folder names kept for reference:
#   all_germline_filtered_bams_tumor_subtract_other_tumor_ci5_cs1e9, ci5_cs1e9,
#   all_germline_ci5_cs1e9, all_germline_filtered_bams_tumor_ci5_cs1e9
data_folder <-  "de_novo_germline_reference"
# File name of the "big" ratio table. Other file names kept for reference:
#   big_ratio_singletons_excluded_tumor_4_cs.csv,
#   big_ratio_all_germline_singletons_excluded_tumor_4_cs.csv
big_ratio_file_name <- "big_ratio.csv"
# File name of the "small" ratio table. Other file names kept for reference:
#   small_ratio_singletons_excluded_tumor_4_cs.csv,
#   small_ratio_all_germline_singletons_excluded_tumor_4_cs.csv
small_ratio_file_name <- "small_ratio.csv"

In [None]:
# Load the phase I parameter table; it supplies the pt_id / cfDNA_folder pairs used below.
paramspace <- read.csv("../data/metadata/paramspace_cfDNA_phaseI.csv")

In [None]:
# Keep only the patient id and the cfDNA sample folder.
paramspace <- select(paramspace, pt_id, cfDNA_folder)

In [None]:
# Preview the first rows of the reduced parameter table.
head(paramspace)

In [None]:
# Each patient id once, as character, in order of first appearance.
patients <- paramspace$pt_id %>%
    unique() %>%
    as.character()

In [None]:
# Display the patient ids that the loops below iterate over.
patients

In [None]:
# Empty three-column table; the read-count loop appends one row per cfDNA sample.
read_counts <- setNames(
    data.frame(matrix(nrow = 0, ncol = 3)),
    c("pt_id", "cfDNA_sample", "readcount")
)

In [None]:
# Collect the cfDNA read count of every sample: one row per (patient, cfDNA folder).
for (pt in patients) {
    pt_samples <- filter(paramspace, pt_id == pt)
    for (sample_folder in pt_samples$cfDNA_folder) {
        # The count file is read without a header row.
        count <- read.csv(
            paste0("../data/", pt, "/", sample_folder, "/cs1e9/cfDNA_count.csv"),
            header = FALSE
        )
        next_row <- nrow(read_counts) + 1
        read_counts[next_row, ] <- c(pt, sample_folder, count)
    }
}

In [None]:
# Display the collected read counts.
read_counts

In [None]:
# Persist the read-count table (without row names).
write.csv(read_counts, "../data/read_counts.csv", row.names=FALSE)

## Combine the ratios for all patients in one data frame

In [None]:
# Empty three-column table; the big-ratio loop appends one row per cfDNA sample.
big_ratio <- setNames(
    data.frame(matrix(nrow = 0, ncol = 3)),
    c("pt_id", "cfDNA_sample", "big_ratio")
)

In [None]:
# Read the `ratio` column of every sample's big-ratio file and append it to big_ratio.
for (pt in patients) {
    pt_samples <- filter(paramspace, pt_id == pt)
    for (sample_folder in pt_samples$cfDNA_folder) {
        ratio_path <- paste0(
            "../data/", pt, "/", sample_folder, "/", data_folder, "/", big_ratio_file_name
        )
        ratio_value <- select(read.csv(ratio_path), ratio)
        big_ratio[nrow(big_ratio) + 1, ] <- c(pt, sample_folder, ratio_value)
    }
}

In [None]:
# Empty three-column table; the small-ratio loop appends one row per cfDNA sample.
small_ratio_comb <- setNames(
    data.frame(matrix(nrow = 0, ncol = 3)),
    c("pt_id", "cfDNA_sample", "small_ratio")
)

In [None]:
# Read the `ratio` column of every sample's small-ratio file and append it to small_ratio_comb.
for (pt in patients) {
    pt_samples <- filter(paramspace, pt_id == pt)
    for (sample_folder in pt_samples$cfDNA_folder) {
        ratio_path <- paste0(
            "../data/", pt, "/", sample_folder, "/", data_folder, "/", small_ratio_file_name
        )
        ratio_value <- select(read.csv(ratio_path), ratio)
        small_ratio_comb[nrow(small_ratio_comb) + 1, ] <- c(pt, sample_folder, ratio_value)
    }
}

In [None]:
# Preview the combined small-ratio table and report its dimensions.
head(small_ratio_comb)
dim(small_ratio_comb)

## Plots

In [None]:
# Drop pt_id so that only the join key and the ratio remain.
big_ratio <- select(big_ratio, "cfDNA_sample", "big_ratio")

In [None]:
# Drop pt_id so that only the join key and the ratio remain.
small_ratio_comb <- select(small_ratio_comb, "cfDNA_sample", "small_ratio")

In [None]:
# Attach both ratios to the read counts, matching rows on the cfDNA sample name.
# NOTE(review): the key is cfDNA_sample only; this presumes sample folder names are
# unique across patients, otherwise rows are duplicated - confirm.
ratios_and_counts <- read_counts %>%
    left_join(big_ratio, by = "cfDNA_sample") %>%
    left_join(small_ratio_comb, by = "cfDNA_sample")

In [None]:
# Correlation between big ratio and read count over all samples of all patients.
cor(ratios_and_counts$big_ratio, ratios_and_counts$readcount)

In [None]:
# Read count against big ratio, one colour per patient.
ggplot(ratios_and_counts, aes(x = big_ratio, y = readcount, color = pt_id)) +
    geom_point()
    #+geom_text(aes(x = big_ratio, y = readcount, label = pt_id))

# No plot is passed, so ggsave() falls back to ggplot2's last plot.
ggsave(paste0("../plotting_results/Correlation_readcounts/", data_folder, "_big_ratio", ".png"))

In [None]:
options(repr.plot.width = 12, repr.plot.height = 6)
# Same scatter as above, one panel per patient with independent axis ranges.
ggplot(ratios_and_counts, aes(x = big_ratio, y = readcount, color = pt_id)) +
    geom_point() +
    facet_wrap(~pt_id, ncol = 5, scales = "free")

# No plot is passed, so ggsave() falls back to ggplot2's last plot.
ggsave(paste0("../plotting_results/Correlation_readcounts/", data_folder, "_big_ratio_faceted", ".png"))

In [None]:
# Correlation between small ratio and read count over all samples of all patients.
cor(ratios_and_counts$small_ratio, ratios_and_counts$readcount)

In [None]:
# Read count against small ratio, one colour per patient.
ggplot(ratios_and_counts, aes(x = small_ratio, y = readcount, color = pt_id)) +
    geom_point()

# No plot is passed, so ggsave() falls back to ggplot2's last plot.
ggsave(paste0("../plotting_results/Correlation_readcounts/", data_folder, "_small_ratio", ".png"))

In [None]:
#ggsave(paste("../plotting_results/Correlation_readcounts/", data_folder, "_small_ratio_faceted", ".png", sep=""))

In [None]:
# Per-patient correlation of the read count with the big and the small ratio.
# summarise() yields exactly one row per patient; the previous transmute() + unique()
# computed the value on every row, de-duplicated afterwards and left the result grouped.
# Rows now come out in sorted pt_id order rather than first-appearance order.
pt_correlation <- ratios_and_counts %>%
    group_by(pt_id) %>%
    summarise(
        corr_big = cor(readcount, big_ratio),
        corr_small = cor(readcount, small_ratio)
    ) %>%
    ungroup()

In [None]:
# Display the per-patient correlations.
pt_correlation

In [None]:
# Average of the per-patient correlations for the big ratio.
mean_big <- mean(pt_correlation$corr_big)
mean_big

# Average of the per-patient correlations for the small ratio.
mean_small <- mean(pt_correlation$corr_small)
mean_small

In [None]:
# Save the per-patient correlations under a file named after the analysed data folder.
write.csv(
    pt_correlation,
    paste0("../plotting_results/Correlation_readcounts/", data_folder, ".csv"),
    row.names = FALSE
)

In [None]:
# Display the per-patient correlations again after writing them to disk.
pt_correlation

# Plots for the report

In [None]:
# Empty three-column table; the loop below appends one row per cfDNA sample.
small_ratio <- setNames(
    data.frame(matrix(nrow = 0, ncol = 3)),
    c("pt_id", "cfDNA_sample", "small_ratio")
)

In [None]:
# Read the `ratio` column of every sample's small-ratio file and append it to small_ratio.
for (pt in patients) {
    pt_samples <- filter(paramspace, pt_id == pt)
    for (sample_folder in pt_samples$cfDNA_folder) {
        ratio_path <- paste0(
            "../data/", pt, "/", sample_folder, "/", data_folder, "/", small_ratio_file_name
        )
        ratio_value <- select(read.csv(ratio_path), ratio)
        small_ratio[nrow(small_ratio) + 1, ] <- c(pt, sample_folder, ratio_value)
    }
}
head(small_ratio)

In [None]:
# Report the number of rows and columns collected.
dim(small_ratio)

In [None]:
# Derive sampleID as the part of the cfDNA folder name before the first underscore.
# Only that first piece is kept, so the name is split at most once. The previous version
# also extracted (and then discarded) a second and third piece, which made map_chr()
# fail for any folder name with fewer than three "_"-separated parts.
small_ratio <- small_ratio %>%
    mutate(
        sampleID = map_chr(
            stringr::str_split(cfDNA_sample, stringr::fixed("_"), n = 2),
            1
        )
    )

head(small_ratio)
dim(small_ratio)

In [None]:
# Load the per-sample time point table (its phase, sampleID and
# sample_timepoint_days_since_OP columns are used below).
sample_timepoint <- read.csv("../data/metadata/clin_data/sample_timepoint_formatted.csv")

In [None]:
head(sample_timepoint)
# Restrict to phase I and keep only the sample id and its day offset from OP.
sample_timepoint_days <- sample_timepoint %>%
    filter(phase == "phaseI") %>%
    select(sampleID, sample_timepoint_days_since_OP)
# Show the phase I samples that have no day offset recorded.
filter(sample_timepoint_days, is.na(sample_timepoint_days_since_OP))
dim(sample_timepoint_days)
head(sample_timepoint_days)

In [None]:
# Attach the day offset to every sample; the key is coerced to character before joining.
small_ratio <- small_ratio %>%
    mutate(sampleID = as.character(sampleID)) %>%
    left_join(sample_timepoint_days, by = "sampleID")
head(small_ratio)
dim(small_ratio)

In [None]:
# Relapse patient ids: the values of the first row of a header-less CSV.
phaseIpt_R <- read.csv("../phaseI_pt_R.csv", header = FALSE)
phaseIpt_R <- as.character(unlist(phaseIpt_R[1, ]))

# Low-quality ids: the first column of a header-less text table.
low_qual_sample <- read.table("../low_qual_sample.txt")
low_qual_sample <- as.character(low_qual_sample$V1)

In [None]:
relapse_pts <- phaseIpt_R

# Label every sample "R" when its patient is in the relapse list, "No_R" otherwise.
small_ratio <- mutate(
    small_ratio,
    relapse_label = ifelse(pt_id %in% relapse_pts, "R", "No_R")
)

# Combine read counts with the annotated small ratios, matched on patient and sample.
ratios_and_counts_small <- read_counts %>%
    left_join(small_ratio, by = c("pt_id", "cfDNA_sample"))
dim(ratios_and_counts_small)

In [None]:
head(ratios_and_counts_small)
# Remove every row whose pt_id is listed in low_qual_sample.
# %in% tests set membership, so this stays correct when the file lists several ids;
# `!=` would recycle the id vector element-wise against the rows.
# NOTE(review): the list is named "sample" but is compared with pt_id - confirm it holds patient ids.
ratios_and_counts_small <- ratios_and_counts_small %>%
    filter(!pt_id %in% low_qual_sample)
dim(ratios_and_counts_small)

In [None]:
# Number each patient's samples 1..n in order of increasing days since OP and
# stack the per-patient tables; also track the largest per-patient sample count.
ratios_and_counts_small_samples_o <- NULL
color_palette_size <- 0
for (pt in unique(ratios_and_counts_small$pt_id)) {
    print(pt)
    pt_rows <- ratios_and_counts_small %>%
        filter(pt_id == pt) %>%
        arrange(sample_timepoint_days_since_OP)
    n_samples <- nrow(pt_rows)
    pt_rows <- mutate(pt_rows, samples_order = seq_len(n_samples))
    if (n_samples > color_palette_size) {
        color_palette_size <- n_samples
    }
    ratios_and_counts_small_samples_o <- rbind(ratios_and_counts_small_samples_o, pt_rows)
}


head(ratios_and_counts_small_samples_o)

In [None]:
# Per-patient correlation between read count and small ratio, one row per patient.
# summarise() replaces the previous transmute() + unique(), which computed the value on
# every row, de-duplicated afterwards and left the result grouped by pt_id.
correlations_smallr <- ratios_and_counts_small_samples_o %>%
    group_by(pt_id) %>%
    summarise(corr_small = cor(readcount, small_ratio)) %>%
    ungroup()
correlations_smallr
mean(correlations_smallr$corr_small)

In [None]:
relapse_pts <- phaseIpt_R

# Mean per-patient correlation among relapse patients ...
relapse_cor <- filter(correlations_smallr, pt_id %in% relapse_pts)
mean(relapse_cor$corr_small)

# ... and among all remaining patients.
no_relapse_cor <- filter(correlations_smallr, !(pt_id %in% relapse_pts))
mean(no_relapse_cor$corr_small)

In [None]:
# Add the per-patient correlation to every sample row; dim() before and after shows
# whether the join changed the number of rows.
dim(ratios_and_counts_small_samples_o)
ratios_and_counts_small_samples_o <- ratios_and_counts_small_samples_o %>%
    left_join(correlations_smallr, by = "pt_id")
dim(ratios_and_counts_small_samples_o)
head(ratios_and_counts_small_samples_o)

In [None]:
#getPalette = colorRampPalette(brewer.pal(9, "Blues"))
# 17-step ramp over the "Blues" palette with its first three entries dropped (14 colours left).
colsi <- tail(colorRampPalette(brewer.pal(9, "Blues"))(17), 14)
# The sample order is used as a discrete colour, so store it as a factor.
ratios_and_counts_small_samples_o <- ratios_and_counts_small_samples_o %>%
    mutate(samples_order = as.factor(samples_order))

In [None]:
options(repr.plot.width = 10.2, repr.plot.height = 10)

# Facet strip labels "<patient>, (<relapse label>), cor: <correlation>", looked up by patient id.
pt_id.labs <- with(
    ratios_and_counts_small_samples_o,
    setNames(
        paste0(pt_id, ", (", relapse_label, "), cor: ", round(corr_small, 3)),
        as.character(pt_id)
    )
)

# All patients: read count against small ratio, coloured by sample order
# (first sample dark red, later ones along the blue ramp), free axes per panel.
plotty <- ggplot(ratios_and_counts_small_samples_o) +
    geom_point(aes(x = small_ratio, y = readcount, color = samples_order), size = 1.8) +
    facet_wrap(
        ~pt_id,
        scales = "free",
        ncol = 3,
        labeller = labeller(pt_id = pt_id.labs)
    ) +
    scale_color_manual(values = c("darkred", colsi)) +
    theme_minimal() +
    theme(
        strip.text.x = element_text(size = 10),
        axis.text.x = element_text(size = 11),
        axis.text.y = element_text(size = 11),
        axis.title = element_text(size = 13),
        legend.text = element_text(size = 13),
        strip.background = element_rect(fill = "lightgrey"),
        panel.border = element_rect(size = 0.1, colour = "black", fill = NA)
    ) +
    labs(
        x = "Fraction of unique tumor k-mers found in cfDNA",
        y = "Number of cfDNA reads",
        color = "Order of samples"
    )

plotty

In [None]:
options(repr.plot.width = 9, repr.plot.height = 7)

# Facet strip labels "<patient>, (<relapse label>), cor: <correlation>", looked up by patient id.
pt_id.labs <- with(
    ratios_and_counts_small_samples_o,
    setNames(
        paste0(pt_id, ", (", relapse_label, "), cor: ", round(corr_small, 3)),
        as.character(pt_id)
    )
)

# Non-relapse patients only; panels share their axis ranges.
no_relapse_rows <- filter(ratios_and_counts_small_samples_o, relapse_label == "No_R")
plotty_nor <- ggplot(no_relapse_rows) +
    geom_point(aes(x = small_ratio, y = readcount, color = samples_order), size = 1.8) +
    facet_wrap(~pt_id, ncol = 3, labeller = labeller(pt_id = pt_id.labs)) +
    scale_color_manual(values = c("darkred", colsi)) +
    theme_minimal() +
    theme(
        strip.text.x = element_text(size = 10),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 10),
        legend.text = element_text(size = 12),
        strip.background = element_rect(fill = "lightgrey"),
        panel.border = element_rect(size = 0.1, colour = "black", fill = NA)
    ) +
    labs(
        x = "Fraction of unique tumor k-mers found in cfDNA",
        y = "Number of cfDNA reads",
        color = "Order of samples"
    )

plotty_nor

In [None]:
options(repr.plot.width = 10, repr.plot.height = 7)

# Facet strip labels "<patient>, (<relapse label>), cor: <correlation>", looked up by patient id.
pt_id.labs <- with(
    ratios_and_counts_small_samples_o,
    setNames(
        paste0(pt_id, ", (", relapse_label, "), cor: ", round(corr_small, 3)),
        as.character(pt_id)
    )
)

# Relapse patients only; panels share their axis ranges.
relapse_rows <- filter(ratios_and_counts_small_samples_o, relapse_label == "R")
plotty_R <- ggplot(relapse_rows) +
    geom_point(aes(x = small_ratio, y = readcount, color = samples_order), size = 1.8) +
    facet_wrap(~pt_id, ncol = 3, labeller = labeller(pt_id = pt_id.labs)) +
    scale_color_manual(values = c("darkred", colsi)) +
    theme_minimal() +
    theme(
        strip.text.x = element_text(size = 10),
        axis.text.x = element_text(size = 9),
        axis.text.y = element_text(size = 10),
        legend.text = element_text(size = 12),
        strip.background = element_rect(fill = "lightgrey"),
        panel.border = element_rect(size = 0.1, colour = "black", fill = NA)
    ) +
    labs(
        x = "Fraction of unique tumor k-mers found in cfDNA",
        y = "Number of cfDNA reads",
        color = "Order of samples"
    )

plotty_R

In [None]:
# Rows of the patients listed in phaseIpt_R.
ratios_and_counts_small_samples_o_relapses <- filter(
    ratios_and_counts_small_samples_o,
    pt_id %in% phaseIpt_R
)

In [None]:
options(repr.plot.width=9, repr.plot.height=10)
# One read-count bar panel and one small-ratio line panel per patient, stored by patient id.
plots_readcounts <- list()
plots_ratio <- list()
for (i in unique(ratios_and_counts_small_samples_o$pt_id)){
    ratios_and_counts_small_samples_o_pt <- ratios_and_counts_small_samples_o %>% filter(pt_id == i)
    # corr_small was joined per patient, so it repeats on every row; take a single value
    # so that the title is one string instead of one string per sample.
    pt_cor <- round(ratios_and_counts_small_samples_o_pt$corr_small[1], 3)

    plot_readc_pt <- ggplot(ratios_and_counts_small_samples_o_pt) +
        geom_col(aes(x = sample_timepoint_days_since_OP, y = readcount), width = 20, fill = "gray75") +
        theme_minimal() + xlab("") + ylab("") + ggtitle(paste(i, ", cor: ", pt_cor, sep = "")) +
        # Negative bottom margin pulls the ratio panel up against the bar panel.
        theme(plot.margin = unit(c(0, 0, -0.4, 0), "cm"),
              plot.title = element_text(size=10),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size=8)) +
        scale_y_continuous(breaks = c(0, 5e+08, 1e+09))
    plots_readcounts[[i]] <- plot_readc_pt

    plots_ratio_pt <- ggplot(ratios_and_counts_small_samples_o_pt) +
        geom_point(aes(x = sample_timepoint_days_since_OP, y = small_ratio), size = 0.8) +
        geom_line(aes(x = sample_timepoint_days_since_OP, y = small_ratio)) +
        theme_minimal() + xlab("") + ylab("") +
        theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
              axis.text.y = element_text(size=8))
    plots_ratio[[i]] <- plots_ratio_pt
    }

# Two patients per grid row: their read-count panels on one row, their ratio panels below.
p1 <- plot_grid(plots_readcounts[["1"]],
          plots_readcounts[["2"]],
          plots_ratio[["1"]],
          plots_ratio[["2"]],
          plots_readcounts[["3"]],
          plots_readcounts[["4"]],
          plots_ratio[["3"]],
          plots_ratio[["4"]],
          plots_readcounts[["5"]],
          plots_readcounts[["6"]],
          plots_ratio[["5"]],
          plots_ratio[["6"]],
          plots_readcounts[["7"]],
          plots_readcounts[["8"]],
          plots_ratio[["7"]],
          plots_ratio[["8"]], ncol = 2, align = "v",
          rel_heights = rep(c(0.5, 0.8), times = 8),
          rel_widths = rep(9, times = 16))


# Last patient on a row of its own. Both panels now belong to patient "9"; the ratio
# panel previously came from patient "10", mixing two patients in one column.
# NOTE(review): confirm "9" (and not "10") is the intended patient.
p2 <- plot_grid(plots_readcounts[["9"]], NULL, plots_ratio[["9"]],  NULL, ncol = 2, align = "v",
          rel_heights = c(0.5, 0.8, 0, 0),
          rel_widths = c(9,9, 9, 9))

p3 <- plot_grid(p1, p2, ncol = 1, rel_heights=c(0.8,0.2))
p3

     


In [None]:
clinical_data <- read.csv("../data/metadata/clin_data/clinical_data_formatted.csv")
# Keep the relapse / adjuvant-chemo timing columns and rename the patient key to pt_id
# so it matches the ratio tables.
clinical_data_relapse <- clinical_data %>%
    select(
        pt_id = patient_id,
        time_to_relapse_days,
        adjuvant_chemo_start_days,
        adjuvant_chemo_end_days
    )
head(clinical_data_relapse)

# Attach the clinical timing columns to the relapse patients' samples.
ratios_and_counts_small_samples_o_relapses <- ratios_and_counts_small_samples_o_relapses %>%
    left_join(clinical_data_relapse, by = "pt_id")
head(ratios_and_counts_small_samples_o_relapses)
dim(ratios_and_counts_small_samples_o_relapses)

In [None]:
# Samples of relapse-labelled patients taken at or after the time of relapse.
relapses <- filter(
    ratios_and_counts_small_samples_o_relapses,
    relapse_label == "R",
    sample_timepoint_days_since_OP >= time_to_relapse_days
)
# Per patient, the earliest of those samples.
pre_relapse1 <- relapses %>%
    group_by(pt_id) %>%
    filter(sample_timepoint_days_since_OP == min(sample_timepoint_days_since_OP))

# Samples taken strictly before relapse.
pre_relapse2 <- filter(
    ratios_and_counts_small_samples_o_relapses,
    sample_timepoint_days_since_OP < time_to_relapse_days
)

# Pre-relapse samples plus the first sample at/after relapse.
pre_relapse <- bind_rows(pre_relapse1, pre_relapse2)

In [None]:
options(repr.plot.width=9, repr.plot.height=10)
# One read-count bar panel and one small-ratio panel per relapse patient, stored by patient id.
plots_readcounts_R <- list()
plots_ratio_R <- list()
for (i in unique(ratios_and_counts_small_samples_o_relapses$pt_id)){
    ratios_and_counts_small_samples_o_pt <- ratios_and_counts_small_samples_o_relapses %>% filter(pt_id == i)
    pre_relapse_pt <- pre_relapse %>% filter(pt_id == i)
    relapses_pt <- relapses %>% filter(pt_id == i)
    # corr_small was joined per patient, so it repeats on every row; take a single value
    # so that the title is one string instead of one string per sample.
    pt_cor <- round(ratios_and_counts_small_samples_o_pt$corr_small[1], 3)

    plot_readc_pt <- ggplot(ratios_and_counts_small_samples_o_pt) +
        geom_col(aes(x = sample_timepoint_days_since_OP, y = readcount), width = 20, fill = "gray75") +
        theme_minimal() + xlab("") + ylab("") + ggtitle(paste(i, ", cor: ", pt_cor, sep = "")) +
        # Negative bottom margin pulls the ratio panel up against the bar panel.
        theme(plot.margin = unit(c(0, 0, -0.4, 0), "cm"),
              plot.title = element_text(size=10),
              axis.text.x = element_blank(),
              axis.text.y = element_text(size=8)) +
        scale_y_continuous(breaks = c(0, 5e+08, 1e+09))
    plots_readcounts_R[[i]] <- plot_readc_pt

    # All samples as points; a solid line through pre_relapse_pt and a dashed line
    # through the samples at or after relapse.
    plots_ratio_pt <- ggplot() +
        geom_point(data = ratios_and_counts_small_samples_o_pt, aes(x = sample_timepoint_days_since_OP, y = small_ratio), size = 0.8) +
        geom_line(data = pre_relapse_pt, aes(x = sample_timepoint_days_since_OP, y = small_ratio)) +
        geom_line(data = relapses_pt, aes(x = sample_timepoint_days_since_OP, y = small_ratio), linetype = "dashed") +
        theme_minimal() + xlab("") + ylab("") +
        theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
              axis.text.y = element_text(size=8))
    plots_ratio_R[[i]] <- plots_ratio_pt
    }


In [None]:
options(repr.plot.width=9, repr.plot.height=6)
relapse_pts <- phaseIpt_R
# Panels are looked up by patient id: phaseIpt_R[k] is the k-th relapse patient.
# The previous indexing had unbalanced brackets and did not parse.
# Two patients per grid row: their read-count panels on one row, their ratio panels below.
pR1 <- plot_grid(plots_readcounts_R[[phaseIpt_R[1]]],
          plots_readcounts_R[[phaseIpt_R[2]]],
          plots_ratio_R[[phaseIpt_R[1]]],
          plots_ratio_R[[phaseIpt_R[2]]],
          plots_readcounts_R[[phaseIpt_R[3]]],
          plots_readcounts_R[[phaseIpt_R[4]]],
          plots_ratio_R[[phaseIpt_R[3]]],
          plots_ratio_R[[phaseIpt_R[4]]], ncol = 2, align = "v",
          rel_heights = c(0.5, 0.8, 0.5, 0.8, 0.5, 0.8, 0.5, 0.8),
          rel_widths = c(9,9, 9,9, 9,9, 9,9))


# Fifth relapse patient on a row of its own, with empty cells to its right.
pR2 <- plot_grid(plots_readcounts_R[[phaseIpt_R[5]]], NULL, plots_ratio_R[[phaseIpt_R[5]]],  NULL, ncol = 2, align = "v",
          rel_heights = c(0.5, 0.8, 0, 0),
          rel_widths = c(9,9, 9, 9))

pR3 <- plot_grid(pR1, pR2, ncol = 1, rel_heights=c(0.66,0.33))
pR3