In [None]:
library(tidyverse)
library(stringr) 
library(plotly)
library(cowplot)

In [None]:
##############################################################################

##########################     INPUT PARAMETERS    ###########################

##############################################################################



# Suffix of the ratio files that are loaded ("big_<filetype>.csv" and
# "small_<filetype>.csv"). Other values listed for this setting:
#   "ratio_all_germline_singletons_excluded_tumor_4_cs"
#   "ratio_singletons_excluded_tumor_4_cs"
#   "ratio"
filetype <- "ratio"

# Sub-folder inside each cfDNA sample folder from which the ratio files are
# read. Other values listed for this setting:
#   "ci5_cs1e9"
#   "all_germline_ci5_cs1e9"
#   "all_germline_filtered_bams_tumor_ci5_cs1e9"
#   "all_germline_filtered_bams_tumor_subtract_other_tumor_ci5_cs1e9"
#   all_germline_filtered_bams_tumor_ci5_cs1e9_filtered_cfDNA
#   all_germline_filtered_bams_tumor_ci5_cs1e9_filtered_cfDNA_subtracted_kmers_not_seen
#   all_germline_filtered_bams_tumor_ci5_cs1e9_well_mapping_subtracted
#   de_novo_germline
#   de_novo_germline_filtered_cfDNA
#   de_novo_germline_reference
input_folder <- "de_novo_germline_reference"

# Sub-folder of ../plotting_results/ into which the figures are saved.
# Other values listed for this setting:
#   "ci5_cs1e9"
#   "union_germline_ci4_cs1e9"
#   "union_germline_filtered_tumor_bams_ci5_cs1e9"
#   "union_germline_filtered_tumor_bams_subtract_other_tumors_ci5_cs1e9"
#   union_germline_filtered_tumor_bams_ci5_cs1e9_filtered_cfDNA
#   union_germline_filtered_bams_tumor_ci5_cs1e9_filtered_cfDNA_subtracted_kmers_not_seen
#   union_germline_filtered_tumor_ci5_cs1e9_well_mapping_subtracted
#   de_novo_germline
#   de_novo_germline_filtered_cfDNA
#   de_novo_germline_reference
output_folder <- "de_novo_germline_reference"

# First column of ../low_qual_sample.txt as a character vector; used further
# down to exclude patient IDs from the normalised small-ratio plot.
low_qual_sample <- as.character(read.table("../low_qual_sample.txt")[[1]])

# First row of the header-less ../phaseI_pt.csv as a character vector; used
# further down as the list of patient IDs to process.
phaseIpt <- read.csv("../phaseI_pt.csv", header = FALSE)
phaseIpt <- as.character(unlist(phaseIpt[1, ]))

In [None]:
# Load the cfDNA phase I parameter table and expose its `pt_id` column under
# the name `patient_id`, which the later joins use as key.
paramspace <- rename(
  read.csv("../data/metadata/paramspace_cfDNA_phaseI.csv"),
  patient_id = pt_id
)
head(paramspace)

In [None]:
# One-column data frames holding the patient IDs and the cfDNA folder names.
SAMPLES <- paramspace[, "patient_id", drop = FALSE]
FOLDERS <- paramspace[, "cfDNA_folder", drop = FALSE]

In [None]:
# Number of rows (patient / cfDNA folder combinations) in the parameter table.
nrow(paramspace)

In [None]:
# Attach the "big" and "small" ratio estimates (point estimate plus lower and
# upper confidence bound) of every row of `paramspace` to that row.
# For each row the values are read from
#   ../data/<patient_id>/<cfDNA_folder>/<input_folder>/<big|small>_<filetype>.csv
# and the file's `ratio`, `lower_CI` and `upper_CI` columns are copied into
# `<prefix>_ratio`, `<prefix>_ratio_CI_lower` and `<prefix>_ratio_CI_upper`.
ratio_prefixes <- c("big", "small")

# Create the result columns up front, filled with NA (big_* first, then
# small_*, each in the order ratio / CI_lower / CI_upper).
for (prefix in ratio_prefixes) {
  paramspace[
    paste0(prefix, c("_ratio", "_ratio_CI_lower", "_ratio_CI_upper"))
  ] <- NA
}

# The Jaccard index (jaccard_index.csv in the same folder) is disabled:
#paramspace["jaccard"] <- NA
#paramspace["jaccard_CI_lower"] <- NA
#paramspace["jaccard_CI_upper"] <- NA

# seq_len() instead of 1:nrow(): an empty table must run zero iterations,
# whereas 1:0 would iterate over c(1, 0).
for (row in seq_len(nrow(paramspace))) {
  pt <- paramspace[row, "patient_id"]
  fd <- paramspace[row, "cfDNA_folder"]
  sample_dir <- paste0("../data/", pt, "/", fd, "/", input_folder, "/")

  for (prefix in ratio_prefixes) {
    ratio_file <- paste0(sample_dir, prefix, "_", filetype, ".csv")
    estimate <- read.csv(ratio_file)

    # Each file must describe exactly one estimate; fail with the offending
    # path instead of a generic replacement-length error.
    if (nrow(estimate) != 1) {
      stop(
        "Expected exactly one row in ", ratio_file,
        ", found ", nrow(estimate),
        call. = FALSE
      )
    }

    paramspace[row, paste0(prefix, "_ratio")] <- estimate$ratio
    paramspace[row, paste0(prefix, "_ratio_CI_lower")] <- estimate$lower_CI
    paramspace[row, paste0(prefix, "_ratio_CI_upper")] <- estimate$upper_CI
  }

  #jaccard = read.csv(paste("../data/", pt, "/" , fd, "/", input_folder, "/", "jaccard_index.csv", sep = ""))
  #paramspace[row, "jaccard"] = jaccard$ratio
  #paramspace[row, "jaccard_CI_lower"] = jaccard$lower_CI
  #paramspace[row, "jaccard_CI_upper"] = jaccard$upper_CI
}

In [None]:
# Work on a copy named `res`; `paramspace` itself is left as loaded above.
res <- paramspace
head(res)

In [None]:
#res <- res %>% separate(cfDNA_folder, c("sampleID", "sample_type", "sample_number"))
# Derive `sampleID` as the part of `cfDNA_folder` before the first "_".
# Only this first chunk is used afterwards, so the name is split once (n = 2)
# and the remaining chunks are not extracted. Extracting them only to drop
# them again made the cell fail for any folder name with fewer than three
# "_"-separated chunks.
res <- res %>%
  mutate(
    sampleID = map_chr(
      stringr::str_split(cfDNA_folder, stringr::fixed("_"), n = 2),
      1
    )
  )

head(res)
dim(res)

In [None]:
# Per-sample time point table; the columns used below are `phase`,
# `sampleID` and `sample_timepoint_days_since_OP`.
sample_timepoint <- read.csv("../data/metadata/clin_data/sample_timepoint_formatted.csv")

In [None]:
head(sample_timepoint)

# Restrict to phase I rows and keep only the sample ID and its time point.
sample_timepoint_days <- sample_timepoint %>%
  filter(phase == "phaseI") %>%
  select(sampleID, sample_timepoint_days_since_OP)

# Show phase I samples that have no time point recorded.
filter(sample_timepoint_days, is.na(sample_timepoint_days_since_OP))
dim(sample_timepoint_days)
head(sample_timepoint_days)

In [None]:
# Add the sample time point to every row, matching on `sampleID`
# (converted to character first so the join key is a plain string).
res <- res %>%
  mutate(sampleID = as.character(sampleID)) %>%
  left_join(sample_timepoint_days, by = "sampleID")
head(res)
dim(res)

In [None]:
# Show rows that did not receive a time point from the join above.
res %>% filter(is.na(sample_timepoint_days_since_OP))

In [None]:
# Clinical table, reduced to the patient ID plus the relapse and adjuvant
# chemotherapy timing columns.
clinical_data <- read.csv("../data/metadata/clin_data/clinical_data_formatted.csv")
clinical_data_relapse <- select(
  clinical_data,
  patient_id,
  time_to_relapse_days,
  adjuvant_chemo_start_days,
  adjuvant_chemo_end_days
)
head(clinical_data_relapse)

In [None]:
# Add the clinical columns to every sample row, matching on `patient_id`.
res <- res %>%
  left_join(clinical_data_relapse, by = "patient_id")
head(res)
dim(res)

In [None]:
phase_I_pts <- phaseIpt

# For every phase I patient keep the two earliest samples (ordered by
# `sample_timepoint_days_since_OP`) and label them "pre-op" and "post-op" in
# that order. The per-patient pieces are collected in a list and combined
# once with bind_rows() instead of growing `res_min` with rbind() in a loop.
res_min <- bind_rows(lapply(phase_I_pts, function(pt) {
  res_pt <- res %>%
    filter(patient_id == pt) %>%
    mutate(
      sample_timepoint_days_since_OP = as.numeric(sample_timepoint_days_since_OP)
    ) %>%
    # order based on sample timepoint
    arrange(sample_timepoint_days_since_OP) %>%
    # pick first 2; head() never pads, whereas `[1:2, ]` inserted all-NA rows
    # for patients with fewer than two samples
    head(2)
  print(dim(res_pt))

  if (nrow(res_pt) < 2) {
    warning(
      "Patient ", pt, " has ", nrow(res_pt),
      " sample(s); expected at least 2",
      call. = FALSE
    )
  }

  # Label only as many rows as actually exist.
  res_pt %>%
    mutate(sample_label = c("pre-op", "post-op")[seq_len(n())])
}))

res_min %>% select(patient_id, cfDNA_folder, big_ratio, small_ratio, sample_timepoint_days_since_OP, sample_label) #big_upper_ratio,

In [None]:
# Plotting time axis: pre-op samples are placed at 0, all other samples at
# their recorded number of days since the operation.
res_min$pre_post_time <- with(
  res_min,
  ifelse(sample_label == "pre-op", 0, sample_timepoint_days_since_OP)
)

In [None]:
# Set the notebook plot size options (repr.plot.*) to 7 by 7.
options(repr.plot.width=7, repr.plot.height=7)

In [None]:
# Columns carried into the normalisation step.
res_filtered <- select(
  res_min,
  patient_id,
  cfDNA_folder,
  big_ratio,
  small_ratio,
  sample_timepoint_days_since_OP,
  sample_label
) #big_upper_ratio,

In [None]:
# Empty 8-column data frame that receives one row per normalised sample.
normalized_res <- setNames(
  data.frame(matrix(nrow = 0, ncol = 8)),
  c(
    "patient_id", "cfDNA_folder", "big_ratio", "small_ratio",
    "sample_timepoint_days_since_OP", "sample_label",
    "big_ratio_normalized", "small_ratio_normalized"
  )
)


In [None]:
# Distinct patient IDs present in `res_filtered`, as character strings.
patients <- as.character(unique(res_filtered[["patient_id"]]))

In [None]:
# Normalise each patient's ratios by that patient's own pre-op sample:
#   big_ratio_normalized   = big_ratio   / big_ratio of the pre-op row
#   small_ratio_normalized = small_ratio / small_ratio of the pre-op row
# so the pre-op row itself gets a normalised value of 1.
#
# This is computed per patient group in one pass. The baseline is looked up
# inside each group, so a patient without a "pre-op" row gets NA instead of
# silently reusing the baseline left over from the previously processed
# patient. If a patient has several "pre-op" rows the first one is used.
normalized_res <- res_filtered %>%
  # rows without a patient ID cannot be assigned to a patient
  filter(!is.na(patient_id)) %>%
  group_by(patient_id) %>%
  mutate(
    big_ratio_normalized =
      big_ratio / big_ratio[which(sample_label == "pre-op")[1]],
    small_ratio_normalized =
      small_ratio / small_ratio[which(sample_label == "pre-op")[1]]
  ) %>%
  ungroup() %>%
  # keep the identifier columns as plain strings and return a base data frame
  mutate(
    patient_id = as.character(patient_id),
    cfDNA_folder = as.character(cfDNA_folder)
  ) %>%
  as.data.frame()

In [None]:
# Relapse label per cfDNA sample, with the columns renamed so the sample
# column matches the `cfDNA_folder` join key used below.
Correlations <- read.csv("../data/ci5_cs1e9_correlation.csv")
relapse_label <- Correlations %>%
  select(cfDNA_folder = cfDNA_sample, Relapse_label = Relapse)

In [None]:
# Attach the relapse label to every normalised sample row.
normalized_res <- normalized_res %>%
  left_join(relapse_label, by = "cfDNA_folder")

In [None]:
# Plotting time axis: pre-op samples are placed at 0, all other samples at
# their recorded number of days since the operation.
normalized_res$pre_post_time <- with(
  normalized_res,
  ifelse(sample_label == "pre-op", 0, sample_timepoint_days_since_OP)
)

In [None]:
# log10 of the pre-op-normalised big ratio over time: one line per patient,
# one point per sample, coloured by relapse label.
quick_plot_big_r_normalized <- ggplot(
  normalized_res,
  aes(
    x = pre_post_time,
    y = log10(big_ratio_normalized),
    color = Relapse_label
  )
) +
  geom_line(aes(group = patient_id)) +
  geom_point()


In [None]:
# Display the normalised big-ratio plot.
quick_plot_big_r_normalized

In [None]:
# Write the normalised big-ratio plot to
# ../plotting_results/<output_folder>/pre_post_plots/.
ggsave(
  filename = paste0(
    "../plotting_results/", output_folder,
    "/pre_post_plots/Normalized_Big_", filetype, ".png"
  ),
  plot = quick_plot_big_r_normalized,
  dpi = "print"
)

In [None]:
#quick_plot_jaccard_normalized <- ggplot(normalized_res) + geom_line(aes(x = pre_post_time, y = log10(jaccard_normalized), color = Relapse_label, group=patient_id)) + 
#                                      geom_point(aes(x = pre_post_time, y = log10(jaccard_normalized), color = Relapse_label)) + theme(text = element_text(size = 20)) 
#ggsave(
#  paste("../plotting_results/", output_folder, "/pre_post_plots/Normalized_Jaccard_", filetype, ".png", sep = ""),
#  quick_plot_jaccard_normalized, dpi = "print")

In [None]:
#quick_plot_jaccard_normalized

In [None]:
# log10 of the pre-op-normalised small ratio over time, excluding the
# patient IDs listed in `low_qual_sample`: one line per patient, one point
# per sample, coloured by relapse label.
#
# The exclusion uses %in% rather than `!=`: with more than one ID in
# `low_qual_sample`, `!=` would recycle the vector element-wise and filter
# the wrong rows. Rows with a missing patient ID are still dropped, as `!=`
# did.
quick_plot_small_r_normalized <- ggplot(
  normalized_res %>%
    filter(!is.na(patient_id), !patient_id %in% low_qual_sample)
) +
  geom_line(
    aes(
      x = pre_post_time,
      y = log10(small_ratio_normalized),
      color = Relapse_label,
      group = patient_id
    ),
    size = 1
  ) +
  geom_point(
    aes(
      x = pre_post_time,
      y = log10(small_ratio_normalized),
      color = Relapse_label
    ),
    size = 2
  ) +
  theme_minimal() +
  theme(text = element_text(size = 16)) +
  xlab("Sample time point (days since surgery)") +
  ylab("log10((UT kmers in cfDNA / UT kmers)") +
  scale_color_manual(
    values = c('#00BFC4', '#F8766D'),
    labels = c("No relapse", "Relapse")
  )
    #+ 
    #ggtitle("Fraction of unique tumor k-mers found in cfDNA,\nnormalized by the pre-op sample (low quality sample removed)")


In [None]:
# Display the normalised small-ratio plot.
quick_plot_small_r_normalized

In [None]:
# Write the normalised small-ratio plot to
# ../plotting_results/<output_folder>/pre_post_plots/.
ggsave(
  filename = paste0(
    "../plotting_results/", output_folder,
    "/pre_post_plots/Normalized_Small_", filetype, ".png"
  ),
  plot = quick_plot_small_r_normalized,
  dpi = "print"
)

In [None]:
# Display the full normalised table for inspection.
normalized_res

In [None]:
# Raw (un-normalised) big ratio over time: one line per patient, one point
# per sample, coloured by relapse label. The plot is saved to
# ../plotting_results/<output_folder>/pre_post_plots/.
quick_plot_big_r_ <- ggplot(
  normalized_res,
  aes(x = pre_post_time, y = big_ratio, color = Relapse_label)
) +
  geom_line(aes(group = patient_id)) +
  geom_point() +
  theme(text = element_text(size = 20))
ggsave(
  filename = paste0(
    "../plotting_results/", output_folder,
    "/pre_post_plots/Big_", filetype, ".png"
  ),
  plot = quick_plot_big_r_,
  dpi = "print"
)

In [None]:
# Display the raw big-ratio plot.
quick_plot_big_r_

In [None]:
#quick_plot_jaccard <- ggplot(normalized_res %>% filter(patient_id != low_qual_sample)) + geom_line(aes(x = pre_post_time, y = jaccard, color = Relapse_label, group=patient_id)) + 
#                                               geom_point(aes(x = pre_post_time, y = jaccard, color = Relapse_label)) + theme(text = element_text(size = 20)) 


In [None]:
#quick_plot_jaccard

In [None]:
# Raw (un-normalised) small ratio over time: one line per patient, one point
# per sample, coloured by relapse label. The plot is saved to
# ../plotting_results/<output_folder>/pre_post_plots/.
quick_plot_small_r <- ggplot(
  normalized_res,
  aes(x = pre_post_time, y = small_ratio, color = Relapse_label)
) +
  geom_line(aes(group = patient_id)) +
  geom_point() +
  theme(text = element_text(size = 20))
ggsave(
  filename = paste0(
    "../plotting_results/", output_folder,
    "/pre_post_plots/Small_", filetype, ".png"
  ),
  plot = quick_plot_small_r,
  dpi = "print"
)

In [None]:
# Display the raw small-ratio plot.
quick_plot_small_r