In [None]:
library(tidyverse)
library(RColorBrewer)

In [None]:
# ==========================================================================
#                          SET PARAMETERS HERE
# ==========================================================================

# ---- Input parameters (in data folder) ------------------------------------

# Pipeline folder the input file is read from.
# Alternative values (kept for reference, not active):
#   "all_germline_filtered_bams_tumor_ci5_cs1e9"
#   "all_germline_ci5_cs1e9"
#   "ci5_cs1e9"
#   "cs1e9"
input_folder <- "all_germline_filtered_bams_tumor_subtract_other_tumor_ci5_cs1e9"

# Name of the input file inside `input_folder`.
# Alternative values (kept for reference, not active):
#   "plotdata_unique_tumor_all_germline_singletons_excluded_tumor_4_cs_filtered.txt"
#   "plotdata_intersection_singletons_excluded_tumor_4_cs_filtered.txt"
#   "plotdata_intersection_cs1e9_filtered.txt"
filename <- "plotdata_intersection_filtered.txt"

# ---- Output parameters (in plotting_results folder) -----------------------

# Folder the figures are written to.
# Alternative values (kept for reference, not active):
#   "ci1_cs1e9"
#   "ci5_cs1e9"
#   "union_germline_ci4_cs1e9"
#   "union_germline_filtered_tumor_bams_ci5_cs1e9"
output_folder <- "union_germline_filtered_tumor_bams_subtract_other_tumors_ci5_cs1e9"

# Title printed on every figure.
# Alternative values (kept for reference, not active):
#   "Unique tumor kmers"
#   "Tumor kmers"
plot_title <- "Unique tumor kmers found in cfDNA samples"

# ---- Parameter space --------------------------------------------------------

# One row per cfDNA sample; comma-separated file with a header row.
paramspace <- read.csv(file = "../data/metadata/paramspace_cfDNA_phaseI.csv")
dim(paramspace)
# NOTE(review): the original cell ended with the remark
# "output_path_unique_kmers, pt_id" - presumably the columns of interest; confirm.

In [None]:
# Show the first six rows of the parameter table as a quick sanity check.
head(paramspace, n = 6L)

In [None]:
# Derive `sampleID` as the part of `cfDNA_folder` before the first underscore.
#
# Only the first chunk is extracted. The earlier version also pulled out the
# second and third chunks and dropped them straight away, which added nothing
# but made the whole cell fail for any folder name with fewer than three
# underscore-separated parts.
paramspace <- paramspace %>%
  mutate(
    sampleID = map_chr(
      stringr::str_split(cfDNA_folder, stringr::fixed("_"), n = 2),
      1
    )
  )

head(paramspace)
dim(paramspace)

In [None]:
# Load the per-sample timepoint table from the clinical metadata folder.
sample_timepoint <- read.csv(
  file = "../data/metadata/clin_data/sample_timepoint_formatted.csv"
)

In [None]:
# Keep only phase I samples and the two columns needed for the join below:
# the sample identifier and its `sample_timepoint_days_since_OP` value.
sample_timepoint_days <- sample_timepoint %>%
  filter(phase == "phaseI") %>%
  select(sampleID, sample_timepoint_days_since_OP)

# List samples whose timepoint is missing so they can be checked by hand.
print(filter(sample_timepoint_days, is.na(sample_timepoint_days_since_OP)))

dim(sample_timepoint_days)
head(sample_timepoint_days)

In [None]:
# Add `sample_timepoint_days_since_OP` to every row of the parameter table.
# `sampleID` is converted to character first so it is a plain string join key;
# rows without a match in `sample_timepoint_days` are kept and get NA.
paramspace <- paramspace %>%
  mutate(sampleID = as.character(sampleID)) %>%
  left_join(sample_timepoint_days, by = "sampleID")

head(paramspace)
dim(paramspace)

In [None]:
# Plot, per patient, how often k-mers are observed in each cfDNA sample.
#
# For every patient id in `paramspace` this cell
#   1. reads ../data/<pt>/<cfDNA_folder>/<input_folder>/<filename> for each of
#      the patient's samples,
#   2. tags each table with the sample's `sample_timepoint_days_since_OP`,
#   3. plots log10(n) against count (count <= max_kmer_count), one colour per
#      timepoint, as a smoothed line on top of the faint raw line,
#   4. saves the figure as a PNG and stores the ggplot object in `plots`.
data_all <- NULL

# A list (rather than NULL) so plots can be assigned by patient id.
plots <- list()

getPalette <- colorRampPalette(brewer.pal(9, "Blues"))

# Only k-mers observed at most this many times are plotted.
max_kmer_count <- 1000

# Folder that receives one PNG per patient. It is created up front so that
# ggsave() does not fail when the folder is missing.
plot_dir <- file.path(
    "../plotting_results", output_folder, "kmer_distributions",
    "unique_tumor_cfDNA_intersection_kmer_distributions"
)
dir.create(plot_dir, recursive = TRUE, showWarnings = FALSE)

for (pt in unique(paramspace$pt_id)) {
    pt_paramspace <- paramspace %>% filter(pt_id == pt)
    print(pt)

    # Read all sample tables first and bind them once, instead of growing a
    # data frame with rbind() on every iteration.
    sample_tables <- lapply(seq_len(nrow(pt_paramspace)), function(row) {
        fd <- as.character(pt_paramspace[row, "cfDNA_folder"])
        # Deliberately NOT called `sample_timepoint`: that name belongs to the
        # metadata data frame read earlier in the notebook, and reusing it
        # here would overwrite that table.
        timepoint_days <- pt_paramspace[row, "sample_timepoint_days_since_OP"]
        sample_data <- read.table(
            file.path("../data", pt, fd, input_folder, filename)
        )
        sample_data %>% mutate(sample_timepoint = timepoint_days)
    })

    data_all_pt <- do.call(rbind, sample_tables) %>% arrange(sample_timepoint)
    colnames(data_all_pt) <- c("count", "n", "sample_timepoint")

    # Timepoints become discrete levels so each one gets its own colour.
    data_all_pt$sample_timepoint <- as.factor(data_all_pt$sample_timepoint)
    colourCount <- length(unique(data_all_pt$sample_timepoint))
    print(head(data_all_pt))

    data_all_1000 <- data_all_pt %>% filter(count <= max_kmer_count)

    line_smooth <- ggplot(data_all_1000) +
        geom_smooth(
            aes(count, log10(n), color = sample_timepoint),
            size = 0.5, alpha = 1, se = FALSE
        ) +
        geom_line(
            aes(count, log10(n), color = sample_timepoint),
            size = 0.3, alpha = 0.2
        ) +
        theme_minimal() +
        # Disabled annotation kept from the original (`data_only_250` is not
        # defined in this cell):
        #annotate("text", x = 250, y = log10(max(data_only_250$n)) + 0.6, label = data_only_250$patient_id[which(data_only_250$n == max(data_only_250$n))]) +
        scale_color_manual(values = getPalette(colourCount)) +
        labs(
            x = "Number of times k-mer is observed",
            y = "log(Number of k-mers)",
            title = plot_title,
            subtitle = paste0("Patient ", pt, "\nPipeline ", input_folder)
        )

    plot_file <- file.path(
        plot_dir,
        paste0(
            "Patient_", pt,
            "_Unique_tumor_cfDNA_intersection_kmer_count_distribution", ".png"
        )
    )
    ggsave(plot_file, line_smooth, dpi = "print")

    plots[[pt]] <- line_smooth
}
