# recurrent_amplifications.ipynb
Count all oncogenes amplified in n >= 1 tumors. 

## TODO
- import data from data/Supplementary Tables.xlsx
- determine x-axis automatically instead of hard-coded in cancer_order variable
- determine y-axis automatically instead of hard-coded in chromosomal_gene_order variable

In [None]:
library(readxl)
library(ggplot2)
library(tidyverse)
library(janitor)
library(writexl)
library(cowplot)
library(extrafont)
library(svglite)

# Font setup for the figures below, which request base_family = "Arial".
# font_import() is restricted to fonts matching "Arial" and runs without the
# interactive confirmation prompt; loadfonts() then registers the imported fonts.
# NOTE(review): font_import() rescans fonts on every run — presumably only needed once per machine; confirm.
extrafont::font_import(pattern="Arial",prompt=FALSE)
extrafont::loadfonts()

In [None]:
import_ecDNA_amplicons <- function(file_path){
    # Build the table of genes amplified on ecDNA in the deduplicated cohort.
    #
    # file_path: path to the supplementary-tables Excel workbook.
    # Returns: one row per amplified gene with the columns
    #   sample_name, amplicon_number, gene, gene_cn, truncated,
    #   is_canonical_oncogene, patient_id, cancer_type.

    # Sheet "2. Biosamples" columns:
    # biosample_id sex patient_id external_sample_id tumor_history age_at_diagnosis cohort extent_of_tumor_resection file_name cancer_type cancer_subclass ecDNA_sequences_detected amplicon_class in_unique_tumor_set in_unique_patient_set
    biosample_sheet <- suppressWarnings(
      read_excel(file_path, sheet = "2. Biosamples", col_types = "guess")
    )
    # Keep only the deduplicated tumors. 'biosample_id' is exposed as
    # 'sample_name' so it matches the key used by the gene table.
    unique_tumors <- biosample_sheet %>%
        filter(in_unique_tumor_set) %>%
        select(sample_name = biosample_id, patient_id, cancer_type)

    # Sheet "5. Gene amplifications" columns:
    # sample_name amplicon_number feature gene gene_cn truncated is_canonical_oncogene
    gene_sheet <- suppressWarnings(
      read_excel(file_path, sheet = "5. Gene amplifications", col_types = "guess")
    )
    # Rows must be ecDNA features from a tumor in the deduplicated set.
    # (A filter on is_canonical_oncogene is intentionally not applied here.)
    gene_sheet %>%
        filter(
            str_detect(feature, "ecDNA"),
            sample_name %in% unique_tumors$sample_name
        ) %>%
        select(sample_name, amplicon_number, gene, gene_cn, truncated, is_canonical_oncogene) %>%
        left_join(unique_tumors, by = "sample_name")
}

# Path to the supplementary-tables workbook (relative path).
file_path <- "../data/Supplementary Tables.xlsx"  # Update with your actual file path
# Load the ecDNA gene-amplification table used by every later cell, then preview it.
ecDNA_amplicons <- import_ecDNA_amplicons(file_path)
head(ecDNA_amplicons)

In [None]:
#calculate total number of samples with amplification per gene per tumor type

count_gene_amps <- function(ac_tbl){
# Tabulate gene amplifications per cancer type from AmpliconClassifier output.
# ac_tbl: tibble with (at least) the columns "gene" and "cancer_type";
#   see function import_ecDNA_amplicons.
# Returns: wide table with one row per gene and one column per cancer type.
#   Each cell is the number of input rows for that gene / cancer type pair;
#   pairs that never occur are left as NA.
# The grouping created here (.groups = "keep") is deliberately not removed;
# callers may depend on it.
  per_type_counts <- summarize(
    group_by(ac_tbl, cancer_type, gene),
    total_frequency = n(),
    .groups = "keep"
  )
  pivot_wider(
    per_type_counts,
    names_from = cancer_type,
    values_from = total_frequency
  )
}
# gene_freq_all_amplicons <- count_gene_amps(deduplicated_amplicons)
# Gene x cancer-type count table for the ecDNA amplifications; preview the first rows.
gene_freq_ecDNA <- count_gene_amps(ecDNA_amplicons)
head(gene_freq_ecDNA)

In [None]:
# Subset recurrently ecDNA-amplified oncogenes

subset_recurrently_ecDNA_amp_genes <- function(freq_tbl, count=3){
  # Keep only genes whose total count across all tumor types reaches `count`.
  #
  # freq_tbl: genes (rows) x tumor types (cols), with the gene name in the
  #   first column. See count_gene_amps.
  # count: minimum total (summed over tumor types, NA treated as 0) for a gene
  #   to be considered recurrent.
  # Returns: freq_tbl restricted to the recurrent genes, with tumor-type
  #   columns that became entirely empty removed.
  #
  # The threshold is applied in base R on purpose. The previous
  # `filter(count >= count)` compared the `count` column with itself, because
  # dplyr looks names up in the data before the calling environment, so the
  # `count` argument was ignored and every gene was kept.
  gene_totals <- rowSums(freq_tbl[,-1], na.rm = TRUE)
  recurrent_genes <- freq_tbl$gene[gene_totals >= count]
  freq_tbl %>%
    filter(gene %in% recurrent_genes) %>%
    janitor::remove_empty(which = "cols")
}

# Restrict the frequency table to recurrently amplified genes, calling the
# helper with its default `count` argument, then inspect the result and its size.
gene_x_tumor_amp_freq <- gene_freq_ecDNA %>% 
  subset_recurrently_ecDNA_amp_genes()
head(gene_x_tumor_amp_freq)
dim(gene_x_tumor_amp_freq)

In [None]:
subset_genes_of_interest <- function(freq_tbl, genes){
    # Restrict a gene x tumor-type table to the requested genes.
    # freq_tbl: table with a `gene` column.
    # genes: vector of gene names to keep.
    # Returns: the matching rows, without any column that is NA in every
    #   remaining row.
    rows_of_interest <- filter(freq_tbl, gene %in% genes)
    select(rows_of_interest, where(function(column) !all(is.na(column))))
}
# Genes of interest, written in chromosomal order (chr1 -> chr20) and then
# reversed. Each gene is listed exactly once: the list previously contained a
# second 'TERT' (after FOXP4) and a second 'CDK4' (between EGFR and MET), both
# outside chromosomal order.
chromosomal_gene_order <- rev(c(
    "MYCL", "AKT3", "ID2", "MYCN", "GLI2", "PDGFRA", "KIT", "TERT", "FOXP4",
    "EGFR", "MET", "RAD21", "MYC", "CCND1", "BIRC2", "BIRC3",
    "YAP1", "PDGFD", "GLI1", "DDIT3", "OS9", "CDK4", "LRIG3", "MDM2", "CNOT2",
    "IGF1R", "NTN1", "NCOR1", "FLCN", "TOP3A", "PPM1D", "BRIP1", "BRD4",
    "CCNE1", "TFPT", "CNOT3", "PLAGL2"
))
# Display the recurrent-amplification table restricted to the genes of interest.
subset_genes_of_interest(gene_x_tumor_amp_freq, chromosomal_gene_order)
#chromosomal_gene_order <- c("MYCN", "CDK4", "AGAP2", "MYC", "PDGFRA", "PVT1", "CHIC2", "MDMD2", "YEATS4", "DDIT3", 
#                           "GLI1", "KIT", "FIP1L1", "KDR", "CCND2", "MET", "SREBF1", "EGFR", "CDK6", "LRIG3", "IFNG", "FGF6",
#                           "CNOT3", "TERT", "ID2", "COPS3", "TFPT", "WNT2", "CCNE1", "PPM1D", "GLI2", "AKAP9", "PLAG1",
#                           "TFEB", "YAP1", "TRIB1", "PLAGL2", "MYCL", "MMP12", "BIRC3", "BIRC2", "TBC1D15", "LDHB", "HMGA2", 
#                           "PAX7", "FOXO1", "RAD21", "INTS2", "FHL2", "BRIP1", "BRD4", "YWHAQ", "WIF1", "TRIB2", "NDRG1", "IGF1R",
#                           "GAS7", "FLT3", "CCN4", "CAV1", "RRAS2", "KRAS","ETV6", "RB1", "PCM1", "DNPH1", "CFC5L", "CCND3",
#                           "RUNX11", "IRS2", "GATA6", "CDK14", "TRIM33", "NUAK2", "NRAS", "MYB", "MSH6", "MSH2", "MDM4", "ITGA3",
#                           "ID1", "FGF4", "FBXO11", "EPCAM", "CNTN2", "CDX2", "CCND1", "BCL2L1", "ASXL1", "ALK", "BTG1") %>% rev

In [None]:
get_x_barplot_data <- function(ecDNA_amplicons, genes){
  # Per-cancer-type totals for the bar plot along the heatmap's x axis.
  # ecDNA_amplicons: gene-level amplification table (see import_ecDNA_amplicons).
  # genes: gene names to consider.
  # Returns: tibble with columns cancer_type and count, largest count first.
  #
  # Every (sample_name, amplicon_number) pair carrying at least one of `genes`
  # is counted once. This is therefore not a column sum of the heatmap, since
  # one amplification may contain several of the genes.
  amplicons_of_interest <- ecDNA_amplicons %>%
      filter(gene %in% genes) %>%
      distinct(sample_name, amplicon_number, .keep_all = TRUE)
  per_type <- summarize(
    group_by(amplicons_of_interest, cancer_type),
    count = n()
  )
  # cancer_type is left as-is (not converted to a factor); the plotting code
  # orders it by row order.
  arrange(per_type, desc(count))
}
get_y_barplot_data <- function(gene_x_tumor_amp_freq,genes){
  # Per-gene totals for the bar plot along the heatmap's y axis.
  # gene_x_tumor_amp_freq: genes (rows) x tumor types (cols), gene name in the
  #   first column.
  # genes: gene names to keep.
  # Returns: tibble with columns gene and count (row sum over all tumor types,
  #   NA ignored), largest count first.
  gene_totals <- tibble(
    gene = gene_x_tumor_amp_freq$gene,
    count = rowSums(gene_x_tumor_amp_freq[,-1], na.rm = TRUE)
  )
  # gene stays a plain column (no factor conversion); only the row order matters.
  gene_totals %>%
    filter(gene %in% genes) %>%
    arrange(desc(count))
}
## Sort by frequency
sort_heatmap_data <- function(hmd, xord = NULL, decreasing = FALSE){
  # Reorder a wide gene x tumor-type table for plotting.
  #
  # hmd: wide table with identifier column(s) (e.g. gene) plus one numeric
  #   column per tumor type.
  # xord: optional character vector naming the numeric columns in the desired
  #   left-to-right order. NULL (default) keeps the current column order.
  # decreasing: FALSE (default) sorts rows by ascending row total, matching
  #   what the previous arrange(row_sums) call did; TRUE sorts descending.
  # Returns: hmd with non-numeric columns first, then the `xord` columns, and
  #   rows ordered by their total over the numeric columns (NA ignored).
  #
  # Fixes relative to the previous version, which could never run:
  #  - it called get_y_barplot_data() without its required arguments (the
  #    result was unused, so the call is dropped);
  #  - it read an `xord` variable that is not defined at top level (now an
  #    explicit argument);
  #  - arrange() with an external vector fails on grouped input, so grouping
  #    is removed first.
  # NOTE(review): the old inline comment said "decreasing row sums" while the
  # code sorted ascending — confirm which order the figure needs.
  hmd <- ungroup(hmd)
  count_cols <- names(select(hmd, where(is.numeric)))
  if (is.null(xord)) {
    xord <- count_cols
  }
  row_sums <- rowSums(hmd[count_cols], na.rm = TRUE)
  hmd %>%
    select(!where(is.numeric), all_of(xord)) %>%
    slice(order(row_sums, decreasing = decreasing))
}
get_heatmap_data <- function(gene_x_tumor_amp_freq,ecDNA_amplicons,genes){
  # Long-format data for the gene x tumor-type heatmap.
  #
  # gene_x_tumor_amp_freq: genes (rows) x tumor types (cols) with a `gene` column.
  # ecDNA_amplicons: gene-level amplification table (see import_ecDNA_amplicons).
  # genes: gene names to include.
  # Returns: one row per gene / cancer type pair with columns gene,
  #   cancer_type and count. Cancer types follow the x bar plot order; genes
  #   follow the reversed y bar plot order. The plotting code takes its axis
  #   order from this row order.
  xord <- get_x_barplot_data(ecDNA_amplicons,genes)$cancer_type
  yord <- get_y_barplot_data(gene_x_tumor_amp_freq,genes)$gene
  gene_x_tumor_amp_freq %>%
    # `gene` is selected explicitly. select(all_of(xord)) alone keeps it only
    # when the input happens to be grouped by gene (dplyr re-adds grouping
    # columns with a message) and silently drops it otherwise.
    select(gene, all_of(xord)) %>% # Columns in x barplot order
    filter(gene %in% yord) %>%
    arrange(factor(gene, levels = rev(yord))) %>% # Rows in reversed y barplot order
    tidyr::pivot_longer(
      cols = !gene,
      names_to = "cancer_type",
      values_to = "count"
    )
}
#cancer_order <- c("NBL","OST","RMS","ETMR","RBL", "EMBT", "EPN", "GCT", "MST", "PNST", "SARC")


In [None]:
#get_x_barplot_data(ecDNA_amplicons,chromosomal_gene_order)
#get_y_barplot_data(gene_x_tumor_amp_freq,chromosomal_gene_order)
#get_heatmap_data(gene_x_tumor_amp_freq,ecDNA_amplicons,chromosomal_gene_order)

In [None]:
plot_gene_x_tumor_heatmap <- function(heatmap_data,subpanel=FALSE){
  # Draw the gene x tumor-type heatmap.
  # heatmap_data: long table with columns cancer_type, gene and count
  #   (see get_heatmap_data). Both axes are ordered by first appearance in
  #   the data.
  # subpanel: if TRUE, tag the plot with a bold "b"; otherwise no tag.
  # Returns: a ggplot object. Fill is a log10-scaled skyblue -> darkblue
  #   gradient; NA cells are white.
  tag <- if (subpanel) "b" else NULL
  plot_tag <- if (subpanel) {
    element_text(size=8, face = "bold", colour = "black")
  } else {
    NULL
  }

  tile_aes <- aes(x = fct_inorder(cancer_type), y = fct_inorder(gene), fill = count)
  heatmap_theme <- theme(
    axis.text = element_text(size=7, colour="black"),
    axis.text.x = element_text(angle=45, vjust=0.5),
    axis.text.y = element_text(face='italic'),
    plot.tag = plot_tag,
    legend.position = "right",
    legend.direction = "vertical",
    legend.key.size = unit(0.5, "cm"),
    plot.margin = unit(c(0,0,0,0), "cm")
  )

  ggplot(data = heatmap_data, mapping = tile_aes) +
    geom_raster() +
    scale_fill_gradient(
      name = "Patient tumor count",
      na.value = "white",
      trans = 'log10',
      low = "skyblue",
      high = "darkblue"
    ) +
    scale_x_discrete() +
    labs(x = "Tumor type", y = "Oncogene", tag = tag) +
    theme_classic(base_size=7, base_family="Arial") +
    heatmap_theme
}

# Build the heatmap for the genes of interest.
hmd <- get_heatmap_data(gene_x_tumor_amp_freq,ecDNA_amplicons,chromosomal_gene_order)
hm <- plot_gene_x_tumor_heatmap(hmd)
# Pull the legend out as its own object, and keep a legend-free copy of the
# heatmap; both are placed separately in the assembled figure.
leg <- cowplot::get_legend(hm)
hm.clean <- hm + theme(legend.position="none")
hm.clean
#ggsave("out/heatmap_with_legend.svg", plot = hm, width = 12, height = 8)


In [None]:
# Display the heatmap including its legend.
hm

In [None]:
# Create x axis barplot
x_barplot <- function(data, color=FALSE, labels=FALSE, subpanel=FALSE) {
  # Bar plot of counts per cancer type (the heatmap's x-axis margin).
  # data: table with columns cancer_type and count (see get_x_barplot_data);
  #   bars appear in row order.
  # color: if TRUE, fill the bars by count (log10 "Blues" scale).
  # labels: if TRUE, show x-axis tick labels (rotated 45 degrees) and ticks;
  #   otherwise both are blanked.
  # subpanel: if TRUE, tag the plot with a bold "a"; otherwise no tag.
  # Returns: a ggplot object. The x-axis title is always hidden and no legend
  #   is drawn.

  # Bar aesthetics, with or without a fill mapping
  bar_aes <- if (color) {
    aes(x = fct_inorder(cancer_type), y = count, fill = count)
  } else {
    aes(x = fct_inorder(cancer_type), y = count)
  }

  # x-axis tick labels and tick marks
  if (labels) {
    axis_text_x <- element_text(angle=45, vjust=0.5)
    axis_ticks_x <- NULL
  } else {
    axis_text_x <- element_blank()
    axis_ticks_x <- element_blank()
  }

  # Optional subpanel tag
  tag <- if (subpanel) "a" else NULL
  plot_tag <- if (subpanel) {
    element_text(size=8, face = "bold", colour = "black")
  } else {
    NULL
  }

  bar_theme <- theme(
    plot.margin = unit(c(0,0,0,0), "cm"),
    axis.text = element_text(colour="black", size=7),
    plot.tag = plot_tag,
    axis.text.x = axis_text_x,
    axis.title.x = element_blank(),
    axis.ticks.x = axis_ticks_x,
    legend.position = "none"
  )

  ggplot(data = data, mapping = bar_aes) +
    geom_bar(mapping = bar_aes, stat = "identity") +
    theme_classic(base_size=7, base_family="Arial") +
    bar_theme +
    scale_fill_distiller(name = "Value", palette = "Blues", direction = 1, trans = 'log10') +
    labs(x = "Cancer Type", y = "Patient tumors with frequently\necDNA-amplified oncogenes", tag = tag)
}
# Standalone preview of the x-axis bar plot: uncoloured bars with tick labels shown.
xbpd <- get_x_barplot_data(ecDNA_amplicons,chromosomal_gene_order)
bp.x <- x_barplot(xbpd, color=FALSE, labels=TRUE)

# Save the plot as SVG
#ggsave("cancer_type_barplot.svg", plot = bp.x, width = 12, height = 8)  # Adjust width and height as needed

bp.x


In [None]:
# Create y axis barplot
y_barplot <- function(data, color=FALSE, labels=FALSE, subpanel=FALSE){
  # Horizontal bar plot of counts per gene (the heatmap's y-axis margin).
  # data: table with columns gene and count (see get_y_barplot_data); the
  #   first row is drawn at the top.
  # color: if TRUE, fill the bars by count (log10 "Blues" scale).
  # labels: if TRUE, show y-axis tick labels and ticks; otherwise both are
  #   blanked.
  # subpanel: if TRUE, attach an empty-string tag styled as a bold panel
  #   label; otherwise no tag.
  # Returns: a ggplot object with a log10 x axis (log ticks along the bottom),
  #   no y-axis title and no legend.

  # Bar aesthetics, with or without a fill mapping
  bar_aes <- if (color) {
    aes(x = count, y = fct_rev(fct_inorder(gene)), fill = count)
  } else {
    aes(x = count, y = fct_rev(fct_inorder(gene)))
  }

  # y-axis tick labels and tick marks
  if (labels) {
    axis_text_y <- NULL
    axis_ticks_y <- NULL
  } else {
    axis_text_y <- element_blank()
    axis_ticks_y <- element_blank()
  }

  # Optional subpanel tag
  tag <- if (subpanel) "" else NULL
  plot_tag <- if (subpanel) {
    element_text(size=8, face = "bold", colour = "black")
  } else {
    NULL
  }

  bar_theme <- theme(
    plot.margin = unit(c(0,0,0,0), "cm"),
    axis.text = element_text(colour="black", size=7),
    plot.tag = plot_tag,
    axis.text.y = axis_text_y,
    axis.title.y = element_blank(),
    axis.ticks.y = axis_ticks_y,
    legend.position = "none"
  )

  ggplot(data = data, mapping = bar_aes) +
    geom_col(mapping = bar_aes) +
    scale_x_log10() +
    annotation_logticks(sides='b') +
    theme_classic(base_size=7, base_family="Arial") +
    bar_theme +
    scale_fill_distiller(name = "Value", palette = "Blues", direction = 1, trans = 'log10') +
    labs(x = "Patient tumors with frequently\necDNA-amplified oncogenes", tag = tag)
}
# Standalone preview of the y-axis bar plot: uncoloured bars, gene labels shown,
# subpanel tag styling enabled.
ybpd <- get_y_barplot_data(gene_x_tumor_amp_freq,chromosomal_gene_order)
bp.y <- y_barplot(ybpd,color=FALSE,labels=TRUE,subpanel=TRUE)
#ggsave("gene_barplot.svg", plot = bp.y, width = 8, height = 8)  # Adjust width and height as needed
bp.y

In [None]:
assemble_plot <- function(top_bar = bp.x, legend = leg,
                          heatmap = hm.clean, side_bar = bp.y){
  # Arrange the four panels into a 2 x 2 grid:
  #   top-left    = x-axis bar plot   top-right    = legend
  #   bottom-left = heatmap           bottom-right = y-axis bar plot
  # The bottom row is twice as tall as the top row and the left column twice
  # as wide as the right column.
  #
  # Each panel is an argument so the function no longer depends only on
  # global state. The defaults are evaluated at call time and fall back to the
  # notebook's globals bp.x, leg, hm.clean and bp.y, so assemble_plot() with
  # no arguments behaves as before.
  # Returns: the combined plot produced by cowplot::plot_grid().
  cowplot::plot_grid(
    top_bar, legend, heatmap, side_bar,
    align = "hv",
    axis = "lrbt",
    nrow=2,
    ncol=2,
    rel_heights=c(1,2),
    rel_widths=c(2,1)
  )
}
# Rebuild the marginal bar plots with default styling (no fill, no axis
# labels, no tag) so they sit flush against the heatmap.
bp.y <- y_barplot(ybpd)
bp.x <- x_barplot(xbpd)

# Keep the assembled figure in a variable and pass it to ggsave() explicitly,
# so the files contain exactly this plot and not whatever ggplot2 currently
# considers the last plot.
assembled_plot <- assemble_plot()
assembled_plot
# Make sure the output directory exists before saving.
dir.create("out", showWarnings = FALSE)
ggsave(filename="amplicon_plot_grey_bar.png",plot=assembled_plot,path="out",dpi=300,width=7,height=7,units="in",bg="white")
ggsave(filename="amplicon_plot_grey_bar.svg",plot=assembled_plot,path="out",dpi=300,width=7,height=7,units="in")

# Dead code

In [None]:
library(ggplot2)
library(cowplot)  # Ensure cowplot is loaded
library(forcats)  # For factor manipulation

# Existing function to create the heatmap (assumed to be defined already)

# Assemble the plots
assemble_plot <- function(top_bar = bp.x, legend = leg,
                          heatmap = hm.clean, side_bar = bp.y){
  # Arrange the four panels into a 2 x 2 grid:
  #   top-left    = x-axis bar plot   top-right    = legend
  #   bottom-left = heatmap           bottom-right = y-axis bar plot
  # Row heights are 0.9 : 2; column widths use the plot_grid default.
  #
  # Each panel is an argument; the defaults are evaluated at call time and
  # fall back to the notebook's globals bp.x, leg, hm.clean and bp.y, so
  # assemble_plot() with no arguments behaves as before.
  # Returns: the combined plot produced by cowplot::plot_grid().
  cowplot::plot_grid(
    top_bar, legend, heatmap, side_bar,
    align = "hv",
    axis = "lrbt",
    nrow=2,
    ncol=2,
    rel_heights=c(0.9,2)
  )
}

# Generate and display the bar plots
# x_barplot() and y_barplot() both require a data table as first argument.
# The earlier calls passed `cancer_order` (not defined anywhere in this
# notebook) and no data at all, so they could not run. Use the bar plot tables
# computed earlier (xbpd / ybpd).
bp.x <- x_barplot(xbpd)
bp.y <- y_barplot(ybpd, color=FALSE, labels=TRUE, subpanel=TRUE)

# Assemble and save the plot
final_plot <- assemble_plot()
# Make sure the output directory exists before saving.
dir.create("out", showWarnings = FALSE)
# ggsave(filename="amplicon_plot_grey_bar.png", path="out", plot = final_plot, dpi=300, width=7, height=7, units="in", bg="white")
ggsave(filename="amplicon_plot_grey_bar_v4.svg", path="out", plot = final_plot, dpi=300, width=7, height=11, units="in")
final_plot

In [None]:
# Load necessary packages
library(ggplot2)
library(gtable)
library(grid)

# Function to extract and save legend as SVG
get_and_save_legend <- function(myplot, file_name) {
  # Extract the legend of a ggplot, draw it on a fresh page and save it as SVG.
  # myplot: a ggplot object.
  # file_name: output path for the SVG; its directory is created if missing.
  # Returns: NULL, invisibly. A message reports success, or that the plot has
  #   no legend (in which case nothing is drawn or written).

  # Convert ggplot to gtable object
  plot_gtable <- ggplotGrob(myplot)

  # Locate the legend (guide-box) among the gtable's grobs. vapply() is used
  # so the lookup always yields a character vector, whatever the grob list
  # contains.
  grob_names <- vapply(plot_gtable$grobs, function(g) g$name, character(1))
  legend_idx <- which(grob_names == "guide-box")

  if (length(legend_idx) == 0) {
    message("Legend extraction failed: No legend found in the plot.")
    return(invisible(NULL))
  }
  legend_grob <- plot_gtable$grobs[[legend_idx[1]]]

  # Create a new blank page and draw only the legend
  grid.newpage()
  grid.draw(legend_grob)

  # Save the legend to the specified SVG file, creating the target directory
  # first so the save does not fail on a fresh checkout.
  out_dir <- dirname(file_name)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }
  ggsave(file_name, plot = legend_grob, width = 3, height = 1, device = "svg")  # Adjust width and height as needed
  message("Legend saved successfully as SVG.")
  invisible(NULL)
}

# Generate the heatmap plot and save the legend
# plot_gene_x_tumor_heatmap() needs the long-format heatmap table; calling it
# without data fails, so pass the `hmd` table built earlier.
hm <- plot_gene_x_tumor_heatmap(hmd)
get_and_save_legend(hm, "out/legend_only.svg")


In [None]:
library(cowplot)
library(magick)
library(ggplot2)

# Read the saved legend SVG as an image (rendered at width 600) for use in a plot
leg_svg <- image_read_svg("out/legend_only.svg", width = 600)  # Adjust width as needed

# Scale the image to increase its resolution
leg_svg <- image_scale(leg_svg, "200%")  # Scale up by 200%

# Wrap the scaled image in a ggdraw canvas (no caption is added).
# This replaces the `leg` object extracted from the heatmap earlier.
leg <- ggdraw() + 
  draw_image(leg_svg) 

leg


In [None]:
write_outputs <- function(filename,
                          freq_tbl = gene_x_tumor_amp_freq,
                          amplicons = ecDNA_amplicons,
                          genes = chromosomal_gene_order){
  # Write the tables behind the figure to an Excel workbook in "out/".
  # filename: workbook file name (created inside the "out" directory).
  # freq_tbl: gene x tumor-type frequency table.
  # amplicons: gene-level ecDNA amplification table.
  # genes: genes of interest.
  #   The three data arguments default to the notebook's globals
  #   gene_x_tumor_amp_freq, ecDNA_amplicons and chromosomal_gene_order, so
  #   write_outputs(filename) keeps working unchanged.
  # Sheets written: gene_x_tumor_amp_freq (heatmap data), tumor_amp_freq
  #   (x bar plot data), gene_amp_freq (y bar plot data).
  #
  # The helpers were previously called with no arguments, which fails because
  # all of their parameters are required.
  outdir <- file.path("out")
  dir.create(outdir, showWarnings = FALSE)
  path <- file.path(outdir,filename)
  write_xlsx(list(gene_x_tumor_amp_freq = get_heatmap_data(freq_tbl, amplicons, genes),
                  tumor_amp_freq = get_x_barplot_data(amplicons, genes),
                  gene_amp_freq = get_y_barplot_data(freq_tbl, genes)),
             path=path)
}
# Export the figure's underlying tables to a workbook under out/.
write_outputs('PedPanCanAmpliconAnalysisOutputs.xlsx')