# survival.ipynb
Survival analyses on the pedpancan cohort.

In [None]:
library(tidyverse)
library(readxl)
library(dplyr)
library(stringr)
library(naniar) #for replace with Nas function
library(survival)
library(survminer)
library(RColorBrewer)
library(janitor)
#library(rio)
#library(labelled)
library(gt)
library(gtsummary)
library(ggsurvfit)
library(extrafont)
library(svglite)

#install.packages("devtools")
#library(devtools)
#devtools::install_github("MSKCC-Epi-Bio/bstfun")
#pacman::p_load(bstfun)

# Register Arial with extrafont so it can be used as the base font family of
# the figures below. prompt = FALSE suppresses the confirmation prompt, which
# lets the cell run non-interactively.
extrafont::font_import(pattern = "Arial", prompt = FALSE)
extrafont::loadfonts()

# Record the R version and attached packages alongside the results.
sessionInfo()

In [None]:
## Make sure the output directory exists. showWarnings = FALSE keeps the call
## quiet when the directory is already there.
dir.create("../out", showWarnings = FALSE)

load_survival_data <- function(path, tumor_types = NULL) {
  ## Load the patient table and derive the variables used by the survival
  ## analyses in this notebook.
  ##
  ## path:        path to the supplementary tables workbook (.xlsx); the sheet
  ##              "1. Patients" is read.
  ## tumor_types: (optional) character vector of regular expressions. Only rows
  ##              whose cancer_type matches at least one of them are kept.
  ##              NULL (the default) keeps every tumor type.
  ##
  ## Returns a tibble restricted to rows with non-missing amplicon_class,
  ## OS_status and (numeric) OS_months, with these derived columns:
  ##   OS_months_5y   follow-up time truncated at 60 months
  ##   OS_status_5y   1 if a status other than "Alive" was recorded within
  ##                  60 months, 0 otherwise
  ##   ecDNA_status   factor, "ecDNA+" when amplicon_class is "ecDNA",
  ##                  otherwise "ecDNA-"
  ## amplicon_class (with "intrachromosomal" renamed to "chromosomal") and
  ## cancer_type are converted to factors.
  combinedsurv <- read_excel(path, sheet = "1. Patients")

  # Subset for tumor types if specified. The patterns are collapsed into one
  # alternation so that several tumor types can be requested at once;
  # str_detect() would otherwise recycle the vector element-wise over rows.
  if (length(tumor_types) > 0) {
    tumor_pattern <- paste(tumor_types, collapse = "|")
    combinedsurv <- combinedsurv %>%
      filter(str_detect(cancer_type, tumor_pattern))
  }

  combinedsurv %>%
    # Coerce before dropping NAs so that values which cannot be parsed as
    # numbers are removed together with the genuinely missing ones.
    mutate(OS_months = as.numeric(OS_months)) %>%
    filter(complete.cases(amplicon_class, OS_status, OS_months)) %>%
    mutate(
      # Censor at 5 years = 60 months
      OS_months_5y = if_else(OS_months < 60, OS_months, 60),
      OS_status_5y = if_else(OS_months <= 60 & OS_status != "Alive", 1, 0),
      # ecDNA status
      ecDNA_status = if_else(amplicon_class == "ecDNA", "ecDNA+", "ecDNA-"),
      amplicon_class = if_else(
        amplicon_class == "intrachromosomal", "chromosomal", amplicon_class
      ),
      # convert to factors
      ecDNA_status = factor(ecDNA_status),
      amplicon_class = factor(amplicon_class),
      cancer_type = factor(cancer_type)
    )
}

cox_plot <- function(coxobj, data, outfile = NULL, width = 3, height = 6) {
  ## Print the proportional-hazards check of a fitted Cox model, draw its
  ## forest plot and optionally save the figure.
  ##
  ## coxobj:  fitted Cox model, e.g.
  ##          coxph(Surv(OS_months, OS_status) ~ ecDNA_status + strata(cancer_type), data = data)
  ## data:    data frame passed on to ggforest() (converted with
  ##          as.data.frame()); presumably the data the model was fitted on.
  ## outfile: (optional) file name without extension. When given, the figure is
  ##          written to out/<outfile>.pdf, .png and .svg, relative to the
  ##          working directory.
  ## width, height: size of the saved figure, in inches.
  ##
  ## Returns the forest plot.
  zph <- cox.zph(coxobj)
  print(zph)

  # Forest plot
  plt <- ggforest(coxobj, data = as.data.frame(data)) +
    theme_classic(base_size = 7, base_family = "Arial") +
    theme(
      axis.text = element_text(size = 7, colour = "black"),
      plot.title = element_text(size = 7)
    )

  if (!is.null(outfile)) {
    out_dir <- "out"
    # ggsave() does not reliably create a missing target directory, so make
    # sure it exists before writing.
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
    pdf.options(encoding = "ISOLatin2.enc")
    for (ext in c("pdf", "png", "svg")) {
      # Pass the plot explicitly instead of relying on last_plot().
      ggsave(
        filename = paste0(outfile, ".", ext),
        plot = plt,
        device = ext,
        path = out_dir,
        width = width,
        height = height,
        units = "in"
      )
    }
  }
  plt
}

km_plot <- function(survObj, outfile = NULL, width = 3, height = 3.5) {
  ## Draw Kaplan-Meier curves with censor marks and a numbers-at-risk table,
  ## and optionally save the figure.
  ##
  ## survObj: fitted survival curves, as returned by survfit2().
  ## outfile: (optional) file name without extension. When given, the figure is
  ##          written to out/<outfile>.pdf, .png and .svg, relative to the
  ##          working directory.
  ## width, height: size of the saved figure, in inches.
  ##
  ## Returns the plot.
  n_strata <- length(survObj$n)
  # Stratum names without their "variable=" prefix, e.g. "ecDNA+".
  strata_names <- trimws(sub("^[^=]*=", "", names(survObj$strata)))

  # Fixed palettes for two and three curves. Any other number of curves falls
  # back to the default colours instead of failing on undefined values.
  stratum_colors <- NULL
  stratum_labels <- NULL
  if (n_strata == 2) {
    stratum_colors <- c("blue", "red")
    stratum_labels <- c("ecDNA-", "ecDNA+")
  } else if (n_strata == 3) {
    stratum_colors <- c("magenta", "red", "dodgerblue")
    stratum_labels <- c("chromosomal", "ecDNA", "no amplification")
  }
  # Only override the legend labels when the strata really are the groups the
  # labels name. Other groupings (e.g. TRUE/FALSE) keep their own labels
  # rather than being shown as ecDNA groups.
  legend_labels <- if (identical(strata_names, stratum_labels)) {
    stratum_labels
  } else {
    ggplot2::waiver()
  }

  plt <- survObj %>%
    ggsurvfit(linewidth = 0.5) +
    labs(
      x = "Follow-up time (Months)",
      y = "Overall Survival"
    )
  if (!is.null(stratum_colors)) {
    plt <- plt +
      scale_color_manual(values = stratum_colors, labels = legend_labels) +
      scale_fill_manual(values = stratum_colors, labels = legend_labels)
  }
  plt <- plt +
    scale_y_continuous(limits = c(0, 1)) +
    add_censor_mark(size = .5, alpha = 1) +
    add_risktable(
      risktable_stats = "n.risk", size = 2,
      theme = theme_risktable_default(
        axis.text.y.size = 7,
        plot.title.size = 7
      )
    ) +
    add_risktable_strata_symbol(size = 4) +
    theme_classic(base_size = 7, base_family = "Arial") +
    theme(
      axis.text = element_text(size = 7, colour = "black"),
      legend.position = "bottom"
    )
  # Confidence bands are only drawn when there are at most three curves.
  if (n_strata <= 3) {
    plt <- plt + add_confidence_interval()
  }

  if (!is.null(outfile)) {
    out_dir <- "out"
    # ggsave() does not reliably create a missing target directory, so make
    # sure it exists before writing.
    dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
    pdf.options(encoding = "ISOLatin2.enc")
    for (ext in c("pdf", "png", "svg")) {
      # Pass the plot explicitly instead of relying on last_plot().
      ggsave(
        filename = paste0(outfile, ".", ext),
        plot = plt,
        device = ext,
        path = out_dir,
        width = width,
        height = height,
        units = "in"
      )
    }
  }
  plt
}

# Kaplan-Meier analysis

In [None]:
# KM by amplicon class, censored at 5 years.
# Restricted to tumor types with at least 10 patients, at least one death and
# at least one sample with ecDNA.
dd2 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  group_by(cancer_type) %>%
  filter(
    n() >= 10,
    any(OS_status == "Deceased"),
    any(amplicon_class == "ecDNA")
  ) %>%
  ungroup()
dd2$cancer_type <- droplevels(dd2$cancer_type) # drop unused levels
dd2$cancer_type %>% unique() # print remaining levels

formula <- Surv(OS_months_5y, OS_status_5y) ~ amplicon_class
km <- survfit2(formula = formula, data = dd2)
km_plot(km, "km_class_subset_5year")
# km_plot(km)

# Pairwise log-rank tests between amplicon classes, Benjamini-Hochberg adjusted
logrank <- pairwise_survdiff(formula, dd2, p.adjust.method = "BH", rho = 0)
logrank



# Cox regressions

We include tumor types which satisfy the following:
- At least 10 patients
- At least one death
- At least one ecDNA

In [None]:
# Set up dataset: keep tumor types with at least one ecDNA sample, at least
# one death and at least 10 patients.
dd3 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  group_by(cancer_type) %>%
  filter(any(amplicon_class == "ecDNA")) %>%
  filter(any(OS_status == "Deceased")) %>%
  filter(n() >= 10) %>%
  ungroup()
# Reference levels for the regression
dd3$amplicon_class <- relevel(dd3$amplicon_class, ref = "no amplification")
dd3$cancer_type <- relevel(dd3$cancer_type, ref = "LGG")
dim(dd3)
dd3$cancer_type %>% unique()
dd3$cancer_type <- droplevels(dd3$cancer_type)

# print summary: sample counts per amplicon class (rows) and tumor type
# (columns). count() + pivot_wider() replace the superseded
# summarise() + spread(); names_sort keeps the columns in factor-level order.
dd3 %>%
  count(amplicon_class, cancer_type) %>%
  pivot_wider(names_from = cancer_type, values_from = n, names_sort = TRUE)

# cox regression
m4 <- coxph(
  Surv(OS_months_5y, OS_status_5y) ~
    amplicon_class + cancer_type + sex + age_at_diagnosis,
  data = dd3
)
# Check proportionality assumption
zph <- cox.zph(m4)
print("zph")
print(zph)
m4
cox_plot(m4, dd3, "cox_forest", width = 6, height = 6)

# More plots, not included in the manuscript

## Alternative KM curves on the whole cohort

In [None]:
# KM by ecDNA status on the combined cohort, censored at 5 years.
# No filtering by cancer type or by number of samples.
print(getwd())
data <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx")
formula <- Surv(OS_months_5y, OS_status_5y) ~ ecDNA_status
km <- survfit2(formula = formula, data = data)
km_plot(km)
# km_plot(km, "km_surv_all_5year")

# Log-rank test between the two groups
logrank <- survdiff(formula, data)
logrank

# Size of the dataset used
dim(data)

In [None]:
# KM by amplicon class on the combined cohort.
# No filtering by tumor type or by number of samples.
data <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx")
formula <- Surv(OS_months_5y, OS_status_5y) ~ amplicon_class
km <- survfit2(formula = formula, data = data)
km_plot(km)
# km_plot(km, "km_class_all_5year")

# Pairwise log-rank tests between amplicon classes, Benjamini-Hochberg adjusted
logrank <- pairwise_survdiff(formula, data, p.adjust.method = "BH", rho = 0)
logrank

In [None]:
# KM by ecDNA status, censored at 5 years.
# Restricted to tumor types with at least one sample with ecDNA, at least 10
# patients and at least one deceased patient.
dd2 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  group_by(cancer_type) %>%
  filter(
    any(amplicon_class == "ecDNA"),
    n() >= 10,
    any(OS_status == "Deceased")
  ) %>%
  ungroup()
dd2$cancer_type <- droplevels(dd2$cancer_type) # drop unused levels
dd2$cancer_type %>% unique() # print remaining levels

formula <- Surv(OS_months_5y, OS_status_5y) ~ ecDNA_status
km <- survfit2(formula = formula, data = dd2)
km_plot(km)
# km_plot(km, "km_surv_subset_5year")

# Log-rank test between the two groups
logrank <- survdiff(formula, dd2)
logrank


## Survival of HGG

In [None]:
# KM by amplicon class, censored at 5 years, for tumor types whose name
# contains "HGG".
dd2 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  filter(str_detect(cancer_type, "HGG"))

formula <- Surv(OS_months_5y, OS_status_5y) ~ amplicon_class
km <- survfit2(formula = formula, data = dd2)
# km_plot(km, "km_HGG_5year")
km_plot(km)

# Pairwise log-rank tests between amplicon classes, Benjamini-Hochberg adjusted
logrank <- pairwise_survdiff(formula, dd2, p.adjust.method = "BH", rho = 0)
logrank

## HGG subtypes

In [None]:
# Cox regression on ecDNA status and tumor subclass, for tumor types whose
# name contains "HGG".
dd5 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  filter(str_detect(cancer_type, "HGG"))

# Size of the dataset (rows, columns). length() on a data frame would only
# give the number of columns.
dim(dd5)

formula <- Surv(OS_months_5y, OS_status_5y) ~ ecDNA_status + cancer_subclass
m2 <- coxph(formula = formula, data = dd5)
m2
# Check proportionality assumption
coxZph <- cox.zph(m2)
print("coxZph: ")
print(coxZph)

## Survival of H3K27 mutant HGG.
Cox analysis: TP53 mutation is associated with a markedly worse prognosis; ecDNA status is not significant.

In [None]:
# Samples whose subclass contains "K27", with a flag for subclasses that also
# mention TP53.
dd4 <- load_survival_data("../data/Supplementary Tables 12_1_24.xlsx") %>%
  filter(str_detect(cancer_subclass, "K27")) %>%
  # str_detect() already returns a logical vector
  mutate(TP53_mutant = str_detect(cancer_subclass, "TP53"))

# Size of the dataset (rows, columns). length() on a data frame would only
# give the number of columns.
dim(dd4)

# Cox regression on ecDNA status and TP53 status
formula <- Surv(OS_months_5y, OS_status_5y) ~ ecDNA_status + TP53_mutant
m2 <- coxph(formula = formula, data = dd4)
m2
# Check proportionality assumption
coxZph <- cox.zph(m2)
print("coxZph: ")
print(coxZph)

# KM by ecDNA status
formula <- Surv(OS_months_5y, OS_status_5y) ~ ecDNA_status
km <- survfit2(formula = formula, data = dd4)
km_plot(km)
# km_plot(km, "km_K27_ecDNA_5year")
logrank <- survdiff(formula, dd4)
logrank

# KM by TP53 status
formula <- Surv(OS_months_5y, OS_status_5y) ~ TP53_mutant
km <- survfit2(formula = formula, data = dd4)
km_plot(km)
# km_plot(km, "km_K27_TP53_5year")
logrank <- survdiff(formula, dd4)
logrank
