In [None]:
# Dependencies
Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer

suppressWarnings(library(edgeR))
suppressWarnings(library(patchwork)) # combine plots
suppressWarnings(library(magrittr))
suppressWarnings(library(tibble))
suppressWarnings(library(repr))
suppressWarnings(library(stringr))
suppressWarnings(library(dplyr))
suppressWarnings(library(ggplot2))
suppressWarnings(library(extrafont))
suppressWarnings(library(svglite))
suppressWarnings(library(RColorBrewer))
suppressWarnings(library(sva))

suppressMessages(extrafont::font_import(pattern="Arial",prompt=FALSE))
suppressMessages(extrafont::loadfonts())

sessionInfo()

In [None]:
# imports from external file
# The helpers defined in gex.r are evaluated inside a dedicated environment,
# so they are reached as gex$<name> below instead of being attached to the
# global workspace.
gex <- new.env()
source("gex.r", local = gex)

In [None]:
# Data
# Create the output folders used later in this notebook; showWarnings = FALSE
# keeps re-runs quiet when the folders already exist.
invisible(lapply(c("figures", "out"), dir.create, showWarnings = FALSE))

# Input locations, relative to the notebook's working directory.
path_to_counts <- "../data/cloud/gex/pbta-rsem-genes.expected_count.tsv"
path_to_annotation <- "../data/cloud/gex/sample_phenotypes.csv"
path_to_gencode <- "../data/cloud/gex/gencode.v47.primary_assembly.basic.annotation.gtf"

# Slow step: expect this call to take a while.
og_data <- gex$load_inputs(path_to_counts, path_to_annotation, path_to_gencode)

In [None]:
generate_data_v2 <- function(data, batch_vars) {
    ## Batch-correct counts with sva::ComBat_seq, one covariate at a time,
    ## keeping every intermediate result.
    ##
    ## Arguments:
    ##   data        object with `cts` (handed to gex$setup_preprocess_dge) and
    ##               `annot` (sample annotation; the columns `amplified`,
    ##               `ecDNA` and every name in `batch_vars` are read from it).
    ##   batch_vars  character vector of annotation column names; corrections
    ##               are applied in this order, each on top of the previous one.
    ##
    ## Returns an unnamed list of length 2:
    ##   [[1]] list of objects returned by gex$setup_preprocess_dge:
    ##         element 1 is built from the uncorrected counts, element k + 1
    ##         from the counts after correcting for the first k batch_vars.
    ##   [[2]] data$annot, unchanged.

    # Fail early with a readable message: a missing column would otherwise
    # become factor(NULL) and only fail deep inside ComBat_seq.
    missing_cols <- setdiff(c("amplified", "ecDNA", batch_vars),
                            names(data$annot))
    if (length(missing_cols) > 0) {
        stop("annotation is missing column(s): ",
             paste(missing_cols, collapse = ", "), call. = FALSE)
    }

    # Biological covariates handed to ComBat_seq as covar_mod
    # (intercept column removed).
    groups <- model.matrix(~ data$annot$amplified + data$annot$ecDNA)[, -1]
    # model.matrix() can drop rows with missing values, which would leave the
    # design with fewer rows than there are annotated samples.
    if (NROW(groups) != length(data$annot[["amplified"]])) {
        stop("rows were dropped while building the covariate design; ",
             "check `amplified` and `ecDNA` for missing values",
             call. = FALSE)
    }

    # Preallocate: one slot for the uncorrected data plus one per covariate.
    cts_list <- vector("list", length(batch_vars) + 1)
    cts_list[[1]] <- gex$setup_preprocess_dge(data$cts, formula = NULL,
                                              filterbyexp = FALSE)

    for (k in seq_along(batch_vars)) {
        covariate <- batch_vars[[k]]
        message("Starting correction on ", covariate)
        batch <- factor(data$annot[[covariate]])
        # Each round starts from the counts produced by the previous round.
        cts_uncorrected <- cts_list[[k]]$counts
        cts_corrected <- sva::ComBat_seq(counts = cts_uncorrected,
                                         batch = batch,
                                         covar_mod = groups)
        cts_list[[k + 1]] <- gex$setup_preprocess_dge(cts_corrected,
                                                      formula = NULL,
                                                      filterbyexp = FALSE)
        message("Finished correction on ", covariate)
    }

    list(cts_list, data$annot)
}

# GSEA input files including various tumor types with ecDNA:
    c('EMBT','ETMR','HGG','MBL','MST','NBL','PINT','RMS','SARC')

In [None]:
# Tumor types that go into the pan-cancer GSEA inputs.
tumor_types <- c("EMBT", "ETMR", "HGG", "MBL", "MST", "NBL", "PINT", "RMS", "SARC")

# Subset the loaded inputs via gex$filter_tumor_types, then run the sequential
# batch correction with cohort first and cancer_type second.
data_subset <- gex$filter_tumor_types(og_data, tumor_types)
all_data <- generate_data_v2(
    data = data_subset,
    batch_vars = c("cohort", "cancer_type")
)

In [None]:
# Shared ggplot2 theme for every figure in this notebook: classic theme in
# 14 pt Arial with black 14 pt axis text.
# (The previous call ended in a stray comma, i.e. an empty positional argument.)
base_theme <- theme_classic(base_size = 14, base_family = "Arial") +
    theme(axis.text = element_text(size = 14, colour = "black"))
theme_set(base_theme)

plot_pca <- function(dge, annot, covariate) {
    ## Scatter plot of the first two plotMDS() dimensions of `dge`, coloured
    ## by one annotation column.
    ##
    ## Arguments:
    ##   dge        expression object accepted by plotMDS().
    ##   annot      sample annotation; must contain `cohort` and `covariate`.
    ##   covariate  name (string) of the annotation column used for colour.
    ##
    ## Returns a ggplot object. Note the axis convention kept from the
    ## original figure: dimension 1 is drawn on the vertical axis (labelled
    ## PC1) and dimension 2 on the horizontal axis (labelled PC2).

    # Shorten cohort labels for the legend.
    annot$cohort <- str_remove(annot$cohort, "^PBTA-")
    classes <- annot[[covariate]]
    class_levels <- unique(classes)
    n_classes <- length(class_levels)

    # One colour per class. brewer.pal() warns and returns 3 colours when
    # asked for fewer than 3, so request at least 3 and keep the first
    # n_classes; this keeps colours and names the same length.
    if (n_classes <= 9) {
        colors <- RColorBrewer::brewer.pal(max(n_classes, 3), "Set1")
    } else if (n_classes <= 12) {
        colors <- RColorBrewer::brewer.pal(n_classes, "Set3")
    } else {
        colors <- rainbow(n_classes)
    }
    colors <- colors[seq_len(n_classes)]
    names(colors) <- class_levels

    mds_data <- plotMDS(dge, gene.selection = "common", plot = FALSE)
    # Deliberately transposed: plotMDS dimension 1 -> y, dimension 2 -> x.
    mds_df <- data.frame(y = mds_data$x, x = mds_data$y, group = classes)

    ggplot(mds_df, aes(x = x, y = y, color = group)) +
        geom_point(size = 2, shape = 16) +
        scale_color_manual(values = colors) +
        labs(color = covariate,
             y = paste0('PC1 (', scales::percent(mds_data$var.explained[1]), ')'),
             x = paste0('PC2 (', scales::percent(mds_data$var.explained[2]), ')'))
}

plot_all <- function(all_data,
                     vars = c('cohort', 'cancer_type', 'amplicon_class')) {
    ## Grid of plot_pca() panels for every correction stage and covariate.
    ##
    ## Arguments:
    ##   all_data  list as returned by generate_data_v2(): [[1]] is the list of
    ##             expression objects, [[2]] the sample annotation.
    ##   vars      annotation columns to colour by; the default reproduces the
    ##             original hard-coded set.
    ##
    ## Returns a patchwork object: the panels of one expression object are
    ## combined with `+`, and the per-object groups are stacked with `/`.
    dge_list <- all_data[[1]]
    annot <- all_data[[2]]
    if (length(dge_list) == 0 || length(vars) == 0) {
        # Previously this surfaced as "object 'plt' not found".
        stop("plot_all() needs at least one expression object and one covariate",
             call. = FALSE)
    }

    rows <- lapply(dge_list, function(dge) {
        panels <- lapply(vars, function(covariate) {
            plot_pca(dge, annot, covariate)
        })
        Reduce(`+`, panels)
    })
    Reduce(`/`, rows)
}

In [None]:
# Build the batch-correction overview for the pan-cancer data, save it through
# gex$write_plot and display it inline.
plt <- plot_all(all_data)

# Size of a single panel; the notebook display and the written figure both
# use three times these values.
w <- 4
h <- 3
options(repr.plot.width = 3 * w, repr.plot.height = 3 * h)
gex$write_plot(plt, "batch-corrections", 3 * w, 3 * h)
plt

In [None]:
# Write files for GSEA:
# gex_all_bc_cohort_tumortype.tmm.gct: tmm normalized gene expression for 406 tumors
#    batch-corrected for cohort and tumor type

# Element 3 of the expression list is the stage after both corrections
# (cohort, then cancer_type); cpm() converts it to counts per million.
exp <- cpm(all_data[[1]][[3]])
filepath <- "out/gex_all_bc_cohort_tumortype.tmm.gct"
gex$write_gct(exp, filepath)

# Matching class labels: amplicon class of every sample.
pheno <- all_data[[2]][["amplicon_class"]]
filepath <- "out/gex_all_ampclass.cls"
gex$write_cls(pheno, filepath)

In [None]:
# gex_amp_bc_cohort_tumortype.tmm.gct: tmm normalized gene expression for 102 tumors with amplification
#    batch-corrected for cohort and tumor type

# Sample selector taken from the `amplified` annotation column.
mask <- all_data[[2]][["amplified"]]

# Same fully corrected stage as above, restricted to the selected samples.
exp <- cpm(all_data[[1]][[3]])
filepath <- "out/gex_amp_bc_cohort_tumortype.tmm.gct"
gex$write_gct(exp[, mask], filepath)

# Class labels for the selected samples: ecDNA status.
pheno <- all_data[[2]][["ecDNA"]]
filepath <- "out/gex_amp_ecDNA.cls"
gex$write_cls(pheno[mask], filepath)

In [None]:
# Cross-tabulate cancer type against amplicon class for the pan-cancer set.
table(all_data[[2]][c("cancer_type", "amplicon_class")])

# GSEA input files for MBs

In [None]:
# Same workflow for the MBL tumor type only, correcting for cohort first and
# cancer_subclass second.
tumor_types <- "MBL"
data_subset <- gex$filter_tumor_types(og_data, tumor_types)
mb_data <- generate_data_v2(
    data = data_subset,
    batch_vars = c("cohort", "cancer_subclass")
)

In [None]:
plot_tumortype <- function(some_data,
                           vars = c('cohort', 'cancer_subclass', 'amplicon_class')) {
    ## Grid of plot_pca() panels for a single-tumor-type data set, for every
    ## correction stage and covariate.
    ##
    ## Arguments:
    ##   some_data  list as returned by generate_data_v2(): [[1]] is the list
    ##              of expression objects, [[2]] the sample annotation.
    ##   vars       annotation columns to colour by; the default reproduces
    ##              the original hard-coded set.
    ##
    ## Returns a patchwork object: the panels of one expression object are
    ## combined with `+`, and the per-object groups are stacked with `/`.
    dge_list <- some_data[[1]]
    annot <- some_data[[2]]
    if (length(dge_list) == 0 || length(vars) == 0) {
        # Previously this surfaced as "object 'plt' not found".
        stop("plot_tumortype() needs at least one expression object and one covariate",
             call. = FALSE)
    }

    rows <- lapply(dge_list, function(dge) {
        panels <- lapply(vars, function(covariate) {
            plot_pca(dge, annot, covariate)
        })
        Reduce(`+`, panels)
    })
    Reduce(`/`, rows)
}

In [None]:
# Build the batch-correction overview for the MBL data, save it through
# gex$write_plot and display it inline.
plt <- plot_tumortype(mb_data)

# Size of a single panel; the notebook display and the written figure both
# use three times these values.
w <- 4
h <- 3
options(repr.plot.width = 3 * w, repr.plot.height = 3 * h)
gex$write_plot(plt, "mb-batch-corrections", 3 * w, 3 * h)
plt

In [None]:
# Write files for GSEA:
# gex_mb_bc_cohort_subtype.tmm.gct: tmm normalized gene expression for 186 medulloblastomas
#    batch-corrected for cohort and cancer subclass

# Element 3 of the expression list is the stage after both corrections
# (cohort, then cancer_subclass); cpm() converts it to counts per million.
exp <- cpm(mb_data[[1]][[3]])
filepath <- "out/gex_mb_bc_cohort_subtype.tmm.gct"
gex$write_gct(exp, filepath)

# Matching class labels: amplicon class of every sample.
pheno <- mb_data[[2]][["amplicon_class"]]
filepath <- "out/gex_mb_ampclass.cls"
gex$write_cls(pheno, filepath)

In [None]:
# gex_mb_amp_bc_cohort_subtype.tmm.gct: tmm normalized gene expression for 30 mb tumors with amplification
#    batch-corrected for cohort and cancer subclass

# Sample selector taken from the `amplified` annotation column.
mask <- mb_data[[2]][["amplified"]]

# Same fully corrected stage as above, restricted to the selected samples.
exp <- cpm(mb_data[[1]][[3]])
filepath <- "out/gex_mb_amp_bc_cohort_subtype.tmm.gct"
gex$write_gct(exp[, mask], filepath)

# Class labels for the selected samples: ecDNA status.
pheno <- mb_data[[2]][["ecDNA"]]
filepath <- "out/gex_mb_amp_ecDNA.cls"
gex$write_cls(pheno[mask], filepath)

# TODO: GSEA input files for HGGs