In [None]:
# adapted from:
# https://ucdavis-bioinformatics-training.github.io/2018-June-RNA-Seq-Workshop/thursday/DE.html
# https://rpubs.com/jrgonzalezISGlobal/transcriptomic_analyses
# https://ucdavis-bioinformatics-training.github.io/2018-September-Bioinformatics-Prerequisites/friday/limma_biomart_vignettes.html
# https://github.com/kevinblighe/EnhancedVolcano

In [None]:
# Dependencies
Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer

suppressWarnings(library(edgeR))
suppressWarnings(library(EnhancedVolcano))
suppressWarnings(library(patchwork)) # combine plots
suppressWarnings(library(magrittr))
suppressWarnings(library(tibble))
suppressWarnings(library(repr))
suppressWarnings(library(stringr))
suppressWarnings(library(dplyr))
suppressWarnings(library(ggplot2))
suppressWarnings(library(extrafont))
suppressWarnings(library(svglite))

suppressMessages(extrafont::font_import(pattern="Arial",prompt=FALSE))
suppressMessages(extrafont::loadfonts())

sessionInfo()

In [None]:
# imports from external file
# gex.r is evaluated inside its own environment, so its top-level definitions
# are reached as gex$<name> throughout this notebook.
gex <- new.env()
source("gex.r", local = gex)

# Data import and preprocessing

In [None]:
# Data
# Make sure the output folders exist; an already existing folder is not reported.
dir.create("figures", showWarnings = FALSE)
dir.create("out", showWarnings = FALSE)

# Locations of the three inputs handed to the loader.
path_to_counts <- "../data/cloud/gex/pbta-rsem-genes.expected_count.tsv"
path_to_annotation <- "../data/cloud/gex/sample_phenotypes.csv"
path_to_gencode <- "../data/cloud/gex/gencode.v47.primary_assembly.basic.annotation.gtf"

# Slow step: all parsing is delegated to the helper defined in gex.r.
data <- gex$load_inputs(path_to_counts, path_to_annotation, path_to_gencode)

In [None]:
# Preview the first rows of the sample annotation table.
head(data$annot)

In [None]:
# Preview the first rows of the count table.
head(data$cts)

In [None]:
# Dimensions of the annotation table, followed by a cross-tabulation of
# cancer type against amplicon class.
dim(data$annot)
table(data$annot[c("cancer_type", "amplicon_class")])

In [None]:
## Lots of R packages work by creating a data *object* and performing
## mathematical transformations on the structured data within.
# https://www.rdocumentation.org/packages/edgeR/versions/3.14.0/topics/DGEList-class

# Model terms. `amplified` and `ecDNA` are deliberately listed last: later
# cells select their coefficients by position from the end of the design matrix.
formula <- ~ data$annot$cohort +
    data$annot$sex +
    data$annot$tumor_history +
    data$annot$age_at_diagnosis +
    data$annot$extent_of_tumor_resection +
    data$annot$cancer_type +
    data$annot$amplified +
    data$annot$ecDNA

# Build the expression object through the project helper in gex.r.
dge <- gex$setup_preprocess_dge(data, formula)

# Batch correction and outlier detection
No need to run this section;  
No batch effect or outliers detected by cohort after removing noncoding genes.

In [None]:
# MDS of the uncorrected data, one colour per cohort.
# Observation: samples cluster by cohort, with a few odd outliers.
options(repr.plot.width = 8, repr.plot.height = 6)
colors <- c("blue", "red", "dark green")[as.factor(data$annot$cohort)]
mds <- plotMDS(dge, gene.selection = "common", col = colors, pch = 16)

In [None]:
# Corrected expression
# Author's notes:
#   - no need to log transform before correction; identical results.
#   - PCs 1 and 2 are driven by extreme values in a handful of outliers.
# NOTE(review): limma documents removeBatchEffect() for log-expression input;
# confirm that passing `dge` directly is intended.
dgec <- removeBatchEffect(dge, batch = as.factor(data$annot$cohort))

# MDS of the cohort-corrected values, coloured by cohort as before.
options(repr.plot.width = 8, repr.plot.height = 6)
colors <- c("blue", "red", "dark green")[as.factor(data$annot$cohort)]
mdsc <- plotMDS(dgec, gene.selection = "common", col = colors, pch = 16)

In [None]:
# Flag samples lying more than three standard deviations away from the
# median on either axis of the uncorrected MDS (`mds`).
mx <- median(mds$x)
sx <- sd(mds$x)
my <- median(mds$y)
sy <- sd(mds$y)
outlier_mask <- (mds$x < mx - 3 * sx) | (mds$x > mx + 3 * sx) |
    (mds$y < my - 3 * sy) | (mds$y > my + 3 * sy)

# Report how many samples were flagged and show their annotation rows.
paste("Identified", sum(outlier_mask), "outlier samples")
data$annot[outlier_mask, ]

In [None]:
# Corrected expression sans outliers
# Observation: cohorts now overlap, but PC1 describes cohort 1 variance and
# PC2 describes cohort 2.
options(repr.plot.width = 8, repr.plot.height = 6)
colors <- c("blue", "red", "dark green")[
    as.factor(data$annot$cohort[!outlier_mask])
]
mdsc <- plotMDS(
    dgec[, !outlier_mask],
    gene.selection = "common",
    col = colors,
    pch = 16
)

In [None]:
# Regenerate the dataset sans outliers
# If the outlier-detection cells above were not run, `outlier_mask` does not
# exist and the previously identified sample IDs (hard-coded below) are used.
if (exists("outlier_mask")) {
    outliers <- rownames(data$annot[outlier_mask, , drop = FALSE])
} else {
    outliers <- c('7316-13','7316-1082','7316-2744','7316-3645','7316-170','7316-9449','7316-1089','7316-290','7316-2144','7316-1886',
                  '7316-3028','7316-2589','7316-2071','7316-3657','7316-3061','7316-477','7316-2614','7316-2138','7316-291')
    # NOTE(review): this branch used to end with `outlers=c()`, a misspelled
    # assignment that had no effect and has been removed. If the intent was to
    # disable the hard-coded list, set `outliers <- character(0)` here instead.
}

# Drop the outlier samples from the count columns and the annotation rows.
# drop = FALSE keeps the table structure even if a single column/row remains.
data$cts <- data$cts[, !(colnames(data$cts) %in% outliers), drop = FALSE]
data$annot <- data$annot[!(rownames(data$annot) %in% outliers), , drop = FALSE]

# NOTE(review): `dge` was created before this filtering, while the model
# formula reads data$annot when the design matrix is built. After running this
# cell, `dge` presumably has to be rebuilt with gex$setup_preprocess_dge(data,
# formula) so that samples and design rows match -- confirm before fitting.

# Model fitting

In [None]:
# Run voom, fit the linear model and moderate it with robust empirical Bayes.
#
# Args:
#   dge:    expression object passed to voom() (created by
#           gex$setup_preprocess_dge in this notebook).
#   design: design matrix used by both voom() and lmFit().
# Returns:
#   The object returned by eBayes(..., robust = TRUE).
# Side effects:
#   Sets the notebook plot size and draws voom's diagnostic plot.
fit_dge_lm <- function(dge, design) {
    # This takes a minute
    options(repr.plot.width = 7, repr.plot.height = 7)

    message("Fitting voom normalization...")
    voom_out <- voom(dge, design, plot = TRUE)

    message("Fitting linear model...")
    linear_fit <- lmFit(voom_out, design)

    message("Calculating emperical bayes statistics...")
    eBayes(linear_fit, robust = TRUE)
}

# Design matrix for the full model, then the fit itself.
design <- model.matrix(formula)
fit <- fit_dge_lm(dge, design)

In [None]:
# Apply multiple testing correction and obtain stats
# The contrasts of interest are addressed by position: ecDNA is the last
# design column and amplification the one before it.
ecDNA_comparison <- ncol(design)
amp_comparison <- ncol(design) - 1

# Return the complete topTable() result for one coefficient as a tibble.
#
# Args:
#   fit:              fitted model (must carry a `design` matrix).
#   comparison_index: column index of the coefficient in fit$design.
# Prints the coefficient name and the first rows as a sanity check.
get_de_genes <- function(fit, comparison_index) {
    coef_name <- colnames(fit$design)[comparison_index]
    print(paste("Getting DE genes w.r.t.", coef_name))

    stats_df <- tibble(topTable(fit, number = Inf, coef = comparison_index))
    print(head(stats_df))
    stats_df
}

ec_stats_df <- get_de_genes(fit, ecDNA_comparison)
amp_stats_df <- get_de_genes(fit, amp_comparison)

In [None]:
# ecDNA results ordered by decreasing logFC. The q-value filter below is
# intentionally left disabled, so every gene is kept.
#   filter(adj.P.Val < 0.10)
hits <- arrange(ec_stats_df, desc(logFC))
head(hits, n = 30)

# Export both result tables as tab-separated files.
write.table(
    hits,
    file = "out/differential_expression_ecDNA.tsv",
    quote = FALSE, sep = "\t", row.names = FALSE
)
write.table(
    arrange(amp_stats_df, desc(logFC)),
    file = "out/differential_expression_amp.tsv",
    quote = FALSE, sep = "\t", row.names = FALSE
)

In [None]:
# Last 30 rows of `hits`, i.e. the genes with the lowest logFC.
hits %>% tail(n=30)

In [None]:
## Plotting code

# Colour used for each highlighted gene set in the volcano plots. The keys are
# the gene-set names passed to the osc_volcano_* functions ("other" is the
# background). Every value is an exact entry of grDevices::colors(); the
# previous "darkgreen " carried a trailing space and only resolved because R's
# colour lookup ignores blanks.
color_code <- list(
    "replication-dependent histones" = "blue",
    "replication-independent histones" = "darkgreen",
    "histone pseudogenes" = "darkorange",
    "other" = "grey50",
    "c-NHEJ" = "royalblue3",
    "Alt-EJ" = "darkblue",
    "SSA" = "blue3",
    "HR" = "purple4",
    "HOXA@" = "black",
    "HOXB@" = "cyan",
    "HOXC@" = "mediumorchid4",
    "HOXD@" = "slateblue4"
)

# Shared y-axis label: -Log10(q) with an italic q.
ylabel <- expression(-Log[10]*"("*italic(q)*")")

# Default ggplot2 theme for the notebook: classic theme, 7 pt Arial, with
# black axis text.
base_theme <- theme_classic(base_size = 7, base_family = "Arial") +
    theme(axis.text = element_text(size = 7, colour = "black"))
theme_set(base_theme)

# Shared implementation of osc_volcano_i() and osc_volcano_ii(): a volcano plot
# in which the genes of one gene set are enlarged and coloured.
#
# Args:
#   stats_df:      result table with columns ID, logFC and adj.P.Val.
#   gene_set:      vector of gene IDs to highlight.
#   gene_set_name: name of the set; must be a key of `color_code`.
#   ylim:          y-axis limits forwarded to EnhancedVolcano().
# Returns:
#   The plot with the shared `ylabel` applied.
# Side effects:
#   Sets the notebook plot size (repr.plot.width / repr.plot.height).
.osc_volcano_gene_set <- function(stats_df, gene_set, gene_set_name, ylim) {
    set_color <- color_code[[gene_set_name]]
    if (is.null(set_color)) {
        stop("No colour defined in color_code for gene set '",
             gene_set_name, "'", call. = FALSE)
    }

    stats_df$highlight <- stats_df$ID %in% gene_set
    # Highlighted rows are moved to the end of the table (presumably so that
    # they are drawn on top of the background points).
    stats_df <- stats_df[order(stats_df$highlight), ]

    # One colour per row, named by the group it belongs to.
    colCustom <- ifelse(stats_df$highlight, set_color, "grey50")
    names(colCustom) <- ifelse(stats_df$highlight, gene_set_name, "other")

    plt <- suppressWarnings(
        EnhancedVolcano(stats_df,
                        lab = stats_df$ID,
                        title = NULL,
                        subtitle = NULL,
                        caption = NULL,
                        axisLabSize = 14,
                        x = 'logFC',
                        y = "adj.P.Val",
                        xlim = c(-4,4),
                        ylim = ylim,
                        pCutoff = 0.05,
                        FCcutoff = 10,
                        labSize = 0,
                        pointSize = ifelse(stats_df$highlight, 3, 1),
                        colCustom = colCustom,
                        colAlpha = .6)
    )
    options(repr.plot.width=18, repr.plot.height=7)
    plt + ylab(ylabel)
}

# Highlight a specific gene set. Plot axes specific to ecDNA FC and qvalues
# (y axis limited to 0-4).
osc_volcano_i <- function(stats_df,gene_set,gene_set_name){
    .osc_volcano_gene_set(stats_df, gene_set, gene_set_name, ylim = c(0,4))
}

# Highlight a specific gene set. Plot axes specific to amp FC and qvalues
# (y axis limited to 0-25).
osc_volcano_ii <- function(stats_df,gene_set,gene_set_name){
    .osc_volcano_gene_set(stats_df, gene_set, gene_set_name, ylim = c(0,25))
}

# Volcano plot that labels a fixed, hand-picked list of genes.
#
# Args:
#   stats_df: result table with columns ID, logFC and adj.P.Val.
# Returns:
#   The plot with the shared `ylabel` applied.
# Side effects:
#   Sets the notebook plot size to 5 x 5.
osc_volcano_v <- function(stats_df){
    # unique() guards against genes listed twice ('H2BU1' was duplicated).
    highlight <- unique(c('GFAP','HOXA6','HOXA9','HOXA10','S100B','XRCC4','CCNB1','CCNA2','CIP2A','FEN1','MYCN',
                          'RAD51','SMC2','RAD17','PRIM2','TIMELESS','CDK7','CDK1',
                          #'CENPK','AURKA','CENPN','CENPL',
                          'H3C15','H2AW','H2BU1', 'H3C13','H3C14','H3C11','H3C4','H2BC18','H2AC13','H4C13','H1-5',
                          'H2AC11','H4C3'))
    stats_df$highlight <- ifelse(stats_df$ID %in% highlight,"notable significant","other")
    # Rows are reordered so that the "other" rows come first and the labelled genes last.
    stats_df <- stats_df[order(stats_df$highlight=='other',decreasing=TRUE),]
    stats_df$color <- ifelse(stats_df$highlight=='other','grey50','black')
    # A former per-row lookup into `color_code` was removed here: its result
    # was discarded and "notable significant" is not a key of that list.

    plt <- suppressWarnings(
        EnhancedVolcano(stats_df,
                        lab = stats_df$ID,
                        title = NULL,
                        subtitle = NULL,
                        caption = NULL,
                        axisLabSize = 14,
                        x = 'logFC',
                        y = "adj.P.Val",
                        #xlim = c(-2.75,2.75),
                        ylim = c(0,4),
                        pCutoff = 0.05,
                        selectLab = highlight,
                        labSize = 4.21644413212,#3.37315530569,#
                        FCcutoff = 10,
                        vline = NULL,
                        vlineType = "blank",
                        legendPosition = "none",
                        pointSize = ifelse(stats_df$highlight == "other", 1, 3),
                        colCustom = setNames(stats_df$color,stats_df$highlight),
                        drawConnectors = TRUE,
                        maxoverlapsConnectors = Inf,
                        lengthConnectors = unit(0, "npc"),
                        colAlpha = .6)
    )
    options(repr.plot.width=5, repr.plot.height=5)
    plt + ylab(ylabel)
}

In [None]:
# Labelled volcano for the ecDNA contrast; warnings are muted while it is
# built and displayed, then switched back on.
options(warn = -1)
plt <- osc_volcano_v(ec_stats_df)

# Figure size; the notebook preview is shown at twice that size.
w <- 14.4 / 4
h <- 3.55
options(repr.plot.width = 2 * w, repr.plot.height = 2 * h)
#gex$write_plot(plt,"selected_volcano",w,h)
plt
options(warn = 0)

In [None]:
# Histone gene-set volcanoes: ecDNA contrast on the top row, amplification
# contrast on the bottom row. Warnings are muted for the duration of the cell.
options(warn = -1)

histone_sets <- gex$read_gmt("out/histone-sets.gmt")
rdh <- histone_sets[["replication_dependent_histones"]]
nrdh <- histone_sets[["replication_independent_histones"]]
H_pseudogenes <- histone_sets[["histone_pseudogenes"]]

p1 <- osc_volcano_i(ec_stats_df, rdh, "replication-dependent histones")
p2 <- osc_volcano_i(ec_stats_df, nrdh, "replication-independent histones")
#p3 = osc_volcano_i(ec_stats_df,H_pseudogenes, 'histone pseudogenes')
p4 <- osc_volcano_ii(amp_stats_df, rdh, "replication-dependent histones")
p5 <- osc_volcano_ii(amp_stats_df, nrdh, "replication-independent histones")
#p6 = osc_volcano_ii(amp_stats_df,H_pseudogenes, 'histone pseudogenes')

# patchwork layout: two panels per row, rows stacked.
plt <- (p1 + p2) / (p4 + p5)
w <- 14.4 / 4 * 2
h <- 9
options(repr.plot.width = w, repr.plot.height = h)
gex$write_plot(plt, "rdh_volcano", w, h)
plt
options(warn = 0)

In [None]:
# Volcanoes for the gene sets in out/dsbr-sets.gmt, one panel per set:
# ecDNA contrast on the top row, amplification contrast on the bottom row.
options(warn = -1)
dsbr_sets <- gex$read_gmt("out/dsbr-sets.gmt")

p1 <- osc_volcano_i(ec_stats_df, dsbr_sets[["c-NHEJ"]], "c-NHEJ") |
    osc_volcano_i(ec_stats_df, dsbr_sets[["Alt-EJ"]], "Alt-EJ") |
    osc_volcano_i(ec_stats_df, dsbr_sets[["SSA"]], "SSA") |
    osc_volcano_i(ec_stats_df, dsbr_sets[["HR"]], "HR")

p2 <- osc_volcano_ii(amp_stats_df, dsbr_sets[["c-NHEJ"]], "c-NHEJ") |
    osc_volcano_ii(amp_stats_df, dsbr_sets[["Alt-EJ"]], "Alt-EJ") |
    osc_volcano_ii(amp_stats_df, dsbr_sets[["SSA"]], "SSA") |
    osc_volcano_ii(amp_stats_df, dsbr_sets[["HR"]], "HR")

# Stack the two rows, save the figure and show it.
plt <- p1 / p2
w <- 14.4
h <- 9
options(repr.plot.width = w, repr.plot.height = h)
gex$write_plot(plt, "dsbr_volcano", w, h)

plt
options(warn = 0)

In [None]:
# Volcanoes for the gene sets in out/hox-sets.gmt, one panel per set:
# ecDNA contrast on the top row, amplification contrast on the bottom row.
options(warn = -1)
hox_sets <- gex$read_gmt("out/hox-sets.gmt")

p1 <- osc_volcano_i(ec_stats_df, hox_sets[["HOXA@"]], "HOXA@") |
    osc_volcano_i(ec_stats_df, hox_sets[["HOXB@"]], "HOXB@") |
    osc_volcano_i(ec_stats_df, hox_sets[["HOXC@"]], "HOXC@") |
    osc_volcano_i(ec_stats_df, hox_sets[["HOXD@"]], "HOXD@")

p2 <- osc_volcano_ii(amp_stats_df, hox_sets[["HOXA@"]], "HOXA@") |
    osc_volcano_ii(amp_stats_df, hox_sets[["HOXB@"]], "HOXB@") |
    osc_volcano_ii(amp_stats_df, hox_sets[["HOXC@"]], "HOXC@") |
    osc_volcano_ii(amp_stats_df, hox_sets[["HOXD@"]], "HOXD@")

# Stack the two rows, save the figure and show it.
plt <- p1 / p2
w <- 14.4
h <- 9
options(repr.plot.width = w, repr.plot.height = h)
gex$write_plot(plt, "hox_volcano", w, h)

plt
options(warn = 0)

# Alternative model not correcting for amplification
Specifies a linear model (voom + lmFit) with ecDNA as a covariate but without amplification. Note that the formula below also leaves out cohort.

In [None]:
# Alternative model: same covariates as the main formula, minus `cohort` and
# `amplified`. Rerun this cell if you don't want to correct for cohort.
formula_alt <- ~ data$annot$sex +
    data$annot$tumor_history +
    data$annot$age_at_diagnosis +
    data$annot$extent_of_tumor_resection +
    data$annot$cancer_type +
    data$annot$ecDNA
dge_alt <- gex$setup_preprocess_dge(data, formula_alt)

design_alt <- model.matrix(formula_alt)
fit_alt <- fit_dge_lm(dge_alt, design_alt)

# ecDNA is again the last column of the design matrix.
ecDNA_comparison_alt <- ncol(design_alt)
alt_stats_df <- get_de_genes(fit_alt, ecDNA_comparison_alt)

In [None]:
# Export the alternative-model results, ordered by decreasing logFC.
write.table(
    arrange(alt_stats_df, desc(logFC)),
    file = "out/differential_expression_alt.tsv",
    quote = FALSE, sep = "\t", row.names = FALSE
)

In [None]:
library(ggVennDiagram)

# q-value threshold below which a gene counts as significant.
qt <- 0.05

# Significant gene IDs per contrast, split by the sign of logFC.
#   ec  = ecDNA coefficient of the full model
#   amp = amplification coefficient of the full model
#   alt = ecDNA coefficient of the alternative model
upregulated <- list(
    ec = filter(ec_stats_df, adj.P.Val < qt, logFC > 0)$ID,
    amp = filter(amp_stats_df, adj.P.Val < qt, logFC > 0)$ID,
    alt = filter(alt_stats_df, adj.P.Val < qt, logFC > 0)$ID
)
downregulated <- list(
    ec = filter(ec_stats_df, adj.P.Val < qt, logFC < 0)$ID,
    amp = filter(amp_stats_df, adj.P.Val < qt, logFC < 0)$ID,
    alt = filter(alt_stats_df, adj.P.Val < qt, logFC < 0)$ID
)

# Three-set diagrams (all contrasts).
p1 <- ggVennDiagram(
    upregulated,
    category.names = c("ec controlling for amp", "amplified", "ec not controlling for amp")
) +
    scale_x_continuous(expand = expansion(mult = .3)) +
    scale_fill_distiller(palette = "Reds", direction = 1) +
    ggtitle(paste0("Significantly upregulated genes (q < ", qt, ")"))

p2 <- ggVennDiagram(
    downregulated,
    category.names = c("ec controlling for amp", "amplified", "ec not controlling for amp")
) +
    scale_x_continuous(expand = expansion(mult = .3)) +
    scale_fill_distiller(palette = "Blues", direction = 1) +
    ggtitle(paste0("Significantly downregulated genes (q < ", qt, ")"))

# Two-set diagrams: the ecDNA contrast with vs. without the amplification term.
p3 <- plot_venn(
    process_data(
        Venn(upregulated[c(1, 3)], names = c("ec controlling for amp", "ec not controlling for amp")),
        shape_id = "201"
    )
) +
    #scale_x_continuous(expand = expansion(mult = .2)) +
    scale_fill_distiller(palette = "Reds", direction = 1) +
    ggtitle(paste0("Significantly upregulated genes (q < ", qt, ")"))

p4 <- plot_venn(
    process_data(
        Venn(downregulated[c(1, 3)], names = c("ec controlling for amp", "ec not controlling for amp")),
        shape_id = "201"
    )
) +
    #scale_x_continuous(expand = expansion(mult = .2)) +
    scale_fill_distiller(palette = "Blues", direction = 1) +
    ggtitle(paste0("Significantly downregulated genes (q < ", qt, ")"))

# 2 x 2 layout, saved and displayed.
plt <- (p1 + p2) / (p3 + p4)
w <- 12
h <- 10
options(repr.plot.width = w, repr.plot.height = h)
gex$write_plot(plt, "venns", w, h)

plt

# Regression w.r.t. age for no particular reason

In [None]:
# reminder, here's what the formula for our regression looks like
#formula = ~ data$annot$cohort + data$annot$sex + data$annot$tumor_history + data$annot$age_at_diagnosis + data$annot$extent_of_tumor_resection + 
#            data$annot$cancer_type + data$annot$amplified + data$annot$ecDNA

# Locate the age coefficient by its design-matrix column name instead of a
# hard-coded position, which silently shifts whenever a factor covariate gains
# or loses a level. The previous fixed index (11) is kept as a fallback for
# the case where the column is not found under that exact name (e.g. age
# stored as a non-numeric variable).
age_comparison <- match("data$annot$age_at_diagnosis", colnames(fit$design))
if (is.na(age_comparison)) {
    warning("Age coefficient not found by name in the design matrix; ",
            "falling back to column 11", call. = FALSE)
    age_comparison <- 11
}
age_stats_df <- get_de_genes(fit, age_comparison)
age_stats_df %>% head(n=30)

In [None]:
# plot counts by variable
#
# Plot the cpm() values of one gene against one annotation column, together
# with the least-squares line of lm(expression ~ covariate).
#
# Args:
#   dge:       expression object accepted by cpm(); rows are indexed by gene.
#   annot:     sample annotation table (assumed to be in the same sample order
#              as the columns of `dge` -- verify against the caller).
#   gene:      row name of the gene to plot.
#   covariate: name of the column of `annot` to plot against.
# Returns:
#   A ggplot object: scatter plot for numeric covariates, box plot for
#   logical covariates. Any other covariate type stops with 'not implemented'.
plot_gex <- function(dge,annot,gene,covariate){
    y <- cpm(dge)[gene,]
    x <- annot[[covariate]]
    if (is.null(x)) {
        stop("covariate '", covariate, "' not found in annot", call. = FALSE)
    }
    model <- lm(y ~ x)
    intercept <- coef(model)[1]
    slope <- coef(model)[2]
    message(paste('slope:',slope,'intercept:',intercept))
    # is.numeric()/is.logical() also accept integer columns and objects with
    # more than one class, which a comparison against class(x) does not.
    if (is.numeric(x)){
        plot_type <- geom_point()
        line_intercept <- intercept
    } else if (is.logical(x)){
        plot_type <- geom_boxplot()
        # On the discrete axis FALSE sits at position 1 and TRUE at position 2,
        # so the line is shifted by one slope to pass through `intercept` at
        # FALSE and `intercept + slope` at TRUE.
        line_intercept <- intercept - slope
    } else{
        stop('not implemented')
    }
    # Named plot_df so the notebook's global `data` is not shadowed.
    plot_df <- data.frame(x=x,y=y)
    plt <- ggplot(plot_df,aes(x=x,y=y)) +
        plot_type +
        geom_abline(intercept = line_intercept, slope = slope, color = "blue") +
        theme_classic(base_size=14, base_family="Arial") +
        theme(axis.text = element_text(size=14,colour="black"))+
        labs(x = covariate, y = paste0(gene," expression"))
    return(plt)
}
w <- 8
h <- 8
options(repr.plot.width=w, repr.plot.height=h)
plot_gex(dge,data$annot,'CBX7','age_at_diagnosis')

In [None]:
# Same gene, plotted against the `amplified` annotation column.
w <- 8
h <- 8
options(repr.plot.width = w, repr.plot.height = h)
plot_gex(dge, data$annot, "CBX7", "amplified")

# Old plots

In [None]:
# Legacy volcano plot that labels and enlarges the genes of one gene set.
# NOTE(review): running this cell overwrites the osc_volcano_i() defined in the
# plotting section above; re-run that cell before regenerating its figures.
#
# Args:
#   stats_df:      result table with columns ID, logFC and adj.P.Val.
#   gene_set:      gene IDs to label and enlarge.
#   gene_set_name: accepted for signature compatibility; not used here.
# Returns:
#   The EnhancedVolcano plot. Also sets the notebook plot size to 18 x 7.
osc_volcano_i <- function(stats_df, gene_set, gene_set_name) {
    in_set <- stats_df$ID %in% gene_set
    volcano <- EnhancedVolcano(
        stats_df,
        lab = stats_df$ID,
        title = NULL,
        subtitle = NULL,
        caption = NULL,
        axisLabSize = 14,
        x = "logFC",
        y = "adj.P.Val",
        xlim = c(-3, 3),
        ylim = c(0, 3),
        pCutoff = 0.05,
        drawConnectors = TRUE,
        maxoverlapsConnectors = Inf,
        #lengthConnectors = unit(2, "npc"),
        selectLab = gene_set,
        pointSize = ifelse(in_set, 3, 1)
    )
    options(repr.plot.width = 18, repr.plot.height = 7)
    volcano
}

# Legacy volcano plot colouring all three histone gene sets at once.
# Reads the globals `rdh`, `nrdh`, `H_pseudogenes`, `color_code` and `ylabel`.
#
# Args:
#   stats_df: result table with columns ID, logFC and adj.P.Val.
# Returns:
#   The plot with the shared `ylabel` applied. Also sets the notebook plot
#   size to 8 x 9.
osc_volcano_iii <- function(stats_df){
    # Group labels must be keys of `color_code`. The `nrdh` genes were formerly
    # labelled "canonical histones", which has no colour entry: the lookup
    # returned NULL and the colour column became a list.
    stats_df$highlight <- ifelse(stats_df$ID %in% rdh,"replication-dependent histones",
                                 ifelse(stats_df$ID %in% nrdh, "replication-independent histones",
                                        ifelse(stats_df$ID %in% H_pseudogenes, "histone pseudogenes", "other")))
    # Rows are reordered so that the "other" rows come first and the highlighted genes last.
    stats_df <- stats_df[order(stats_df$highlight=='other',decreasing=TRUE),]
    # vapply() guarantees a plain character vector (one colour per row).
    stats_df$color <- vapply(stats_df$highlight, function(x) color_code[[x]],
                             character(1), USE.NAMES = FALSE)

    plt <- EnhancedVolcano(stats_df,
                    lab = stats_df$ID,
                    title = NULL,
                    subtitle = NULL,
                    caption = NULL,
                    axisLabSize = 14,
                    x = 'logFC',
                    y = "adj.P.Val",
                    xlim = c(-2.75,2.75),
                    ylim = c(0,3),
                    pCutoff = 0.05,
                    FCcutoff = 10,
                    labSize=0,
                    pointSize = ifelse(stats_df$highlight == "other", 1, 3),
                    colCustom = setNames(stats_df$color,stats_df$highlight)
                    )
    options(repr.plot.width=8, repr.plot.height=9)
    return(plt + ylab(ylabel)) #+ lims(x=c(0,4),y=c(-4,4))
}

# Legacy plain volcano plot with connector-drawn labels and no gene-set
# highlighting.
#
# Args:
#   stats_df: result table with columns ID, logFC and adj.P.Val.
# Returns:
#   The EnhancedVolcano plot. Also sets the notebook plot size to 12 x 12.
osc_volcano_iv <- function(stats_df) {
    volcano <- EnhancedVolcano(
        stats_df,
        lab = stats_df$ID,
        title = NULL,
        subtitle = NULL,
        caption = NULL,
        axisLabSize = 14,
        x = "logFC",
        y = "adj.P.Val",
        xlim = c(-2.75, 2.75),
        ylim = c(0, 3),
        pCutoff = 0.05,
        drawConnectors = TRUE,
        maxoverlapsConnectors = Inf,
        lengthConnectors = unit(0, "npc")
    )
    options(repr.plot.width = 12, repr.plot.height = 12)
    volcano
}

In [None]:
# Three side-by-side panels built with the legacy osc_volcano_i() from this
# section; its unused gene_set_name argument is omitted.
osc_volcano_i(ec_stats_df, rdh) +
    osc_volcano_i(ec_stats_df, nrdh) +
    osc_volcano_i(ec_stats_df, H_pseudogenes)

In [None]:
# Plain volcano for the ecDNA contrast (saving is disabled).
plt <- osc_volcano_iv(ec_stats_df)
#write_plot(plt,"sig_volcano",11,11)
plt

In [None]:
# Plain volcano for the amplification contrast.
plt <- osc_volcano_iv(amp_stats_df)
plt