# Differential expression analysis of RNA-seq data received 2026-01-13

## Results

### PCA
- MGG152_UNC1 is a clear outlier
- After its removal, PC1 (83% of variance) is strongly correlated with sequencing depth
- No obvious association with treatment in PCs 2 or 3.

### Differential Expression
- DE for IDHi:
    |gene  |  baseMean  |  l2FC      |      lfcSE   |    p       |        q   |
    |------|------------|------------|--------------|------------|------------|
    |CBLN1 |  175.0304	|-3.06166e-05|	0.001442798	|3.860136e-06|	0.04981505|
- DE for KMT5Ai:
    |gene|baseMean|l2FC     | lfcSE   |   p       |      q   |
    |----|--------|---------|---------|-----------|----------|
    |H1-0|382.3084|0.7500887|0.1811774|1.43793e-06|0.01855649|
- No significant hits for vorasidenib-specific effect
- No significant hits for synergistic effect

In [None]:
# Load dependencies

Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer
suppressWarnings(library(DESeq2))
#suppressWarnings(library(plyranges))
suppressWarnings(library(dplyr))
suppressWarnings(library(tibble))
suppressWarnings(library(readr))
suppressWarnings(library(ggplot2))
suppressWarnings(library(ggrepel))
suppressWarnings(library(extrafont))
suppressWarnings(library(svglite))
suppressWarnings(library(patchwork)) # combine plots

# Import Arial into extrafont's font table and load the registered fonts,
# silencing the progress messages both steps emit.
suppressMessages(
    extrafont::font_import(pattern = "Arial", prompt = FALSE)
)
suppressMessages(
    extrafont::loadfonts()
)

# Show the working directory and the R session details.
getwd()
sessionInfo()

In [None]:
# Plotting defaults: classic theme in 14 pt Arial with black 14 pt axis text
# and square panels, installed as the session-wide ggplot2 theme.
base_theme <- theme_classic(base_size = 14, base_family = "Arial") +
    theme(
        axis.text = element_text(size = 14, colour = "black"),
        aspect.ratio = 1
    )
theme_set(base_theme)

# Save a plot as both PNG and SVG.
#
# plt     : the ggplot object to save.
# outfile : output path WITHOUT extension; ".png" and ".svg" are appended.
#           The parent directory is created if it does not exist.
# width, height : figure size in inches.
#
# Returns (invisibly) whatever the final ggsave() call returns.
write_plot <- function(plt, outfile, width, height) {
    out_dir <- dirname(outfile)
    stem <- basename(outfile)
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    # Kept from the original: sets the default encoding for pdf devices
    # (a session-wide option).
    pdf.options(encoding = 'ISOLatin2.enc')

    # Pass `plot = plt` explicitly. Without it ggsave() falls back to
    # last_plot(), which is not guaranteed to be the plot the caller handed in.
    ggsave(
        filename = paste0(stem, ".png"), plot = plt, device = "png",
        path = out_dir, width = width, height = height, units = 'in'
    )
    ggsave(
        filename = paste0(stem, ".svg"), plot = plt, device = "svg",
        path = out_dir, width = width, height = height, units = 'in'
    )
}

# PCA scatter plot of samples that, unlike DESeq2::plotPCA, needs no grouping
# variable and can show any pair of principal components.
#
# dds  : DESeqDataSet; it is variance-stabilised here with vst(blind = TRUE).
# pcs  : two principal-component indices to plot (x axis, y axis).
# n    : number of highest-variance genes used for the PCA.
# w, h : notebook display size, applied through the repr.plot.* options.
#
# Returns a ggplot with one labelled point per sample.
plot_pca <- function(dds, pcs = c(1, 2), n = 500, w = 6, h = 6) {
    vsd <- vst(dds, blind = TRUE)
    expr <- assay(vsd)

    # Keep the n most variable genes (or all of them if there are fewer).
    gene_var <- rowVars(expr)
    n_keep <- min(n, length(gene_var))
    top_genes <- order(gene_var, decreasing = TRUE)[seq_len(n_keep)]

    pca <- prcomp(t(expr[top_genes, ]))
    var_explained <- pca$sdev^2 / sum(pca$sdev^2)

    scores <- as.data.frame(pca$x)
    scores$name <- colnames(vsd)

    x_pc <- paste0("PC", pcs[1])
    y_pc <- paste0("PC", pcs[2])
    x_title <- paste0("PC", pcs[1], ": ", round(var_explained[pcs[1]] * 100), "% variance")
    y_title <- paste0("PC", pcs[2], ": ", round(var_explained[pcs[2]] * 100), "% variance")

    # Size of the inline figure when the plot is displayed in the notebook.
    options(repr.plot.width = w, repr.plot.height = h)

    ggplot(scores, aes(.data[[x_pc]], .data[[y_pc]])) +
        geom_point() +
        geom_text_repel(aes(label = name), vjust = -0.5) +
        xlab(x_title) +
        ylab(y_title)
}

In [None]:
# Read a gene-expression count table from `data_location`.
# Column names are kept exactly as written in the file (check.names = FALSE).
# Returns the table as a data.frame.
load_gex <- function(data_location) {
    message("loading gene expression data from ", data_location, "...")
    read.table(data_location, check.names = FALSE)
}
# Read the sample metadata table (with a header row) and derive the design
# columns used by the models:
#   IDH1i     - factor, TRUE when DS1001b or vorasidenib was given
#   IDH2i     - factor, TRUE when vorasidenib was given
#   KMT5Ai    - factor of the UNC0379 column
#   treatment - 'combination', 'IDHi', 'KMT5Ai' or 'control'
# Returns the annotated data.frame.
load_metadata <- function(data_location) {
    message("loading sample metadata from ", data_location, "...")
    raw <- read.table(data_location, header = TRUE)
    mutate(
        raw,
        IDH1i = factor(DS1001b | vorasidenib),
        IDH2i = factor(vorasidenib),
        KMT5Ai = factor(UNC0379),
        # Conditions are tested in order, so the combination arm must come
        # before the single-agent arms.
        treatment = case_when(
            (DS1001b | vorasidenib) & UNC0379 ~ 'combination',
            (DS1001b | vorasidenib) ~ 'IDHi',
            UNC0379 ~ 'KMT5Ai',
            TRUE ~ 'control'
        )
    )
}

# Input files.
# Full count table: 28263 genes.
counts_path <- '../data/20260113/MGG152_hg19_countDATA_260106.txt'
# Alternative count table (not loaded below): 12918 genes, a subset of
# expressed genes, per the original note obtained by running DESeq2 under
# default parameters.
other_counts_path <- '../data/20260113/MGG152_hg19_countDATA_sel_260106.txt'
metadata_path <- '../data/20260113/rna_sample_metadata.tsv'

metadata <- load_metadata(metadata_path)
counts <- load_gex(counts_path)
metadata

In [None]:
# Build a DESeqDataSet, drop low-count genes and run the DESeq pipeline.
#
# counts   : count table, genes in rows and samples in columns.
# metadata : one row per sample, in the same order as the columns of `counts`.
# design   : model formula passed to DESeqDataSetFromMatrix.
# min_count, smallest_group_size :
#            a gene is kept when at least `smallest_group_size` samples have
#            `min_count` or more reads (defaults reproduce the previous
#            hard-coded 10 and 3).
#
# Returns the fitted DESeqDataSet. Stops when `metadata` has a `sample` column
# whose values do not match the column names of `counts` in order.
create_deseq_data_set <- function(counts, metadata, design,
                                  min_count = 10, smallest_group_size = 3) {
    # Samples are matched to metadata purely by position here, so a reordered
    # count table would silently attach the wrong covariates. Check by name
    # whenever both sides carry sample names.
    if ("sample" %in% names(metadata) && !is.null(colnames(counts))) {
        if (!identical(as.character(metadata$sample), colnames(counts))) {
            stop(
                "columns of `counts` do not match `metadata$sample` ",
                "(same names, same order required)",
                call. = FALSE
            )
        }
    }

    dds <- DESeq2::DESeqDataSetFromMatrix(
        countData = counts,
        colData = metadata,
        design = design
    )

    # `counts(dds)` resolves to the accessor function even though an argument
    # is also called `counts`: R skips non-function objects when looking up a
    # name in call position.
    keep <- rowSums(counts(dds) >= min_count) >= smallest_group_size
    dds <- dds[keep, ]

    DESeq(dds)
}

## Multifactor design
Linear model using all samples.

In [None]:
# Full model on all samples: IDH1i (DS1001b or vorasidenib), IDH2i
# (vorasidenib), KMT5Ai (UNC0379) and the IDH1i x KMT5Ai interaction.
design <- ~ IDH1i + IDH2i + KMT5Ai + IDH1i:KMT5Ai

# Select the count columns by sample name, as the other model fits in this
# notebook do, so column order always agrees with the metadata rows instead
# of relying on the order of columns in the count file.
dds <- create_deseq_data_set(
    counts = counts %>% select(all_of(metadata$sample)),
    metadata = metadata,
    design = design
)
dds
resultsNames(dds)

In [None]:
# PCA of all samples, coloured by treatment (alternative: plot_pca(dds)).
w <- 8
h <- 8
options(repr.plot.width = w, repr.plot.height = h)

vsd <- vst(dds, blind = TRUE)
a <- plotPCA(vsd, intgroup = 'treatment', ntop = 2000) +
    geom_label_repel(aes(label = name))
write_plot(a, outfile = 'results/pca_all', width = w, height = h)
a

In [None]:
# Refit the full model without the outlier sample MGG152_UNC1.
design <- ~ IDH1i + IDH2i + KMT5Ai + IDH1i:KMT5Ai
m <- filter(metadata, sample != 'MGG152_UNC1')
dds <- create_deseq_data_set(
    counts = select(counts, all_of(m$sample)),
    metadata = m,
    design = design
)
dds
resultsNames(dds)

In [None]:
# PCA after outlier removal, coloured by treatment (alternative: plot_pca(dds)).
w <- 8
h <- 8
options(repr.plot.width = w, repr.plot.height = h)

vsd <- vst(dds, blind = TRUE)
a <- plotPCA(vsd, intgroup = 'treatment', ntop = 2000) +
    geom_label_repel(aes(label = name))
write_plot(a, outfile = 'results/pca_drop_outliers', width = w, height = h)
a

In [None]:
# After dropping outliers, PC1 is strongly associated with sequencing depth.
# Show the size factors in ascending order with each sample name next to its
# value. Previously the sorted values lost their names and the names were
# printed separately in unsorted order, so the two outputs did not line up.
x <- sizeFactors(dds)
tibble::enframe(x[order(x)], name = "sample", value = "size_factor")

In [None]:
# PCs 2 and 3 from the 2000 most variable genes; saved at the w x h size set
# in the earlier PCA cells.
a <- plot_pca(dds, pcs = c(2, 3), n = 2000)
write_plot(a, outfile = 'results/pca_2_3', width = w, height = h)
a

In [None]:
# Shrunken log2 fold changes (apeglm) for the IDH1i x KMT5Ai interaction term.
res_syn <- lfcShrink(
    dds,
    coef = 'IDH1iTRUE.KMT5AiTRUE',
    type = 'apeglm'
)

In [None]:
# Synergistic (interaction) effect, UNC1 removed.
# Recorded result: 0 hits at q < 0.1, 0 at p < 0.05.
syn_hits <- filter(as.data.frame(res_syn), padj < 0.1)
syn_hits

In [None]:
# Shrunken log2 fold changes (apeglm) for the IDH1i main effect.
res_idh <- lfcShrink(
    dds,
    coef = 'IDH1i_TRUE_vs_FALSE',
    type = 'apeglm'
)

In [None]:
# IDHi effect, UNC1 removed.
# Recorded result: 1 hit at q < 0.1, 1 at p < 0.05 (CBLN1).
idh_hits <- filter(as.data.frame(res_idh), padj < 0.1)
idh_hits

In [None]:
# Shrunken log2 fold changes (apeglm) for the KMT5Ai main effect.
res_kmt5a <- lfcShrink(
    dds,
    coef = 'KMT5Ai_TRUE_vs_FALSE',
    type = 'apeglm'
)

In [None]:
# KMT5Ai effect, UNC1 removed.
# Recorded result: 1 hit at q < 0.1, 3 at p < 0.05 (H1-0).
kmt5a_hits <- filter(as.data.frame(res_kmt5a), padj < 0.1)
kmt5a_hits

In [None]:
# Shrunken log2 fold changes (apeglm) for the IDH2i (vorasidenib) term.
res_vora <- lfcShrink(
    dds,
    coef = 'IDH2i_TRUE_vs_FALSE',
    type = 'apeglm'
)

In [None]:
# Vorasidenib effect, UNC1 removed.
# Recorded result: 0 hits at q < 0.1, 0 at p < 0.05.
vora_hits <- filter(as.data.frame(res_vora), padj < 0.1)
vora_hits

## Pairwise designs  
For comparison, models using only the 3 replicates in each pairwise comparison.  
(Dead code)


### IDHi samples vs control

In [None]:
# Metadata restricted to the IDHi and control arms, with 'control' as the
# reference level of `treatment`.
meta_idhi <- mutate(
    filter(metadata, treatment %in% c('IDHi', 'control')),
    treatment = relevel(factor(treatment), ref = 'control')
)
meta_idhi

In [None]:
# Single-factor model (treatment only) on the IDHi and control samples.
idhi_counts <- select(counts, all_of(meta_idhi$sample))
dds_idhi <- create_deseq_data_set(
    counts = idhi_counts,
    metadata = meta_idhi,
    design = ~ treatment
)
resultsNames(dds_idhi)

In [None]:
# Shrunken log2 fold changes (apeglm) for IDHi vs control in the subset model.
res_idhi_sub <- lfcShrink(
    dds_idhi,
    coef = 'treatment_IDHi_vs_control',
    type = 'apeglm'
)

In [None]:
# Recorded result: no significant hits (padj < 0.1).
filter(as.data.frame(res_idhi_sub), padj < 0.1)

### Intersection of vorasidenib | DS1001b

In [None]:
# DS1001b-treated IDHi samples vs control.
# Metadata subset, with 'control' as the reference level of `treatment`.
meta_ds <- mutate(
    filter(metadata, treatment == 'control' | (treatment == 'IDHi' & DS1001b)),
    treatment = relevel(factor(treatment), ref = 'control')
)
meta_ds

# Single-factor model on the matching count columns.
dds_ds <- create_deseq_data_set(
    counts = select(counts, all_of(meta_ds$sample)),
    metadata = meta_ds,
    design = ~ treatment
)
resultsNames(dds_ds)

# Shrunken fold changes, then the genes with padj < 0.1.
res_ds <- lfcShrink(dds_ds, coef = 'treatment_IDHi_vs_control', type = 'apeglm')
filter(as.data.frame(res_ds), padj < 0.1)

In [None]:
# Vorasidenib-treated IDHi samples vs control.
# Metadata subset, with 'control' as the reference level of `treatment`.
meta_vi <- mutate(
    filter(metadata, treatment == 'control' | (treatment == 'IDHi' & vorasidenib)),
    treatment = relevel(factor(treatment), ref = 'control')
)
meta_vi

# Single-factor model on the matching count columns.
dds_vi <- create_deseq_data_set(
    counts = select(counts, all_of(meta_vi$sample)),
    metadata = meta_vi,
    design = ~ treatment
)
resultsNames(dds_vi)

# Shrunken fold changes, then the genes with padj < 0.1.
res_vi <- lfcShrink(dds_vi, coef = 'treatment_IDHi_vs_control', type = 'apeglm')
filter(as.data.frame(res_vi), padj < 0.1)