# Differential TSS occupancy
of CUT&Tag data received 2026-01-13  
using limma

## MDS
Given an expression matrix (genes × samples):
1. For each pair of samples, compute:
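    The leading log-fold-change distance: the root-mean-square log2 fold change across the `top` genes with the largest absolute log2 fold changes between that pair of samples (limma defaults: `top = 500`, `gene.selection = "pairwise"`)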
2. Perform PCoA on that distance matrix
3. Plot the first two MDS dimensions

## Results
Using robust empirical Bayes (`robust = TRUE`), no hits for any comparison.  
Using robust = FALSE,
- DE for IDHi:
  |gene     |logFC	        |AveExpr	|t	        |P.Value    	| adj.P.Val     |        B|
  |---------|---------------|-----------|-----------|---------------|---------------|---------|
  |ROCK1P1	|-0.67959413	|4.26730113	|-6.374184	|1.755699e-07	|0.004328501	|7.0067709|
- No significant hits for UNC0379 effect
- No significant hits for vorasidenib-specific effect
- No significant hits for synergistic effect

However, ROCK1P1 is a major outlier in average TSS occupancy, so further investigation would be warranted.

In [None]:
# Load dependencies
# Every library() call is wrapped in suppressWarnings() so that warnings raised
# while attaching a package do not clutter the notebook output.

Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer
suppressWarnings(library(limma))
#suppressWarnings(library(plyranges))
suppressWarnings(library(dplyr))
suppressWarnings(library(tibble))
suppressWarnings(library(readr))
suppressWarnings(library(ggplot2))
suppressWarnings(library(ggrepel))
suppressWarnings(library(extrafont))
suppressWarnings(library(svglite))
suppressWarnings(library(patchwork)) # combine plots

# Register fonts matching "Arial" with extrafont (no interactive prompt) and
# load them; the plotting theme below uses base_family = "Arial".
# NOTE(review): font_import() runs on every execution of this cell, which may
# be slow -- presumably it only needs to run once per machine; confirm.
suppressMessages(extrafont::font_import(pattern="Arial",prompt=FALSE))
suppressMessages(extrafont::loadfonts())

# Record the working directory (input/output paths below are relative) and the
# R / package versions used for this analysis.
getwd()
sessionInfo()

In [None]:
# Plotting defaults: classic theme in 14 pt Arial, black 14 pt axis text and
# square panels, installed as the session-wide ggplot2 theme.
base_theme <- theme_classic(base_family = "Arial", base_size = 14) +
  theme(
    aspect.ratio = 1,
    axis.text = element_text(colour = "black", size = 14)
    # plot.margin = unit(c(0, 0, 0, 0), "null")
  )
theme_set(base_theme)

# Save a ggplot to disk as both PNG and SVG.
#
# Arguments:
#   plt      The plot object to save. It is passed explicitly to ggsave(); the
#            previous version ignored this argument, so ggsave() fell back to
#            ggplot2::last_plot() and could write a different plot than the
#            one the caller supplied.
#   outfile  Output path WITHOUT extension (e.g. "figures/diff-tss-mds").
#            The directory part is created if it does not exist.
#   width    Figure width in inches.
#   height   Figure height in inches.
#
# Returns (invisibly) the value of the final ggsave() call.
write_plot <- function(plt, outfile, width, height) {
  file_stem <- basename(outfile)
  out_dir <- dirname(outfile)
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

  # Kept from the original implementation: this changes the session-wide
  # default encoding for pdf() devices. It has no effect on the PNG/SVG files
  # written here, but later PDF output in the session may rely on it.
  pdf.options(encoding = 'ISOLatin2.enc')

  ggsave(
    filename = paste0(file_stem, ".png"), plot = plt, device = "png",
    path = out_dir, width = width, height = height, units = 'in'
  )
  ggsave(
    filename = paste0(file_stem, ".svg"), plot = plt, device = "svg",
    path = out_dir, width = width, height = height, units = 'in'
  )
}

In [None]:
# Load a per-gene quantification table and return it as a log2(x + 1) matrix.
#
# Arguments:
#   data_location  Path to a delimited text file with a header row and a
#                  `gene_name` column, which becomes the row names.
#   metadata       Optional data frame with a `sample` column. When given, the
#                  columns of the result are restricted to, and ordered as,
#                  `metadata$sample`.
#
# Returns a numeric matrix (genes x samples) of log2(value + 1).
# Stops with an explicit message if any requested sample is not a column of
# the table.
load_cnt <- function(data_location, metadata = NULL) {
  message("loading gene expression data from ", data_location, "...")
  cts <- read.table(
    data_location,
    check.names = FALSE, header = TRUE, row.names = 'gene_name'
  )
  if (!is.null(metadata)) {
    # as.character(): a factor would otherwise index columns by integer code.
    samples <- as.character(metadata$sample)
    absent <- setdiff(samples, colnames(cts))
    if (length(absent) > 0) {
      stop(
        "samples listed in metadata but missing from ", data_location, ": ",
        paste(absent, collapse = ", "),
        call. = FALSE
      )
    }
    # drop = FALSE keeps a one-sample selection as a data frame, so gene and
    # sample names survive the conversion to a matrix.
    cts <- cts[, samples, drop = FALSE]
  }
  log2(as.matrix(cts) + 1)
}

# Read the sample sheet and derive the treatment annotations used downstream.
#
# Arguments:
#   data_location  Path to a table with a header row; the code below reads the
#                  columns DS1001b, vorasidenib and UNC0379
#                  (assumed to be logical flags -- TODO confirm).
#
# Returns the table with four added columns:
#   IDH1i      factor of (DS1001b | vorasidenib)
#   IDH2i      factor of vorasidenib
#   KMT5Ai     factor of UNC0379
#   treatment  one of 'combination', 'IDHi', 'KMT5Ai', 'control'
load_metadata <- function(data_location) {
  message("loading sample metadata from ", data_location, "...")
  sample_sheet <- read.table(data_location, header = TRUE)
  sample_sheet %>%
    mutate(
      IDH1i = factor(DS1001b | vorasidenib),
      IDH2i = factor(vorasidenib),
      KMT5Ai = factor(UNC0379),
      # First matching rule wins, so the combined condition is tested first.
      treatment = case_when(
        (DS1001b | vorasidenib) & UNC0379 ~ 'combination',
        (DS1001b | vorasidenib) ~ 'IDHi',
        UNC0379 ~ 'KMT5Ai',
        TRUE ~ 'control'
      )
    )
}

# Input files, relative to the working directory.
# Original author's note on the quantification table: 28263 genes.
quant_path <- '../data/20260113/CnT_gene_promoterMeanCPM_byRep_TSSpm2kb_260109.tsv'
metadata_path <- '../data/20260113/cnt_sample_metadata.tsv'

# Metadata is read first because load_cnt() uses its `sample` column to select
# and order the columns of the quantification matrix.
metadata <- load_metadata(metadata_path)
quant <- load_cnt(quant_path, metadata)

# Show the first rows of the matrix and the full sample sheet.
head(quant)
metadata

## QC
The data are heteroskedastic even after log transformation,  
so use limma-trend (`eBayes(..., trend = TRUE)`).

In [None]:
# Mean-variance QC: per-gene mean vs. variance of the log2 matrix on log-log
# axes, with dashed lines at the 10th percentile of each. The objects `mu`,
# `var`, `m_cut` and `v_cut` are reused by the filtering cell further down.
library(matrixStats)

mu <- rowMeans(quant)
var <- matrixStats::rowVars(quant)
df <- tibble(mean = mu, variance = var)

# 10th-percentile cut-offs for mean and variance.
m_cut <- quantile(df$mean, probs = 0.10)
v_cut <- quantile(df$variance, probs = 0.10)

plt <- ggplot(df, aes(x = mean, y = variance)) +
  geom_point(size = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  geom_vline(xintercept = m_cut, linetype = "dashed") +
  geom_hline(yintercept = v_cut, linetype = "dashed")

write_plot(plt, 'figures/diff-tss-mv', 6, 6)
plt

In [None]:
# Histogram (100 bins) of every entry of `quant`, the log2(x + 1) matrix
# returned by load_cnt().
# NOTE(review): the title says "Raw values" although `quant` is already
# log2-transformed -- presumably "raw" means unfiltered; confirm.
hist(quant, breaks = 100, main = "Raw values", xlab = "Value")

In [None]:
# Normal Q-Q plot of all entries of `quant` flattened into one vector, with the
# qqline() reference line overlaid.
qqnorm(as.vector(quant)); qqline(as.vector(quant))

In [None]:
# Keep genes whose mean AND variance both exceed their 10th-percentile
# cut-offs (`mu`, `var`, `m_cut`, `v_cut` come from the mean-variance QC cell).
keep <- mu > m_cut & var > v_cut
# Matrix dimensions before and after filtering.
dim(quant)
quant_filtered <- quant[keep,]
dim(quant_filtered)

In [None]:
# MDS of the filtered matrix: compute coordinates with limma::plotMDS()
# (plot = FALSE returns them without drawing) and plot them with ggplot2,
# coloured by treatment and labelled by sample.
mds <- plotMDS(quant_filtered, plot = FALSE)

# The coordinates are in matrix-column order and are bound to `metadata` by
# position, so fail loudly if the two are not aligned.
stopifnot(identical(colnames(quant_filtered), as.character(metadata$sample)))

# `metadata` already carries the `sample` column; adding it again (as the
# previous version did) made data.frame() create a duplicate `sample.1`.
df_mds <- data.frame(
  Dim1 = mds$x,
  Dim2 = mds$y,
  metadata
)

plt <- ggplot(df_mds, aes(Dim1, Dim2, color = treatment)) +
  geom_point(size = 3) +
  geom_text_repel(aes(label = sample), vjust = -0.5) +
  labs(x = "MDS1", y = "MDS2")
write_plot(plt, 'figures/diff-tss-mds', 6, 6)
plt

In [None]:
# Display the MDS coordinates together with the sample annotations.
df_mds

In [None]:
# Design matrix built from the factors derived in load_metadata():
#   IDH1i  = DS1001b | vorasidenib
#   IDH2i  = vorasidenib
#   KMT5Ai = UNC0379
# plus the IDH1i:KMT5Ai interaction.
design <- model.matrix(
  ~ IDH1i + IDH2i + KMT5Ai + IDH1i:KMT5Ai,
  data = metadata
)

# make.names() turns the coefficient names into syntactically valid
# identifiers so they can be referenced in makeContrasts()
# (e.g. the interaction becomes IDH1iTRUE.KMT5AiTRUE).
colnames(design) <- make.names(colnames(design))
colnames(design)

# Fit one linear model per gene (coefficients and residual variances).
fit <- lmFit(quant_filtered, design)

In [None]:
# Name the coefficients of interest as contrasts and re-express the fit in
# terms of them. Each contrast is a single design column:
#   IDHi            - IDH1iTRUE coefficient (IDH1i is DS1001b | vorasidenib)
#   IDH2i_vs_IDH1i  - IDH2iTRUE coefficient (IDH2i is vorasidenib), i.e. the
#                     additional shift in vorasidenib samples on top of IDHi
#   KMT5Ai          - KMT5AiTRUE coefficient (UNC0379)
#   Combination     - the IDH1i:KMT5Ai interaction coefficient
# NOTE(review): "Combination" is the interaction term, not the
# combination-vs-control difference -- presumably intended as the synergy
# test mentioned in the Results section; confirm.
contrasts <- makeContrasts(
  IDHi = IDH1iTRUE,
  IDH2i_vs_IDH1i = IDH2iTRUE,
  KMT5Ai = KMT5AiTRUE,
  Combination = IDH1iTRUE.KMT5AiTRUE,
  levels = design
)
fit <- contrasts.fit(fit,contrasts)

In [None]:
# Empirical Bayes moderation of the per-gene statistics.
# trend = TRUE requests limma-trend (see the QC section); robust = TRUE
# requests the robust variant. The Results section reports that
# robust = FALSE yields one hit, whereas this setting yields none.
fit <- eBayes(fit, trend = TRUE, robust=TRUE)

In [None]:
# Export limma's plotSA() diagnostic for the moderated fit as PNG and SVG.
path_stem <- 'figures/diff-tss-sa'

# Figure size in inches (w, h) and PNG resolution in dpi (r).
w <- 6
h <- 6
r <- 300

# png()/svg() do not create directories; make sure the target exists instead
# of relying on an earlier write_plot() call having created it.
dir.create(dirname(path_stem), recursive = TRUE, showWarnings = FALSE)

# png() takes pixel dimensions, hence inches * dpi.
pngName <- paste0(path_stem, '.png')
png(pngName, width = w * r, height = h * r, res = r)
plotSA(fit)
dev.off()

# svg() takes dimensions in inches directly.
svgName <- paste0(path_stem, '.svg')
svg(svgName, width = w, height = h)
plotSA(fit)
dev.off()

In [None]:
# Classify each gene for each contrast using BH-adjusted p-values with a 0.1
# cut-off; method = 'separate' handles each contrast on its own.
dec <- decideTests(fit,method='separate',adjust.method='BH',p.value=0.1)
# Summary table of the classification per contrast.
summary(dec)

In [None]:
# Full result table for each contrast (number = Inf returns every gene rather
# than only the top few).
res_idh <- topTable(fit, coef = "IDHi", number = Inf)
res_idh2 <- topTable(fit, coef = "IDH2i_vs_IDH1i", number = Inf)
res_kmt <- topTable(fit, coef = "KMT5Ai", number = Inf)
res_combo <- topTable(fit, coef = "Combination", number = Inf)

In [None]:
# First rows of the IDHi result table.
res_idh %>% head

In [None]:
# Mean of the filtered log2 values for ROCK1P1 across all samples.
quant_filtered['ROCK1P1',] %>% mean

In [None]:
# Square root of the fitted sigma for ROCK1P1.
# NOTE(review): limma documents `sigma` as the residual standard deviation, so
# this is sqrt(SD) -- presumably chosen to match the y-axis of plotSA() so the
# gene can be located on that plot; confirm this is the intended quantity.
sqrt(fit$sigma['ROCK1P1'])