# Differential expression analysis
Data are EP1NS cells (ependymoma ZFTA-fusion cell line) treated with shL1CAM, shSHTN1, or shControl.
[DESeq2 docs](https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html).

## Required input files:
*shL1CAM-shSHTN1.expected_counts.tsv*.  
Generated by running RSEM on all samples using the following parameters:  
`rsem-calculate-expression --star --star-gzipped-read-file --paired-end --append-names $R1 $R2 $REF $SAMPLE`  
where the reference is Ensembl release 113 (GRCh38.p14).

*sample_metadata.tsv*.  
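Sample metadata indicating shRNA treatments. Formatted as a delimited table with a header row: the first column holds the sample name (matching the column names of the count matrix) and a `treatment` column holds the shRNA treatment.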

*Homo_sapiens.GRCh38.113.gtf*.  
Gene annotation file used for STAR, RSEM. Download from https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz.

In [None]:
# Load dependencies

Sys.setenv(LANGUAGE = "en") # set language to "ja" if you prefer
suppressWarnings(library(DESeq2))
suppressWarnings(library(plyranges))
suppressWarnings(library(dplyr))
suppressWarnings(library(tibble))
suppressWarnings(library(readr))
suppressWarnings(library(ggplot2))
suppressWarnings(library(ggrepel))
suppressWarnings(library(extrafont))
suppressWarnings(library(svglite))
suppressWarnings(library(patchwork)) # combine plots

# Import and load the Arial font through extrafont (Arial is the plotting
# font family used by the theme defined in this notebook).
suppressMessages(
    extrafont::font_import(pattern = "Arial", prompt = FALSE)
)
suppressMessages(
    extrafont::loadfonts()
)

# Show the working directory and the loaded package versions.
getwd()
sessionInfo()

In [None]:
# Plotting defaults: classic theme, 14 pt Arial, black axis text, square panels.
base_theme <- theme_classic(base_size = 14, base_family = "Arial") +
    theme(
        axis.text = element_text(size = 14, colour = "black"),
        aspect.ratio = 1
        # plot.margin = unit(c(0, 0, 0, 0), "null")
    )
theme_set(base_theme)

write_plot <- function(plt, outfile, width, height) {
    # Save a plot as both PNG and SVG.
    #
    # Args:
    #   plt:     plot object to save (passed to ggsave's `plot` argument).
    #   outfile: output path WITHOUT extension, e.g. "results/deseq2/pca";
    #            ".png" and ".svg" are appended. The directory is created
    #            if it does not exist.
    #   width:   plot width in inches.
    #   height:  plot height in inches.
    #
    # Returns:
    #   The value of the final ggsave() call (the SVG save).
    file_stem <- basename(outfile)
    out_dir <- dirname(outfile)
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    # Kept from the original: sets the global PDF device encoding, even though
    # PDF output is currently disabled (see the commented-out call below).
    pdf.options(encoding = 'ISOLatin2.enc')
    # ggsave(path = "figures", filename = paste0(outfile, ".pdf"), device = "pdf",
    #        width = width, height = height, units = 'in')

    # Pass `plt` explicitly: without `plot =`, ggsave() falls back to
    # last_plot(), which is not necessarily the plot handed to this function.
    ggsave(path = out_dir, device = "png", filename = paste0(file_stem, ".png"),
           plot = plt, width = width, height = height, units = 'in')
    ggsave(path = out_dir, device = "svg", filename = paste0(file_stem, ".svg"),
           plot = plt, width = width, height = height, units = 'in')
}

# Get and process your data

In [None]:
# Input file locations (relative to the working directory).
data_location       <- "results/rsem/shL1CAM-shSHTN1.expected_counts.tsv"
metadata_location   <- "anno/sample_metadata.tsv"
annotation_location <- "anno/ensembl-113/Homo_sapiens.GRCh38.113.gtf"

In [None]:
load_annotations <- function(annotation_path) {
    # Import a .gtf annotation file and keep only the gene-level records
    # whose biotype is protein-coding.
    # Written against the Ensembl 113 annotation:
    # https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz
    #
    # Args:
    #   annotation_path: path to the .gtf file.
    #
    # Returns:
    #   The imported annotation restricted to protein-coding gene records.
    all_records <- rtracklayer::import(annotation_path)
    gene_records <- filter(all_records, type == "gene")
    filter(gene_records, gene_biotype == "protein_coding")
}

filter_protein_coding <- function(gene_matrix, annotation_path) {
    # Subset a count matrix to protein-coding genes.
    #
    # Args:
    #   gene_matrix:     matrix with rownames of the form
    #                    ENSG00000000003_TSPAN6 (stable ID, underscore, name).
    #   annotation_path: path to the .gtf file passed to load_annotations().
    #
    # Returns:
    #   The rows of `gene_matrix` whose stable gene ID (the part of the
    #   rowname before the first underscore) appears among the protein-coding
    #   gene IDs of the annotation. Always a matrix, even when zero or one
    #   row is kept.
    message('filtering for protein-coding genes ...')
    g <- load_annotations(annotation_path)
    # Match on the stable ENSG ID only: strip everything from the first "_".
    ensembl_ids <- sub("_.*", "", rownames(gene_matrix))
    keep <- ensembl_ids %in% g$gene_id
    # drop = FALSE: a single surviving row must stay a 1-row matrix so that
    # callers can still read and modify its rownames.
    gene_matrix[keep, , drop = FALSE]
}

load_gex <- function(data_location, annotation_location) {
    # Load an expected-counts table and prepare it as an integer-valued
    # count matrix of protein-coding genes.
    #
    # Args:
    #   data_location:       tab-separated counts file with a `gene_id` column.
    #   annotation_location: .gtf path handed to filter_protein_coding().
    #
    # Returns:
    #   Matrix of rounded counts for protein-coding genes; the leading
    #   "ENSG<digits>_" prefix is stripped from the rownames.
    message("loading gene expression data from ", data_location, "...")
    raw_table <- read.delim(data_location, row.names = "gene_id",
                            check.names = FALSE)
    # Expected counts are fractional; round them to whole numbers.
    rounded_counts <- round(as.matrix(raw_table))
    coding_counts <- filter_protein_coding(rounded_counts, annotation_location)
    rownames(coding_counts) <- sub("^ENSG\\d*_", "", rownames(coding_counts))
    coding_counts
}

In [None]:
# Load the count matrix: rounded counts, protein-coding genes only.
data <- load_gex(data_location, annotation_location)

In [None]:
# Read the sample metadata (first column = sample name) and reorder its rows
# to follow the column order of the count matrix.
annot <- read.table(metadata_location, row.names = 1, header = TRUE)

# Fail loudly if a sample in the count matrix has no metadata row: match()
# would otherwise return NA and silently produce an all-NA annotation row.
missing_samples <- setdiff(colnames(data), rownames(annot))
if (length(missing_samples) > 0) {
    stop("samples missing from ", metadata_location, ": ",
         paste(missing_samples, collapse = ", "), call. = FALSE)
}
annot <- annot[match(colnames(data), rownames(annot)), , drop = FALSE]

# Make the reference level explicit rather than relying on alphabetical order;
# the downstream contrasts are named "treatment_<shRNA>_vs_control".
annot$treatment <- relevel(factor(annot$treatment), ref = "control")

In [None]:
# Drop possibly problematic controls
# drop = c("shNC_REP3","shNC_REP4")
# data = data[, setdiff(colnames(data),drop)]
# annot = annot[setdiff(rownames(annot),drop), , drop=FALSE]

In [None]:
# Preview the first rows of the count matrix and show the sample annotation.
head(data)
annot

# Full linear model


In [None]:
# Build the DESeq2 dataset with a single-factor design on treatment.
dds <- DESeq2::DESeqDataSetFromMatrix(
    countData = data,
    colData = annot,
    design = ~ treatment
)

# Pre-filter: keep genes with a count of at least 10 in at least
# `smallestGroupSize` samples.
smallestGroupSize <- 2
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep, ]

# Fit the model.
dds <- DESeq(dds)

# Show the fitted object summary.
dds

# Show the model coefficient names.
resultsNames(dds)

In [None]:
# Per-sample size factors from the fitted DESeq2 object.
# Author's observation: looks like much more RNA was extracted from all controls
# than all treatment samples.
# NOTE(review): size factors normalize for sequencing depth / library composition;
# whether they reflect the amount of RNA extracted should be confirmed.
sizeFactors(dds)

In [None]:
# QC via PCA on variance-stabilized counts.
# Observation: PC1 separates controls from treatments (good). PC2 separates
# shL1CAM from shSHTN1, and also good controls from slight outlier controls.
vsd <- vst(dds, blind = TRUE)

w <- 6
h <- 6
a <- plotPCA(vsd, intgroup = c("treatment")) +
    geom_label_repel(aes(label = name))
write_plot(a, outfile = "results/deseq2/pca", width = w, height = h)

# Size of the inline notebook plot, then display it.
options(repr.plot.width = 6, repr.plot.height = 6)
a

## Uncomment to show pairwise plots
#vsd_s1 <- vsd[ , vsd$treatment %in% c("shSHTN1", "control") ]
#b = plotPCA(vsd_s1, intgroup=c("treatment")) + geom_label_repel(aes(label = name))
#vsd_s2 <- vsd[ , vsd$treatment %in% c("shL1CAM", "control") ]
#c = plotPCA(vsd_s2, intgroup=c("treatment")) + geom_label_repel(aes(label = name))
#vsd_s3 <- vsd[ , vsd$treatment %in% c("shL1CAM", "shSHTN1") ]
#d = plotPCA(vsd_s3, intgroup=c("treatment")) + geom_label_repel(aes(label = name))
# options(repr.plot.width=12, repr.plot.height=12)
#(a | b) / (c | d)

In [None]:
## Differential expression versus control for each shRNA.
## Wald test with BH correction; post-hoc LFC shrinkage with apeglm.
res_l1cam <- lfcShrink(
    dds,
    coef = "treatment_shL1CAM_vs_control",
    res = results(dds, name = "treatment_shL1CAM_vs_control",
                  independentFiltering = TRUE),
    type = "apeglm"
)

res_shtn1 <- lfcShrink(
    dds,
    coef = "treatment_shSHTN1_vs_control",
    res = results(dds, name = "treatment_shSHTN1_vs_control",
                  independentFiltering = TRUE),
    type = "apeglm"
)

In [None]:
get_hits <- function(deseq_result, decreasing = FALSE, outfile = NULL) {
    # Sort a results table by log2 fold change and return the significant rows.
    #
    # Args:
    #   deseq_result: results object convertible with as.data.frame(); must
    #                 provide `log2FoldChange` and `padj` columns.
    #   decreasing:   sort direction for log2FoldChange (FALSE = most
    #                 negative first).
    #   outfile:      optional path; when given, the FULL sorted table (not
    #                 only the significant rows) is written there as TSV.
    #                 The parent directory is created if needed.
    #
    # Returns:
    #   Data frame of rows with non-missing padj < 0.05, in sorted order.
    res_df <- as.data.frame(deseq_result)
    sort_order <- order(res_df$log2FoldChange, decreasing = decreasing)
    res_df <- res_df[sort_order, , drop = FALSE]

    if (!is.null(outfile)) {
        # write.table() does not create directories; do it here so this
        # function does not depend on another cell having created them.
        dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE)
        write.table(res_df, file = outfile, sep = "\t", quote = FALSE)
    }

    # padj can be NA; treat those rows as non-significant.
    is_significant <- !is.na(res_df$padj) & res_df$padj < 0.05
    res_df[is_significant, , drop = FALSE]
}

In [None]:
# shL1CAM: first 10 significant hits, most negative log2 fold change first
# (genes down in shL1CAM). The full sorted table is written to the TSV.
head(
    get_hits(res_l1cam, FALSE, outfile = "results/deseq2/shL1CAM_deg.tsv"),
    n = 10
)

In [None]:
# shSHTN1: first 10 significant hits, most negative log2 fold change first
# (genes down in shSHTN1). The full sorted table is written to the TSV.
head(
    get_hits(res_shtn1, FALSE, outfile = "results/deseq2/shSHTN1_deg.tsv"),
    n = 10
)

In [None]:
# Look up the genes of interest in both comparisons.
# Aliases: CTGF = CCN2; CYR61 = CCN1.
genes_of_interest <- c("SHTN1", "L1CAM", "CCN2", "CCN1", "YAP1")

res_shtn1[rownames(res_shtn1) %in% genes_of_interest, ]
message("")
res_l1cam[rownames(res_l1cam) %in% genes_of_interest, ]

In [None]:
# Transformed expression values for the genes of interest, taken from the
# vst() output (`vsd`), one column per sample.
# NOTE(review): the original note describes these as log-normalized expression
# scaled by library size; see the DESeq2 vignette section linked below for the
# exact properties of the transformation.
# https://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extracting-transformed-values
assay(vsd)[rownames(vsd) %in% genes_of_interest, ]