Paired TAM (transient abnormal myeloproliferation) vs AML (acute myeloid leukemia) analysis of patients with the co-occurring condition of Down syndrome, using limma/voom on the IJC (inclusion junction count) values obtained from rMATS analysis.

Starting from single runs of the Kids First Workflow V4, a post-rMATS-single-run script, prepareSEfiles.sh, was run. It created a BED file of all the events for visualization in the UCSC Genome Browser, and a matrix of the single runs normalized to the non-redundant union of the files.  Associative arrays in an awk script provided a rapid way to transform the counts from each of the individual runs into a matrix that facilitates analysis.

The annotations obtained from the rMATS run provide the coordinates of each splicing event, the gene that the junctions come from, and the count of the reads that overlap the junctions.
 
We will use limma to analyze these junction counts in the same way that a gene-level analysis would be performed.


In [None]:
# Bootstrap BiocManager, which the following cells use to install packages.
# requireNamespace() only checks that the package is available; it does not
# attach it, which is all that is needed because every later call is written
# as BiocManager::install().
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager", repos = "https://cloud.r-project.org")
}


In [None]:
# Install the packages attached in the next cell, but only those that are
# missing, so that re-running the notebook does not reinstall packages in the
# middle of an analysis.
required_packages <- c("dplyr", "Glimma", "edgeR")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  BiocManager::install(missing_packages)
}

In [None]:
library(Glimma)
library(dplyr)
library(edgeR)

In [None]:
# Move into the data directory so that the bare file names passed to
# read.csv() below resolve. The path is relative to the current working
# directory, so running this cell a second time changes directory again.
setwd("../data/")


In [None]:
# Show the working directory that is now in effect.
getwd()


In [None]:
# Junction count matrix: one row per junction (row names taken from the "ID"
# column), one column per sample.
cts <- as.matrix(
  read.csv("SE.IJC.paired.TAM.AML.csv", row.names = "ID")
)

In [None]:
# Per-junction annotation table, keyed by the same "ID" column.
featureData <- read.csv("SE.coordinates.matrix.csv", row.names = "ID")

In [None]:
# Peek at the first two annotation rows before trimming the columns.
head(featureData,2)

In [None]:
# Keep only the first two annotation columns; column 2 is the one looked up
# later in the notebook as the gene symbol for a junction ID.
featureData <- featureData[,c(1,2)]

In [None]:
# Confirm the trimmed table.
head(featureData,2)

In [None]:
# Size and first rows of the junction count matrix.
dim(cts)
head(cts,2)

In [None]:
# Size and first rows of the annotation table, for comparison with cts.
dim(featureData)
head(featureData,2)

In [None]:
# Sample sheet: one row per sample, with the first CSV column used as the row
# names (sample IDs).
coldata <- read.csv("design_matrix.csv",row.names=1)

In [None]:
# Display the full sample sheet.
coldata


In [None]:
# Reduce the sample sheet to the patient and condition columns and store both
# of them as factors.
coldata <- coldata[, c("patient", "condition")]
coldata[] <- lapply(coldata, factor)

In [None]:
# Sample IDs as they appear in the sample sheet.
rownames(coldata)

In [None]:
# read.csv() by default rewrites column names into syntactic R names, which
# turns every "-" in the count-matrix sample columns into ".". Apply the same
# change to the sample IDs here. gsub() with fixed = TRUE replaces ALL hyphens;
# sub() replaced only the first one, so an ID containing several hyphens could
# never match colnames(cts).
rownames(coldata) <- gsub("-", ".", rownames(coldata), fixed = TRUE)

In [None]:
# Sample columns of the count matrix.
colnames(cts)

In [None]:
# TRUE only when every sample ID in coldata appears among the count-matrix
# columns. This checks membership only; it does not check that the two are in
# the same order.
all(rownames(coldata) %in% colnames(cts))

In [None]:
colnames(cts)

In [None]:
# Condition label for each of the eight sample columns, in column order:
# TAM and AML alternate.
grouping_variable <- rep(c("TAM", "AML"), times = 4)
grouping_variable

In [None]:
# Split the count matrix by condition, keeping both a matrix and a data-frame
# copy of each half, then list the sample columns that ended up in each.
TAM_group <- cts[, grouping_variable %in% "TAM"]
AML_group <- cts[, grouping_variable %in% "AML"]
TAM_group_df <- data.frame(TAM_group)
AML_group_df <- data.frame(AML_group)
colnames(TAM_group)
colnames(AML_group)

In [None]:
# Re-check the size and first rows of the full count matrix.
dim(cts)
head(cts,4)

In [None]:
# Per-junction mean count within each condition (missing values ignored),
# followed by a quick look at the values and their number.
TAM_rowmeans <- rowMeans(TAM_group_df, na.rm = TRUE)
AML_rowmeans <- rowMeans(AML_group_df, na.rm = TRUE)
head(TAM_rowmeans, 3)
length(TAM_rowmeans)
head(AML_rowmeans, 3)
length(AML_rowmeans)

In [None]:
# matrixStats supplies rowSds(). Install it only when it is missing, and name
# the CRAN mirror explicitly (as the BiocManager bootstrap cell does) so the
# call also works in a session where no default mirror has been chosen.
if (!requireNamespace("matrixStats", quietly = TRUE)) {
  install.packages("matrixStats", repos = "https://cloud.r-project.org")
}
library(matrixStats)

In [None]:
# Per-junction standard deviation of the counts within each condition.
TAM_rowsds <- rowSds(as.matrix(TAM_group_df))
AML_rowsds <- rowSds(as.matrix(AML_group_df))
length(TAM_rowsds)
length(AML_rowsds)

In [None]:
# Flag the junctions whose TAM counts ALL lie within one standard deviation of
# the TAM mean.
# abs() has to wrap the deviation itself. The previous form,
# abs((x - mean) <= sd), applied abs() to the result of the comparison, so it
# only tested (x - mean) <= sd and any sample far BELOW the mean always passed.
# Working on the whole matrix also removes the assumption that there are
# exactly four TAM columns.
# TAM_rowmeans and TAM_rowsds hold one value per row, so they recycle down each
# column of TAM_group and line up with its rows.
TAM_deviation <- abs(TAM_group - TAM_rowmeans)
TAM_withinsds <- rowSums(!(TAM_deviation <= TAM_rowsds)) == 0
is.logical(TAM_withinsds)
length(TAM_withinsds)
dim(TAM_withinsds)
head(TAM_withinsds)
sum(TAM_withinsds == TRUE)

In [None]:
# Flag the junctions whose AML counts ALL lie within one standard deviation of
# the AML mean.
# abs() has to wrap the deviation itself. The previous form,
# abs((x - mean) <= sd), applied abs() to the result of the comparison, so it
# only tested (x - mean) <= sd and any sample far BELOW the mean always passed.
# Working on the whole matrix also removes the assumption that there are
# exactly four AML columns.
# AML_rowmeans and AML_rowsds hold one value per row, so they recycle down each
# column of AML_group and line up with its rows.
AML_deviation <- abs(AML_group - AML_rowmeans)
AML_withinsds <- rowSums(!(AML_deviation <= AML_rowsds)) == 0
is.logical(AML_withinsds)
length(AML_withinsds)
dim(AML_withinsds)
head(AML_withinsds)
sum(AML_withinsds == TRUE)

In [None]:
# A junction is kept only when it passed the within-one-SD check in both
# conditions. The remaining lines report the type, size and number of TRUEs.
filter_cts_logical <- TAM_withinsds & AML_withinsds
is.logical(filter_cts_logical)
length(filter_cts_logical)
dim(filter_cts_logical)
head(filter_cts_logical)
sum(filter_cts_logical)

In [None]:
# Apply the row filter to the count matrix, then report the size of the result
# and the largest count overall, in the odd-numbered columns (the TAM samples
# according to grouping_variable) and in the even-numbered columns (AML).
head(cts, 2)
filtered_cts <- cts[filter_cts_logical, ]
dim(filtered_cts)
max(filtered_cts)
max(filtered_cts[, seq(1, 7, by = 2)])
max(filtered_cts[, seq(2, 8, by = 2)])

In [None]:
# lets look at limma/voom
# Install limma only when it is not already available, so that re-running the
# notebook does not reinstall it.
if (!requireNamespace("limma", quietly = TRUE)) {
  BiocManager::install("limma")
}

In [None]:
# Install statmod only when it is not already available, so that re-running
# the notebook does not reinstall it.
if (!requireNamespace("statmod", quietly = TRUE)) {
  BiocManager::install("statmod")
}

In [None]:
library(limma)
library(edgeR)
library(statmod)

In [None]:
# making a counts matrix
# Wrap the counts in edgeR DGEList objects: `dge` holds every junction,
# `filtered_dge` only the junctions that passed the within-one-SD filter.
dge <- DGEList(counts=cts)
filtered_dge <- DGEList(counts=filtered_cts)

In [None]:
# Sample names carried by the DGEList.
colnames(dge)

In [None]:
# First rows of each DGEList.
head(dge,2)
head(filtered_dge)

In [None]:
# Group-means design: no intercept and one column per condition, so each
# coefficient is the mean of that condition rather than a difference between
# conditions.
# The grouping is taken from grouping_variable (the per-column condition labels
# defined earlier) instead of a second hard-coded 1/2 vector, so the two
# codings cannot drift apart. Fixing the level order keeps column 1 = TAM and
# column 2 = AML.
# NOTE(review): the patient factor held in coldata is not part of this design,
# so the samples are modelled as unpaired - confirm that this is intended.
design <- model.matrix(~ 0 + factor(grouping_variable, levels = c("TAM", "AML")))
colnames(design) <- c("TAM", "AML")

In [None]:
# Display the design matrix.
design

In [None]:
# normalize and filter
# filterByExpr() returns one logical per row saying whether the junction has
# enough counts to be kept, given the design.
keep          <-filterByExpr(dge, design)
filtered_keep <-filterByExpr(filtered_dge, design)

In [None]:
is.logical(keep)
is.logical(filtered_keep)

In [None]:
# Drop the low-count rows; keep.lib.size=FALSE makes the library sizes be
# recomputed from the rows that remain.
dge          <- dge         [keep,,keep.lib.size=FALSE]
filtered_dge <- filtered_dge[filtered_keep,,keep.lib.size=FALSE]

In [None]:
# apply scale normalization
# calcNormFactors() is called with its default method on both objects.
dge          <- calcNormFactors(dge)
filtered_dge <- calcNormFactors(filtered_dge)

In [None]:
# MDS Plot - can we separate the samples well?
logCPM <- cpm(dge, log = TRUE, prior.count = 3)
# One colour per sample, derived from the condition labels (TAM red, AML
# black). The previous rep(c("red", "black"), 3) supplied only 6 colours for
# the 8 samples and relied on recycling to colour the last two.
# NOTE(review): the labels come from coldata in its own row order; only
# membership, not order, was checked against colnames(cts) - confirm they line
# up with the columns.
sample_colours <- ifelse(grouping_variable == "TAM", "red", "black")
plotMDS(logCPM, labels = coldata$condition, top = 10, col = sample_colours)

In [None]:
# MDS Plot - can we separate the samples well?
filteredlogCPM <- cpm(filtered_dge, log = TRUE, prior.count = 3)
# One colour per sample, derived from the condition labels (TAM red, AML
# black). The previous rep(c("red", "black"), 3) supplied only 6 colours for
# the 8 samples and relied on recycling to colour the last two.
# NOTE(review): the labels come from coldata in its own row order; only
# membership, not order, was checked against colnames(cts) - confirm they line
# up with the columns.
sample_colours <- ifelse(grouping_variable == "TAM", "red", "black")
plotMDS(filteredlogCPM, labels = coldata$condition, top = 10, col = sample_colours)

In [None]:
# Fit the group-means model, then test the AML - TAM difference explicitly.
# The design has no intercept, so its coefficients are the TAM mean and the
# AML mean. Selecting coef = ncol(design) therefore picked the "AML" column:
# it tested whether the AML mean log-CPM differs from zero and reported that
# mean as "logFC". Comparing the two conditions requires a contrast.
fit <- lmFit(logCPM, design)
aml_vs_tam <- makeContrasts(AMLvsTAM = AML - TAM, levels = design)
fit <- contrasts.fit(fit, aml_vs_tam)
fit <- eBayes(fit, trend = TRUE)
# logFC in this table is now the log2 fold change of AML relative to TAM.
de_results <- topTable(fit, coef = "AMLvsTAM", number = Inf)
lookup <- rownames(de_results)
# Column 2 of featureData holds the gene annotation for each junction ID.
length(featureData[lookup, 2])
head(featureData[lookup, 2])

In [None]:
# Fit the group-means model on the SD-filtered junctions, then test the
# AML - TAM difference explicitly.
# The design has no intercept, so its coefficients are the TAM mean and the
# AML mean. Selecting coef = ncol(design) therefore picked the "AML" column:
# it tested whether the AML mean log-CPM differs from zero and reported that
# mean as "logFC". Comparing the two conditions requires a contrast.
filtered_fit <- lmFit(filteredlogCPM, design)
filtered_aml_vs_tam <- makeContrasts(AMLvsTAM = AML - TAM, levels = design)
filtered_fit <- contrasts.fit(filtered_fit, filtered_aml_vs_tam)
filtered_fit <- eBayes(filtered_fit, trend = TRUE)
# logFC in this table is now the log2 fold change of AML relative to TAM, so
# any cut-off applied to it is a log2 fold-change cut-off (8 means 256-fold).
filtered_de_results <- topTable(filtered_fit, coef = "AMLvsTAM", number = Inf)
filtered_lookup <- rownames(filtered_de_results)
# Column 2 of featureData holds the gene annotation for each junction ID.
length(featureData[filtered_lookup, 2])
head(featureData[filtered_lookup, 2])

In [None]:
library("pheatmap")
# Column annotations (condition and patient) shown above the heatmap.
df <- as.data.frame(coldata[, c("condition", "patient")])
# Counts of every tested junction, ordered as in filtered_de_results.
filtered_dge_expression <- filtered_dge[filtered_lookup, ]
# cluster_rows was misspelled as cluster_rows5. That name is not a pheatmap
# argument, so the setting was never actually passed; rows were clustered only
# because TRUE is also the default.
out <- pheatmap(
  filtered_dge_expression,
  cluster_rows = TRUE,
  show_rownames = FALSE,
  cluster_cols = TRUE,
  annotation_col = df,
  scale = "row",
  clustering_distance_cols = "minkowski",
  clustering_distance_rows = "minkowski"
)

In [None]:
# Cut-offs applied to filtered_de_results (the topTable output for the
# SD-filtered junctions).
fold_change_threshold <- 8
adjusted_pvalue_threshold <- 0.05

# Keep the rows whose adjusted p-value is below its cut-off and whose absolute
# logFC is above its cut-off, then report how many remain.
significant_filtered_genes <- filtered_de_results[
  filtered_de_results[["adj.P.Val"]] < adjusted_pvalue_threshold &
    abs(filtered_de_results[["logFC"]]) > fold_change_threshold,
]
dim(significant_filtered_genes)

In [None]:
# filtered_lookup is overwritten here: from this point on it holds only the
# IDs of the junctions that passed both cut-offs.
filtered_lookup <- rownames(significant_filtered_genes)
# Counts for just those junctions.
significant_filtered_expression <- filtered_dge[filtered_lookup,]

In [None]:
# Heatmap of the junctions that passed both cut-offs, clustered with Ward's
# method.
# cluster_rows was misspelled as cluster_rows5. That name is not a pheatmap
# argument, so the setting was never actually passed; rows were clustered only
# because TRUE is also the default.
significant_filtered_out <- pheatmap(
  significant_filtered_expression,
  cluster_rows = TRUE,
  show_rownames = FALSE,
  cluster_cols = TRUE,
  annotation_col = df,
  scale = "row",
  clustering_method = "ward.D2",
  clustering_distance_cols = "minkowski",
  clustering_distance_rows = "minkowski"
)


In [None]:
# weighting
# Pass the design explicitly, as the voomWithQualityWeights() call in the next
# cell does. filtered_dge was created without group information, so without a
# design voom() estimates its mean-variance trend from an intercept-only model.
# normalize.method is also spelled out in full; it was previously reached only
# through partial matching of `normalize`.
v <- voom(filtered_dge, design = design, plot = TRUE, normalize.method = "quantile")

In [None]:
# voom transformation combined with sample-specific quality weights, using the
# design matrix and quantile normalization; plot=TRUE draws the diagnostic
# plots.
vwts <- voomWithQualityWeights(filtered_dge, design=design, normalize.method="quantile", plot=TRUE)

In [None]:
# Fit the group-means model on the voom output.
vwtsfit <- lmFit(vwts, design, weights = vwts$weights)
# no other weighting at this time.
 #* c(1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0))
# The design has no intercept, so the fitted coefficients are the TAM mean and
# the AML mean. The later cells select coef = ncol(design), i.e. the second
# coefficient, which on the raw fit is the AML mean itself - they would test
# whether that mean is non-zero rather than compare AML with TAM.
# Replace the coefficients by between-condition contrasts. Two contrasts are
# kept, in this order, so that the second coefficient selected downstream is
# AMLvsTAM (the log2 fold change of AML relative to TAM).
condition_contrasts <- makeContrasts(
  TAMvsAML = TAM - AML,
  AMLvsTAM = AML - TAM,
  levels = design
)
vwtsfit <- contrasts.fit(vwtsfit, condition_contrasts)


In [None]:
# List the components stored in the fitted object.
summary(vwtsfit)

In [None]:
# Moderate the statistics, then use treat() to test each coefficient against a
# minimum log2 fold change of log2(1.2) rather than against zero.
vwtsfit <- eBayes (vwtsfit)
vwtsfit <- treat(vwtsfit, lfc=log2(1.2))
# coef=ncol(design) selects the second coefficient of the fit; the default
# number of rows is shown, sorted by logFC.
topTreat(vwtsfit, coef=ncol(design),sort.by="logFC")

In [None]:
# Same table for every junction (n=Inf).
de_vwts_results <- topTreat(vwtsfit, coef=ncol(design), n=Inf, sort.by="logFC")

In [None]:
# Number of junctions whose logFC is at or above each cut-off. Only the
# positive direction is counted (no abs()).
sum(de_vwts_results[,"logFC"] >= 1.5)
sum(de_vwts_results[,"logFC"] >= 2.0)
sum(de_vwts_results[,"logFC"] >= 4.0)
sum(de_vwts_results[,"logFC"] >= 8.0)
sum(de_vwts_results[,"logFC"] >= 9.0)

In [None]:
# Cut-offs applied to de_vwts_results (the topTreat output of the weighted
# voom fit).
fold_change_threshold <- 9
adjusted_pvalue_threshold <- 0.05

# Keep the rows whose adjusted p-value is below its cut-off and whose absolute
# logFC is above its cut-off.
significant_genes <- de_vwts_results[
  de_vwts_results[["adj.P.Val"]] < adjusted_pvalue_threshold &
    abs(de_vwts_results[["logFC"]]) > fold_change_threshold,
]


In [None]:
# How many junctions passed both cut-offs.
dim(significant_genes)

In [None]:
# The E component of the voom output: the transformed expression matrix.
transformed_expression <- vwts$E

In [None]:
dim(transformed_expression)

In [None]:
# Transformed expression restricted to the significant junctions, in the row
# order of significant_genes.
significant_transformed_expression <- transformed_expression[rownames(significant_genes),]
dim(significant_transformed_expression)

In [None]:
# Gene annotation (featureData column 2) of the first five significant
# junctions.
featureData[head(rownames(significant_transformed_expression),5),2]

In [None]:
# Gene annotation of every significant junction, as a one-column matrix.
string_gene_list <- as.matrix(featureData[rownames(significant_genes),2])
length(string_gene_list)
#string_gene_list

In [None]:
# Counts for the significant junctions, taken from `dge`.
# NOTE(review): significant_genes comes from filtered_dge, whereas dge was
# filtered separately by its own filterByExpr() call - confirm that every
# significant junction ID is still present in dge.
top_significant_genes <- dge[rownames(significant_genes),]

In [None]:
dim(top_significant_genes)

In [None]:
# First five rows and their gene annotations.
head(top_significant_genes,5)
featureData[rownames(head(top_significant_genes,5)),2]

In [None]:
head(vwts,5)

In [None]:
library("pheatmap")
# Column annotations (condition and patient) for the heatmaps below.
df <- as.data.frame(coldata[,c("condition","patient")])

In [None]:
# Heatmap of the voom-transformed expression of the significant junctions.
# The returned object (`out`) carries the row and column dendrograms that the
# following cells use.
# cluster_rows was misspelled as cluster_rows5. That name is not a pheatmap
# argument, so the setting was never actually passed; rows were clustered only
# because TRUE is also the default.
out <- pheatmap(
  significant_transformed_expression,
  cluster_rows = TRUE,
  show_rownames = FALSE,
  cluster_cols = TRUE,
  annotation_col = df,
  scale = "row",
  clustering_distance_cols = "minkowski",
  clustering_distance_rows = "minkowski"
)

In [None]:
# top_genes_expression is not defined anywhere in this notebook, so this cell
# failed on a clean run. top_significant_genes is the count object built from
# the same significant junction IDs, in the same row order as the matrix that
# was clustered into `out`.
# NOTE(review): confirm this is the object that was originally meant.
# First three rows in heatmap (dendrogram) order.
head(top_significant_genes[out$tree_row$order,],3)

In [None]:
# First ten rows in heatmap order (top_significant_genes replaces the
# undefined top_genes_expression; NOTE(review): confirm).
top_significant_genes[out$tree_row$order[1:10],]

In [None]:
# Export a slice of the clustered junctions: a heatmap with row names, the
# counts, the gene.junction labels and the gene list.
start <- 1
# Never index past the last clustered row (the slice used to be hard-coded to
# end at row 441 regardless of how many rows the heatmap has).
stop <- min(441, length(out$tree_row$order))
date <- "2023Aug28"
# top_genes_expression is not defined anywhere in this notebook;
# top_significant_genes is the count object built from the same significant
# junction IDs, in the row order that was clustered into `out`.
# NOTE(review): confirm this is the object that was originally meant.
piece <- top_significant_genes[out$tree_row$order[start:stop], ]
fd <- data.frame(featureData[rownames(piece), ])
# Label each row as <gene>.<junction ID>.
genejunction <- paste(featureData[rownames(piece), 2], rownames(piece), sep = ".")
rownames(fd) <- genejunction
rownames(piece) <- genejunction
# cluster_rows was misspelled as cluster_rows5, which is not a pheatmap
# argument.
outpiece <- pheatmap(
  piece,
  cluster_rows = TRUE,
  show_rownames = TRUE,
  cluster_cols = TRUE,
  annotation_col = df,
  scale = "row",
  clustering_distance_cols = "minkowski",
  clustering_distance_rows = "minkowski"
)
# Output file names: <date>_<label>_<start>_<stop>.csv
file_suffix <- paste0("_", start, "_", stop, ".csv")
piece_filename <- paste0(date, "_string_SE_gene_junction", file_suffix)
string_filename <- paste0(date, "_string_SE_gene", file_suffix)
piece_exp_filename <- paste0(date, "_SE_expression", file_suffix)
# Reorder the columns so the odd-numbered samples (TAM) come first, followed
# by the even-numbered samples (AML). Subsetting keeps the row and column
# names, so they do not need to be reassigned.
piece_exp <- piece[, c(1, 3, 5, 7, 2, 4, 6, 8)]
write.csv(piece_exp$counts, piece_exp_filename, quote = FALSE)
write.csv(rownames(piece), piece_filename, quote = FALSE, row.names = FALSE)
write.csv(fd[, 2], string_filename, quote = FALSE, row.names = FALSE)

In [None]:
# List the components of the object returned by pheatmap().
summary(out)

In [None]:
# Number of gene annotations for the clustered junctions, in heatmap order.
# out$tree_row$order holds positions within the clustered matrix, not rows of
# featureData, so translate the positions into junction IDs (the dendrogram
# labels) before looking them up.
length(featureData[out$tree_row$labels[out$tree_row$order], 2])

In [None]:
# Save a heatmap returned by pheatmap() to a PDF file.
#
# Arguments:
#   x        object with a `gtable` component that grid can draw (as returned
#            by pheatmap())
#   filename path of the PDF file to write
#   width    page width passed to pdf() (default 7)
#   height   page height passed to pdf() (default 7)
#
# Returns the value of dev.off(), as before.
save_pheatmap_pdf <- function(x, filename, width = 7, height = 7) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  grDevices::pdf(filename, width = width, height = height)
  # Make sure the PDF device is closed even when drawing fails; otherwise the
  # device stays open and later plots are silently sent into this file.
  device_open <- TRUE
  on.exit(if (device_open) grDevices::dev.off(), add = TRUE)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  device_open <- FALSE
  grDevices::dev.off()
}
# Write the heatmap currently stored in `out` to a PDF in the working
# directory.
save_pheatmap_pdf(out, "top_genes_expression.pdf")

In [None]:
# Re-order the original data to match the ordering in the heatmap: rows
# (junctions) top-to-bottom and columns (samples) left-to-right.
# top_genes_expression is not defined anywhere in this notebook;
# top_significant_genes is the count object built from the same significant
# junction IDs, in the row order that was clustered into `out`.
# NOTE(review): confirm this is the object that was originally meant.
rn <- rownames(top_significant_genes[out$tree_row[["order"]], ])
cn <- colnames(top_significant_genes[, out$tree_col[["order"]]])

In [None]:
# To get a junction-to-cluster assignment, cut the row dendrogram of the
# heatmap into a pre-selected number of groups.
# 6 groups (the comment used to say 8, but the dendrogram was cut with k = 6)
n_clusters <- 6
# Named vector (junction ID -> cluster number), sorted by cluster number.
row_clusters <- sort(cutree(out$tree_row, k = n_clusters))
# One-column matrix with the junction IDs as row names. The nrows/ncols
# arguments previously passed here were ignored by as.matrix() and referred to
# top_genes_expression, which is not defined in this notebook.
clusters <- as.matrix(row_clusters)
genes_in_clusters <- featureData[rownames(clusters), 2]
genes_in_clusters.df <- data.frame(genes_in_clusters, clusters)
colnames(genes_in_clusters.df) <- c("geneSymbol", "cluster")
dim(genes_in_clusters.df)
# Write one gene list per cluster to string_list_cluster_<k>.csv.
for (cluster_id in seq_len(n_clusters)) {
  cluster_genes <- genes_in_clusters.df[
    genes_in_clusters.df$cluster == cluster_id,
  ]
  # Keep the per-cluster tables available under their previous names,
  # cluster_1_genes ... cluster_6_genes.
  assign(paste0("cluster_", cluster_id, "_genes"), cluster_genes)
  write.csv(
    cluster_genes$geneSymbol,
    paste0("string_list_cluster_", cluster_id, ".csv"),
    quote = FALSE,
    row.names = FALSE
  )
}


In [None]:
#to understand the difference between AML and TAM, we will use contrasts

In [None]:
# NOTE(review): no cell in this notebook creates fit2, so on a clean run it did
# not exist when eBayes() was called. It is rebuilt here from the voom output
# with the two contrasts whose names the later cells look up ("AMLvsTAM" and
# "TAMvsAML") - confirm that this matches the original intent.
condition_contrast_matrix <- makeContrasts(
  AMLvsTAM = AML - TAM,
  TAMvsAML = TAM - AML,
  levels = design
)
fit2 <- contrasts.fit(lmFit(vwts, design), condition_contrast_matrix)
fit2 <- eBayes(fit2)

In [None]:
# Top rows of the contrast fit, with Benjamini-Hochberg adjusted p-values.
topTable(fit2, adjust.method = "BH")

In [None]:
# The same table for every row of the fit, and its size.
de_results_fit2 <- topTable(fit2, adjust.method = "BH", number = Inf)
dim(de_results_fit2)

In [None]:
# Per-contrast test outcome for each row of the fit.
results2 <- decideTests(fit2)

To compare the two conditions, TAM and AML, categorically, the model is fit with one coefficient per condition (a group-means model).
The difference between the conditions is then tested using contrasts.

In [None]:
# Tabulate how many rows fall into each combination of the decideTests()
# outcomes held in results2.
contrasts_fit_venn_counts <- vennCounts(results2)

In [None]:
contrasts_fit_venn_counts

In [None]:
# First rows of the decideTests() result.
head(results2)

In [None]:
# de_results2 is not defined anywhere in this notebook; the table built from
# fit2 a few cells earlier is de_results_fit2.
# NOTE(review): confirm this is the table that was originally meant.
head(de_results_fit2)

In [None]:
# Keep the rows whose absolute value exceeds 1.5 in either contrast column.
# The column names are the contrast names, so this relies on fit2 having been
# built with contrasts called "AMLvsTAM" and "TAMvsAML".
filtered_de_results2 <- de_results_fit2[
  abs(de_results_fit2[, "AMLvsTAM"]) > 1.5 |
    abs(de_results_fit2[, "TAMvsAML"]) > 1.5,
]

In [None]:
filtered_de_results2