In [None]:
library(data.table)
library(dplyr)
library(readxl)
library(ggplot2)
library(limma)
library(EnhancedVolcano)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(pheatmap)
library(VennDiagram)
library(tidyr)

In [None]:
# Load the subsetted GENCODE annotation table and preview its first rows.
genecode.annot.subsetted <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/subsetted_gencode_annotation.txt"
)
head(genecode.annot.subsetted)

In [None]:
# ID <-> name lookup tables, one built from the "gene" rows and one from the
# "transcript" rows of the annotation.
gene.names.ids <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "gene",
  names(genecode.annot.subsetted) %in% c("gene_id", "gene_name")
]
trx.names.ids <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "transcript",
  names(genecode.annot.subsetted) %in% c("transcript_id", "transcript_name")
]

In [None]:
# Phenotype table; the column ltany_di is used below as the case/control variable.
samples.info <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/combined_pheno_withCT.csv"
)
dim(samples.info)

### Perform differential analysis on gene level (use the data after correction)

In [None]:
# Corrected gene-level logCPM matrix (input to the gene-level limma fit).
gene.count.corrected.final <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/gene.logCPM.corrected.final.matrix"
)

In [None]:
# Corrected transcript-level logCPM matrix (input to the transcript-level limma fit).
trx.count.corrected.final <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/trx.logCPM.corrected.final.matrix"
)

In [None]:
# Gene-level differential expression with limma-trend on the corrected logCPM matrix.
# Model: expression ~ ltany_di; the tested coefficient is the last design column.
design <- model.matrix(~ ltany_di, data = samples.info)

# lmFit() pairs design rows with expression columns purely by position, so a
# different sample order would silently test the wrong labels. The case/control
# subsetting further down treats the row names of samples.info and the column
# names of the expression matrix as the same sample IDs, so require that here.
if (!identical(rownames(design), colnames(gene.count.corrected.final))) {
  stop(
    "Sample order in samples.info does not match the columns of ",
    "gene.count.corrected.final",
    call. = FALSE
  )
}

fit <- lmFit(gene.count.corrected.final, design)
#fit <- treat(fit, lfc=log2(1.2), trend=TRUE)
fit <- eBayes(fit, robust = TRUE, trend = TRUE)
# topTable() lists the results for one coefficient;
# decideTests() can be used to look at all coefficients simultaneously.
# `number` is spelled out instead of relying on partial matching of `n`.
diff_gene_res <- topTable(fit, coef = ncol(design), number = Inf)
# Add the gene ID as a column so the gene names can be merged in.
diff_gene_res$gene_id <- rownames(diff_gene_res)
merged_diff_gene_res <- merge(diff_gene_res, gene.names.ids, by = "gene_id", all.x = TRUE)
# Order by adjusted p-value (most significant first).
merged_diff_gene_res <- merged_diff_gene_res[order(merged_diff_gene_res$adj.P.Val), ]
head(merged_diff_gene_res)
dim(merged_diff_gene_res)
summary(merged_diff_gene_res[["logFC"]])
summary(merged_diff_gene_res[["adj.P.Val"]])

In [None]:
# Export the full gene-level result table as a comma-separated supplementary table.
write.table(
  x = merged_diff_gene_res,
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/supp_table_DGE.csv",
  sep = ",",
  row.names = FALSE
)

In [None]:
# Volcano plot of the gene-level results, labelling the top significant
# up- and down-regulated genes.
options(repr.plot.width = 10, repr.plot.height = 6)
genes_up <- merged_diff_gene_res[merged_diff_gene_res$adj.P.Val <= 0.05 & merged_diff_gene_res$logFC > 0, ]
genes_down <- merged_diff_gene_res[merged_diff_gene_res$adj.P.Val <= 0.05 & merged_diff_gene_res$logFC < 0, ]
dim(genes_up)
dim(genes_down)
# Up to five labels per direction. head() does not pad with NA when fewer than
# five genes are significant (indexing with [1:5, ] does), and genes without an
# annotated name are dropped so that NA never reaches selectLab.
selected <- c(head(genes_up$gene_name, 5), head(genes_down$gene_name, 5))
selected <- selected[!is.na(selected)]
selected

# NOTE(review): the y-axis shows adjusted p-values while EnhancedVolcano's
# default axis title is left unchanged - confirm the label is as intended.
volcano_plot <- EnhancedVolcano(merged_diff_gene_res, lab = merged_diff_gene_res$gene_name,
                        x = "logFC",
                        y = "adj.P.Val",
                        xlim = c(min(merged_diff_gene_res[["logFC"]], na.rm = TRUE) - 1.0,
                                 max(merged_diff_gene_res[["logFC"]], na.rm = TRUE) + 1.0),
                        ylim = c(0, max(-log10(merged_diff_gene_res[["adj.P.Val"]]), na.rm = TRUE)),
                        pCutoff = 0.05, FCcutoff = 0, legendPosition = 'right', axisLabSize = 24, legendLabSize = 24, labSize = 6.0,
                        selectLab = selected, drawConnectors = TRUE)
# Remove gridlines and panel background.
volcano_plot <- volcano_plot + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank()
)
volcano_plot
# Save the plot in high resolution (e.g., 300 DPI)
#ggsave("/grehawi/splice-reg-prj/Figures/volcano_plot_genes.pdf", plot = volcano_plot, dpi = 300, width = 12, height = 5)

### Perform differential analysis on transcript level (use the data after correction)

In [None]:
# Transcript-level differential expression with limma-trend on the corrected logCPM matrix.
# Model: expression ~ ltany_di; the tested coefficient is the last design column.
design <- model.matrix(~ ltany_di, data = samples.info)

# lmFit() pairs design rows with expression columns purely by position, so a
# different sample order would silently test the wrong labels.
if (!identical(rownames(design), colnames(trx.count.corrected.final))) {
  stop(
    "Sample order in samples.info does not match the columns of ",
    "trx.count.corrected.final",
    call. = FALSE
  )
}

fit <- lmFit(trx.count.corrected.final, design)
#fit <- treat(fit, lfc=log2(1.2), trend=TRUE)
fit <- eBayes(fit, robust = TRUE, trend = TRUE)
# The fit comes from eBayes(), not treat(), so use topTable() exactly as in the
# gene-level analysis (topTreat() is meant for treat() fits).
# decideTests() can be used to look at all coefficients simultaneously.
diff_trx_res <- topTable(fit, coef = ncol(design), number = Inf)
# Add the transcript ID as a column so the transcript names can be merged in.
diff_trx_res$transcript_id <- rownames(diff_trx_res)
merged_diff_trx_res <- merge(diff_trx_res, trx.names.ids, by = "transcript_id", all.x = TRUE)
# Order by adjusted p-value (most significant first).
merged_diff_trx_res <- merged_diff_trx_res[order(merged_diff_trx_res$adj.P.Val), ]
head(merged_diff_trx_res)
dim(merged_diff_trx_res)
summary(merged_diff_trx_res[["logFC"]])
summary(merged_diff_trx_res[["adj.P.Val"]])

In [None]:
# Export the full transcript-level result table as a comma-separated supplementary table.
write.table(
  x = merged_diff_trx_res,
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/supp_table_DTE.csv",
  sep = ",",
  row.names = FALSE
)

In [None]:
# Volcano plot of the transcript-level results, labelling the top significant
# up- and down-regulated transcripts.
options(repr.plot.width = 14, repr.plot.height = 8)
trx_up <- merged_diff_trx_res[merged_diff_trx_res$adj.P.Val <= 0.05 & merged_diff_trx_res$logFC > 0, ]
trx_down <- merged_diff_trx_res[merged_diff_trx_res$adj.P.Val <= 0.05 & merged_diff_trx_res$logFC < 0, ]
dim(trx_up)
dim(trx_down)
# Up to five labels per direction. head() does not pad with NA when fewer than
# five transcripts are significant (indexing with [1:5, ] does), and transcripts
# without an annotated name are dropped so that NA never reaches selectLab.
selected <- c(head(trx_up$transcript_name, 5), head(trx_down$transcript_name, 5))
selected <- selected[!is.na(selected)]
selected

# NOTE(review): the y-axis shows adjusted p-values while EnhancedVolcano's
# default axis title is left unchanged - confirm the label is as intended.
volcano_plot <- EnhancedVolcano(merged_diff_trx_res, lab = merged_diff_trx_res$transcript_name,
                        x = "logFC",
                        y = "adj.P.Val",
                        xlim = c(min(merged_diff_trx_res[["logFC"]], na.rm = TRUE) - 1.0,
                                 max(merged_diff_trx_res[["logFC"]], na.rm = TRUE) + 1.0),
                        ylim = c(0, max(-log10(merged_diff_trx_res[["adj.P.Val"]]), na.rm = TRUE)),
                        pCutoff = 0.05, FCcutoff = 0, legendPosition = 'right', axisLabSize = 24, legendLabSize = 24, labSize = 6.0,
                        selectLab = selected, drawConnectors = TRUE)
# Remove gridlines and panel background.
volcano_plot <- volcano_plot + theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank()
)
volcano_plot
# Save the plot in high resolution (e.g., 300 DPI)
#ggsave("/grehawi/splice-reg-prj/Figures/volcano_plot_trx.pdf", plot = volcano_plot, dpi = 300, width = 12, height = 5)

### Check if the dysregulated transcripts are different from the dysregulated genes

In [None]:
# Transcript-to-gene mapping table (columns transcript_id and gene_id are used below).
trx.genes <- read.table(
  file = "/grehawi/splice-reg-prj/data/transcriptsID-geneID.txt"
)
head(trx.genes)

In [None]:
# Mapping rows for every significantly up-regulated transcript.
genes_of_up_trx <- trx.genes[trx.genes$transcript_id %in% trx_up$transcript_id, ]
# Number of distinct parent genes.
length(unique(genes_of_up_trx$gene_id))
# Parent genes that are also up-regulated at the gene level ...
length(intersect(genes_of_up_trx$gene_id, genes_up$gene_id))
# ... and parent genes that are not.
length(setdiff(genes_of_up_trx$gene_id, genes_up$gene_id))

In [None]:
# Mapping rows for every significantly down-regulated transcript.
genes_of_down_trx <- trx.genes[trx.genes$transcript_id %in% trx_down$transcript_id, ]
# Number of distinct parent genes.
length(unique(genes_of_down_trx$gene_id))
# Parent genes that are also down-regulated at the gene level.
length(intersect(genes_of_down_trx$gene_id, genes_down$gene_id))

In [None]:
# Name vectors of the significant genes and transcripts, per direction.
genes_up_names <- genes_up[["gene_name"]]
genes_down_names <- genes_down[["gene_name"]]
#genesNames_of_up_trx = genecode.annot.subsetted$gene_name[genecode.annot.subsetted$gene_id %in% genes_of_up_trx$gene_id & genecode.annot.subsetted$type == "gene"]
#genesNames_of_down_trx = genecode.annot.subsetted$gene_name[genecode.annot.subsetted$gene_id %in% genes_of_down_trx$gene_id & genecode.annot.subsetted$type == "gene"]
trx_up_names <- trx_up[["transcript_name"]]
trx_down_names <- trx_down[["transcript_name"]]



In [None]:
# Persist the four name vectors as RDS files.
# NOTE(review): the gene file names are spelled "ganes"; they are kept as-is
# because other scripts may read these exact paths.
saveRDS(object = genes_up_names, file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/up_reg_ganes.rds")
saveRDS(object = genes_down_names, file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/down_reg_ganes.rds")
saveRDS(object = trx_up_names, file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/up_reg_trx.rds")
saveRDS(object = trx_down_names, file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/down_reg_trx.rds")

In [None]:
# Tables of up- and down-regulated genes and transcripts with names and IDs
# (for transcripts also the parent gene name and ID).
# These tables are used for enrichment analysis with FUMA.
#
# Rows are selected by gene_id / transcript_id rather than by name: names are
# not guaranteed to be unique or non-missing in the annotation, so matching on
# names can pull in annotation rows for features that were never significant
# (e.g. every unnamed feature once a single NA name is in the hit list).
# The IDs in genes_up / trx_up come from the same annotation (they were merged
# with gene.names.ids / trx.names.ids by ID above).
genes_up_df <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "gene" &
    genecode.annot.subsetted$gene_id %in% genes_up$gene_id,
  colnames(genecode.annot.subsetted) %in% c("gene_name", "gene_id")
]

genes_down_df <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "gene" &
    genecode.annot.subsetted$gene_id %in% genes_down$gene_id,
  colnames(genecode.annot.subsetted) %in% c("gene_name", "gene_id")
]

trx_up_df <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "transcript" &
    genecode.annot.subsetted$transcript_id %in% trx_up$transcript_id,
  colnames(genecode.annot.subsetted) %in% c("transcript_name", "transcript_id", "gene_id", "gene_name")
]

trx_down_df <- genecode.annot.subsetted[
  genecode.annot.subsetted$type == "transcript" &
    genecode.annot.subsetted$transcript_id %in% trx_down$transcript_id,
  colnames(genecode.annot.subsetted) %in% c("transcript_name", "transcript_id", "gene_id", "gene_name")
]


In [None]:
# Number of distinct gene names across all four dysregulated sets.
length(Reduce(union, list(
  genes_up_df$gene_name,
  genes_down_df$gene_name,
  trx_up_df$gene_name,
  trx_down_df$gene_name
)))

In [None]:
# Number of up-regulated transcripts whose parent gene is not itself up-regulated.
up_trx_only_gene_ids <- setdiff(genes_of_up_trx$gene_id, genes_up$gene_id)
length(unique(trx_up_df[trx_up_df$gene_id %in% up_trx_only_gene_ids, "transcript_id"]))
# Same for down-regulated transcripts.
down_trx_only_gene_ids <- setdiff(genes_of_down_trx$gene_id, genes_down$gene_id)
length(unique(trx_down_df[trx_down_df$gene_id %in% down_trx_only_gene_ids, "transcript_id"]))

In [None]:
# Write the four FUMA input tables (write.table defaults: space-separated, with row names).
write.table(x = genes_up_df, file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_up_df.txt')
write.table(x = genes_down_df, file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_down_df.txt')
write.table(x = trx_up_df, file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/trx_up_df.txt')
write.table(x = trx_down_df, file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/trx_down_df.txt')

In [None]:
# Venn diagram: up-regulated genes vs parent genes of up-regulated transcripts.
venn.diagram(
  x = list(genes_up_df$gene_name, trx_up_df$gene_name),
  category.names = c('+Genes', '+Transcripts'),
  filename = '/grehawi/splice-reg-prj/Figures/venn_up_reg_gene_trx.svg',
  imagetype = "svg",
  output = TRUE,
  # Circle outline, label sizes and fill
  lwd = 2, cex = 12, cat.cex = 12, lty = 'blank',
  fill = c("#818689", "#818689"),
  # Canvas size
  height = 32, width = 32
)

In [None]:
# Venn diagram: down-regulated genes vs parent genes of down-regulated transcripts.
venn.diagram(
  x = list(genes_down_df$gene_name, trx_down_df$gene_name),
  category.names = c('-Genes', '-Transcripts'),
  filename = '/grehawi/splice-reg-prj/Figures/venn_down_reg_gene_trx.svg',
  imagetype = "svg",
  output = TRUE,
  # Circle outline, label sizes and fill
  lwd = 2, cex = 12, cat.cex = 12, lty = 'blank',
  fill = c("#818689", "#818689"),
  # Canvas size
  height = 32, width = 32
)

### Intersection with Wittenberg et al

(https://www.biologicalpsychiatryjournal.com/article/S0006-3223(20)31591-2/fulltext#app-1)

In [None]:
# Published down-regulated gene list (Wittenberg et al.); the file has a header row.
wittenberg_down_genes <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_down_MDD_wittenberg_etal.txt',
  header = TRUE
)
head(wittenberg_down_genes)
dim(wittenberg_down_genes)

# Published up-regulated gene list from the same study.
wittenberg_up_genes <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_up_MDD_wittenberg_etal.txt',
  header = TRUE
)
head(wittenberg_up_genes)
dim(wittenberg_up_genes)

In [None]:
# Overlap of the published lists with our gene-level hits (same direction).
intersect(wittenberg_down_genes$Genes, genes_down_df$gene_name)
intersect(wittenberg_up_genes$Genes, genes_up_df$gene_name)

# Overlap of the published lists with the parent genes of our transcript-level hits.
intersect(wittenberg_down_genes$Genes, trx_down_df$gene_name)
intersect(wittenberg_up_genes$Genes, trx_up_df$gene_name)

In [None]:
# Show the dysregulated transcripts of PTPN4 (down) and CAMKK2 (up).
trx_down_df[trx_down_df[["gene_name"]] == 'PTPN4', ]
trx_up_df[trx_up_df[["gene_name"]] == 'CAMKK2', ]

### Enrichment analysis for dysregulated genes and transcripts using clusterProfiler

Using FUMA, we only saw enrichment for up-regulated genes and up-regulated transcripts in GO-BP (Gene Ontology Biological Process).

In [None]:
# Background (universe) genes: the same set used for network inference,
# created in 03_prepare_input_for_netInf.R.
background_genes <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/ARACNE/gene_names_ids_table.txt"
)
head(background_genes)
dim(background_genes)

In [None]:
# Gene symbols of the four dysregulated sets.
gene_set1 <- genes_up_df$gene_name    # up-regulated genes
gene_set2 <- trx_up_df$gene_name      # parent genes of up-regulated transcripts
gene_set3 <- genes_down_df$gene_name  # down-regulated genes
gene_set4 <- trx_down_df$gene_name    # parent genes of down-regulated transcripts

# GO Biological Process over-representation against the shared background.
# Both cutoffs are 1, so every tested term is returned; BH-adjusted p-values
# are reported. Extra arguments in `...` are forwarded to enrichGO().
run_go_bp_enrichment <- function(symbols, ...) {
  enrichGO(
    gene = symbols,
    OrgDb = org.Hs.eg.db,
    keyType = "SYMBOL",
    ont = "BP",
    universe = background_genes$gene_name,
    pAdjustMethod = "BH",
    pvalueCutoff = 1,
    qvalueCutoff = 1,
    ...
  )
}

go_enrichment_1 <- run_go_bp_enrichment(gene_set1)
go_enrichment_2 <- run_go_bp_enrichment(gene_set2)
# NOTE(review): only this set lowers minGSSize to 3; the other three use the
# enrichGO() default - confirm this asymmetry is intended.
go_enrichment_3 <- run_go_bp_enrichment(gene_set3, minGSSize = 3)
go_enrichment_4 <- run_go_bp_enrichment(gene_set4)



In [None]:
# Reduce redundancy among GO terms with simplify(): among terms whose semantic
# similarity exceeds the cutoff (0.7), the one with the smallest adjusted
# p-value is kept.
go_enrichment_1 <- simplify(go_enrichment_1, cutoff = 0.7, by = "p.adjust", select_fun = min)
go_enrichment_2 <- simplify(go_enrichment_2, cutoff = 0.7, by = "p.adjust", select_fun = min)
go_enrichment_3 <- simplify(go_enrichment_3, cutoff = 0.7, by = "p.adjust", select_fun = min)
go_enrichment_4 <- simplify(go_enrichment_4, cutoff = 0.7, by = "p.adjust", select_fun = min)

In [None]:
# Convert each enrichment result to a plain data frame.
go_df_1 <- as.data.frame(go_enrichment_1)
go_df_2 <- as.data.frame(go_enrichment_2)
go_df_3 <- as.data.frame(go_enrichment_3)
go_df_4 <- as.data.frame(go_enrichment_4)

# Tag every row with the set it came from.
go_df_1$set <- "+Genes"
go_df_2$set <- "+Transcripts"
go_df_3$set <- "-Genes"
go_df_4$set <- "-Transcripts"

# Stack the four tables into one.
combined_go_df <- do.call(rbind, list(go_df_1, go_df_2, go_df_3, go_df_4))
head(combined_go_df)
dim(combined_go_df)

In [None]:
# Export the combined GO enrichment table (comma-separated, no row names).
write.table(
  x = combined_go_df,
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/Suppl_table_clusterProfiler_diff_gene_trx.txt",
  sep = ',',
  row.names = FALSE
)

In [None]:
# Dot plot of GO enrichment across the four sets.
options(repr.plot.width = 16, repr.plot.height = 18)

# Up to `n` most significant GO terms of one set. head() is used instead of
# [1:n, ] so that a set with fewer than `n` terms does not get padded with
# all-NA rows.
top_go_terms <- function(go_df, n = 10) {
  head(go_df[order(go_df$p.adjust, decreasing = FALSE), ], n)
}

go_df_1_sub <- top_go_terms(go_df_1)
go_df_2_sub <- top_go_terms(go_df_2)
go_df_3_sub <- top_go_terms(go_df_3)
go_df_4_sub <- top_go_terms(go_df_4)

combined_go_df_sub <- rbind(go_df_1_sub, go_df_2_sub, go_df_3_sub, go_df_4_sub)

# For every selected GO term, also fetch its row from the other sets so that
# each term is shown for all sets in which it was tested.
combined_go_df_sub_go_ids <- combined_go_df_sub$ID
go_df_1_sub_extended <- go_df_1[go_df_1$ID %in% combined_go_df_sub_go_ids, ]
go_df_2_sub_extended <- go_df_2[go_df_2$ID %in% combined_go_df_sub_go_ids, ]
go_df_3_sub_extended <- go_df_3[go_df_3$ID %in% combined_go_df_sub_go_ids, ]
go_df_4_sub_extended <- go_df_4[go_df_4$ID %in% combined_go_df_sub_go_ids, ]

combined_go_df_sub_extended <- rbind(go_df_1_sub_extended, go_df_2_sub_extended, go_df_3_sub_extended, go_df_4_sub_extended)

# Point size encodes RichFactor, colour encodes -log10 of the adjusted p-value.
ggplot(combined_go_df_sub_extended, aes(x = set, y = Description, size = RichFactor, color = 0 - log10(p.adjust))) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "GO Enrichment for Gene Sets",
       x = "Gene Set",
       y = "GO Term",
       size = "Rich Factor",
       color = "-log10 Adjusted p-value") +
  theme(axis.text = element_text(size = 18),
        axis.title.x = element_text(size = 22),
        axis.title.y = element_text(size = 22),
        legend.text = element_text(size = 20),
        legend.title = element_text(size = 22))



In [None]:
# Save the most recently displayed plot (ggsave's default) as PDF.
ggsave(
  filename = "/grehawi/splice-reg-prj/Figures/dotplot_go_enrch.pdf",
  width = 18,
  height = 18
)

### MAGMA of dysregulated genes/transcripts

In [None]:
# Gene location file used by MAGMA (NCBI37.3.gene.loc); it maps gene names to
# Entrez IDs. The file has no header, so columns 1 and 6 are named explicitly.
NCBI37_gene_loc <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/MAGMA/NCBI37.3/NCBI37.3.gene.loc'
)
names(NCBI37_gene_loc)[c(1, 6)] <- c("entrez_id", "gene_name")
head(NCBI37_gene_loc)

In [None]:
# Attach Entrez IDs to each dysregulated table by gene name (left join:
# rows without a match keep NA in entrez_id).
entrez_lookup_cols <- colnames(NCBI37_gene_loc) %in% c('entrez_id', 'gene_name')
genes_up_df_tmp <- merge(genes_up_df, NCBI37_gene_loc[, entrez_lookup_cols], by = "gene_name", all.x = TRUE)
genes_down_df_tmp <- merge(genes_down_df, NCBI37_gene_loc[, entrez_lookup_cols], by = "gene_name", all.x = TRUE)
trx_up_df_tmp <- merge(trx_up_df, NCBI37_gene_loc[, entrez_lookup_cols], by = "gene_name", all.x = TRUE)
trx_down_df_tmp <- merge(trx_down_df, NCBI37_gene_loc[, entrez_lookup_cols], by = "gene_name", all.x = TRUE)

In [None]:
# How many rows could not be mapped to an Entrez ID, per table.
sum(is.na(genes_up_df_tmp$entrez_id))
sum(is.na(genes_down_df_tmp$entrez_id))
sum(is.na(trx_up_df_tmp$entrez_id))
sum(is.na(trx_down_df_tmp$entrez_id))
# Keep only the rows that have an Entrez ID.
genes_up_df_tmp <- genes_up_df_tmp[!is.na(genes_up_df_tmp$entrez_id), , drop = FALSE]
genes_down_df_tmp <- genes_down_df_tmp[!is.na(genes_down_df_tmp$entrez_id), , drop = FALSE]
trx_up_df_tmp <- trx_up_df_tmp[!is.na(trx_up_df_tmp$entrez_id), , drop = FALSE]
trx_down_df_tmp <- trx_down_df_tmp[!is.na(trx_down_df_tmp$entrez_id), , drop = FALSE]



In [None]:
# Build the set annotation table for the final MAGMA step: one row per
# Entrez ID, labelled 'Genes' (gene-level hits, up then down) or
# 'Transcripts' (parent genes of transcript-level hits, up then down).
set_annot_file_diff_expr <- data.frame(
  c(rep('Genes', times = nrow(genes_up_df_tmp) + nrow(genes_down_df_tmp)),
    rep('Transcripts', times = nrow(trx_up_df_tmp) + nrow(trx_down_df_tmp))),
  c(genes_up_df_tmp$entrez_id, genes_down_df_tmp$entrez_id,
    trx_up_df_tmp$entrez_id, trx_down_df_tmp$entrez_id)
)
# Remove the auto-generated column names.
names(set_annot_file_diff_expr) <- NULL

In [None]:
# Write the set annotation table, space-separated.
# NOTE(review): write.table defaults are in effect (row names, a header line
# and quoted strings are written) - confirm MAGMA is invoked with matching
# column settings.
write.table(
  x = set_annot_file_diff_expr,
  file = '/grehawi/splice-reg-prj/new-data/MAGMA/set_annot_file_diff_expr.txt',
  sep = ' '
)

### Visualise MAGMA results as heatmap

In [None]:
# Read the MAGMA gene-set analysis results and show them as a dot plot.

options(repr.plot.width = 12, repr.plot.height = 6)
# MAGMA result files. list.files() takes a regular expression, not a glob:
# the dots are escaped and the match is anchored to the end of the file name,
# so only files really ending in ".gsa.out" are picked up.
files <- list.files(
  path = "/grehawi/splice-reg-prj/new-data/MAGMA/output/gene_set_analysis_output",
  pattern = "\\.gsa\\.out$",
  full.names = TRUE
)
# Fail early with a clear message instead of a cryptic plotting error later.
if (length(files) == 0) {
  stop("No MAGMA '.gsa.out' result files found", call. = FALSE)
}

# One annotated result table per phenotype.
result_list <- list()

for (gsa_file in files) {
  # Read one MAGMA result file (it has a header row).
  gsa <- read.table(gsa_file, header = TRUE)

  # The phenotype label is the part of the file name before the first "_".
  # Files sharing that prefix overwrite each other in result_list.
  phenotype <- strsplit(basename(gsa_file), split = "_")[[1]][1]

  gsa$TYPE <- phenotype
  # Effect size magnitude and direction, derived from the BETA column.
  gsa$abs_beta <- abs(gsa$BETA)
  gsa$beta_direction <- ifelse(gsa$BETA >= 0, "Positive", "Negative")
  # BH adjustment of the P column, done separately within each file.
  gsa$adjP_val <- p.adjust(gsa$P, method = "BH")
  gsa$`-log10P` <- 0 - log10(gsa$adjP_val)

  result_list[[phenotype]] <- gsa
}

# Stack the per-phenotype tables row-wise.
combined_data <- do.call(rbind, result_list)

# Point size = |beta|, fill = -log10 adjusted p-value, shape = sign of beta.
ggplot(combined_data, aes(x = VARIABLE, y = TYPE)) +
  geom_point(aes(size = abs_beta, fill = `-log10P`, shape = beta_direction)) +
  scale_shape_manual(values = c("Negative" = 25, "Positive" = 21)) +
  scale_fill_gradient(low = "blue", high = "red") + xlab("") + ylab("") +
  theme_bw() +
  theme(legend.text = element_text(size = 18),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 22),
        axis.title = element_text(size = 22))

In [None]:
# Save the most recently displayed plot (ggsave's default) as PNG.
ggsave(
  filename = "/grehawi/splice-reg-prj/Figures/dotplot_MAGMA_res.png",
  width = 10,
  height = 8
)

In [None]:
# Export the combined MAGMA results. The target is a .csv file, so write it
# comma-separated like the other supplementary tables (the write.table default
# separator is a space).
write.table(
  combined_data,
  '/grehawi/splice-reg-prj/new-data/Diff-Analysis/supp_table_MAGMA_dysreg.csv',
  sep = ",",
  row.names = FALSE
)

### Visualize boxplots of dysregulated gene vs transcript expression  
The following is used for Supplementary Figure 2.

In [None]:
# Candidates: up-regulated genes that also have an up-regulated transcript.
up_trx_with_up_gene <- trx_up_df$gene_name %in% genes_up_df$gene_name
unique(trx_up_df[up_trx_with_up_gene, 'gene_name']) #--> ITGB3 (Autism)

In [None]:
# Up-regulated transcripts of ITGB3, restricted to genes that are also
# up-regulated at the gene level.
trx_up_df[trx_up_df$gene_name == 'ITGB3' & trx_up_df$gene_name %in% genes_up_df$gene_name, ]

In [None]:
# Candidates: up-regulated transcripts whose parent gene is not differentially
# expressed in either direction --> NR3C1
trx_up_df[!(trx_up_df$gene_name %in% genes_up_df$gene_name | trx_up_df$gene_name %in% genes_down_df$gene_name), ]
trx_up_df[trx_up_df[["gene_name"]] == 'NR3C1', ]

In [None]:
# Expression of ITGB3 and NR3C1 (genes and their transcripts) in cases vs controls.
# Cases have ltany_di == 1, controls ltany_di == 0; sample IDs are the row names
# of samples.info and the column names of the expression matrices.
case_sample_ids <- rownames(samples.info[samples.info$ltany_di == 1, ])
control_sample_ids <- rownames(samples.info[samples.info$ltany_di == 0, ])
selected_gene_ids <- c("ENSG00000259207", "ENSG00000113580")

# Gene-level expression
selected_gene_rows <- rownames(gene.count.corrected.final) %in% selected_gene_ids
ITGB3_and_NR3C1_expr_cases <- gene.count.corrected.final[
  selected_gene_rows,
  colnames(gene.count.corrected.final) %in% case_sample_ids
]
ITGB3_and_NR3C1_expr_controls <- gene.count.corrected.final[
  selected_gene_rows,
  colnames(gene.count.corrected.final) %in% control_sample_ids
]

# All transcripts of ITGB3 and NR3C1
trx_of_ITGB3_NR3C1 <- trx.genes[trx.genes$gene_id %in% selected_gene_ids, ]
# Transcript-level expression. The sample columns are matched against the
# transcript matrix itself; the column names of the gene matrix were used here
# before, which only works if both matrices share the exact same column order.
selected_trx_rows <- rownames(trx.count.corrected.final) %in% trx_of_ITGB3_NR3C1$transcript_id
trx_of_ITGB3_NR3C1_expr_cases <- trx.count.corrected.final[
  selected_trx_rows,
  colnames(trx.count.corrected.final) %in% case_sample_ids
]
trx_of_ITGB3_NR3C1_expr_controls <- trx.count.corrected.final[
  selected_trx_rows,
  colnames(trx.count.corrected.final) %in% control_sample_ids
]

In [None]:
# Display the transcript expression table of the controls (transcripts in rows at this point).
trx_of_ITGB3_NR3C1_expr_controls

In [None]:
# Reshape the gene-level expression from wide to long format.

# Transpose a features-x-samples table to one row per sample, move the sample
# IDs into a `samples` column and tag every row with `status_label`.
samples_as_rows <- function(expr_wide, status_label) {
  per_sample <- as.data.frame(t(expr_wide))
  per_sample$samples <- rownames(per_sample)
  per_sample$status <- status_label
  rownames(per_sample) <- NULL
  per_sample
}

ITGB3_and_NR3C1_expr_cases <- samples_as_rows(ITGB3_and_NR3C1_expr_cases, 'Affected Individuals')
ITGB3_and_NR3C1_expr_controls <- samples_as_rows(ITGB3_and_NR3C1_expr_controls, 'Unaffected Individuals')

# Stack cases and controls, then gather the two gene columns into ID/Expression pairs.
rbind_ITGB3_and_NR3C1_expr_case_control <- rbind(ITGB3_and_NR3C1_expr_cases, ITGB3_and_NR3C1_expr_controls)
rbind_ITGB3_and_NR3C1_expr_case_control_long <- pivot_longer(
  rbind_ITGB3_and_NR3C1_expr_case_control,
  cols = c('ENSG00000113580', 'ENSG00000259207'),
  names_to = "ID",
  values_to = "Expression"
)
head(rbind_ITGB3_and_NR3C1_expr_case_control_long)



In [None]:
# Reshape the transcript-level expression from wide to long format.
#combine_selected_cases = rbind(ITGB3_expr_cases, ITGB3_trxs_expr_cases)

# Transpose a features-x-samples table to one row per sample, move the sample
# IDs into a `samples` column and tag every row with `status_label`.
samples_as_rows <- function(expr_wide, status_label) {
  per_sample <- as.data.frame(t(expr_wide))
  per_sample$samples <- rownames(per_sample)
  per_sample$status <- status_label
  rownames(per_sample) <- NULL
  per_sample
}

trx_of_ITGB3_NR3C1_expr_cases <- samples_as_rows(trx_of_ITGB3_NR3C1_expr_cases, 'Affected Individuals')
trx_of_ITGB3_NR3C1_expr_controls <- samples_as_rows(trx_of_ITGB3_NR3C1_expr_controls, 'Unaffected Individuals')

# Stack cases and controls, then gather the four listed transcript columns into
# ID/Expression pairs; any other transcript columns stay as they are.
rbind_trx_of_ITGB3_NR3C1_expr_case_control <- rbind(trx_of_ITGB3_NR3C1_expr_cases, trx_of_ITGB3_NR3C1_expr_controls)
rbind_trx_of_ITGB3_NR3C1_expr_case_control_long <- pivot_longer(
  rbind_trx_of_ITGB3_NR3C1_expr_case_control,
  cols = c('ENST00000394464', 'ENST00000424646', 'ENST00000503201', 'ENST00000559488'),
  names_to = "ID",
  values_to = "Expression"
)
head(rbind_trx_of_ITGB3_NR3C1_expr_case_control_long)



In [None]:
# Box plots of expression in affected vs unaffected individuals, one PDF for
# the two genes and one for their transcripts.
options(repr.plot.width = 12, repr.plot.height = 6)

# Case/control box plot for a long table with columns ID, Expression and status.
# The final look is identical to the previous code: theme_classic() replaces
# any earlier complete theme and the x-axis title is empty.
case_control_boxplot <- function(expr_long) {
  ggplot(expr_long, aes(x = ID, y = Expression, fill = status)) +
    geom_boxplot() +
    labs(x = "", y = "Expression", fill = "Status") +
    theme_classic() +
    theme(axis.text = element_text(size = 22),
          axis.title.x = element_text(size = 22),
          axis.text.x = element_text(angle = 45, hjust = 1),
          axis.title.y = element_text(size = 22),
          legend.text = element_text(size = 24),
          legend.title = element_text(size = 24)) +
    scale_fill_manual(values = c("Affected Individuals" = "#F0CF7F", "Unaffected Individuals" = "#5DB9B5"))
}

# ggsave() renders the plot object explicitly, as done for the other figures in
# this notebook. A ggplot placed between pdf() and dev.off() is only drawn if it
# happens to be auto-printed, which can leave an empty PDF. par(mfrow = ...) has
# no effect on ggplot2 output and is therefore not set.
ggsave(
  '/grehawi/splice-reg-prj/Figures/expression_diff_expr_ITGB3_NR3C1_genes.pdf',
  plot = case_control_boxplot(rbind_ITGB3_and_NR3C1_expr_case_control_long),
  width = 12, height = 10
)
ggsave(
  '/grehawi/splice-reg-prj/Figures/expression_diff_expr_ITGB3_NR3C1_trx.pdf',
  plot = case_control_boxplot(rbind_trx_of_ITGB3_NR3C1_expr_case_control_long),
  width = 18, height = 10
)