In [None]:
library(xlsx)
library(ggplot2)
library(rtracklayer)
library(dplyr)
library(tidyr)
library(data.table)
library(matrixStats)
library(igraph)
library(purrr)
library(stringr)
library(VennDiagram)
library(TIN)
library(pheatmap)
library(UpSetR)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)

# Set the `repr.plot.width` / `repr.plot.height` options used for notebook
# plot sizing (defaults: width 4, height 5).
psize <- function(x = 4, y = 5) {
  options(repr.plot.width = x, repr.plot.height = y)
}

### Functions

In [None]:
# Strip the first '-' and everything after it from each element of `x`
# (e.g. "HNRNPH1-227" becomes "HNRNPH1"); elements without '-' are unchanged.
# NOTE(review): names containing several dashes are cut at the FIRST dash --
# confirm this is intended for gene symbols that themselves contain '-'.
remove_after_dash <- function(x) {
  sub(pattern = "-.*$", replacement = "", x = x)
}

In [None]:
remove_double_edges <- function(df) {
    # Remove duplicated edges regardless of the order in which the row's
    # values appear (A-B and B-A count as the same edge).
    #
    # df: edge table; every column of a row contributes to the row's key.
    # Returns `df` restricted to the first occurrence of each key.
    #
    # The sorted row values are joined with a separator (ASCII unit separator)
    # so that distinct rows such as ("AB", "C") and ("A", "BC") no longer
    # collapse to the same key, which happened when joining with ''.
    edge_keys = apply(df, 1, function(x) paste(sort(x), collapse = "\x1f"))
    net.single.edges = df[!duplicated(edge_keys), ]

    return(net.single.edges)
}

In [None]:
get_igraph_obj <- function(df) {
  # Build an undirected igraph object from an edge table
  # (e.g. a data table of results from kimono).
  #
  # df: table with columns `node1`, `node2` and `value`.
  # Returns the graph; `value` is carried along as an edge attribute.
  edge_table <- data.frame(
    from = df$node1,
    to = df$node2,
    value = df$value
  )
  # Every node that appears at either end of an edge becomes a vertex.
  vertex_names <- unique(c(df$node1, df$node2))
  graph_from_data_frame(edge_table, directed = FALSE, vertices = vertex_names)
}

In [None]:
get_betweenness <- function(graph_obj){
  # Compute the betweenness of every vertex of an igraph object.
  #
  # graph_obj: igraph graph.
  # Returns a data.table with columns `node` (vertex name) and `betweenness`,
  # sorted by decreasing betweenness.
  #
  # TRUE is spelled out instead of the reassignable shorthand `T`.
  # weights = NA asks igraph not to use edge weights.
  x <- betweenness(graph_obj, directed = TRUE, weights = NA)

  # t(t(.)) turns the named vector into a one-column matrix whose row names
  # are the vertex names; setDT(keep.rownames = TRUE) moves them to column `rn`.
  y <- as.data.frame(t(t(sort(x, decreasing = TRUE)))) %>%
    dplyr::rename(betweenness = V1) %>%
    setDT(., keep.rownames = TRUE) %>%
    dplyr::rename(node = rn)
  return(y)
}

In [None]:
get_degree <- function(graph_obj){
    # Compute the degree of every vertex of an igraph object (loops excluded).
    #
    # graph_obj: igraph graph.
    # Returns a data.table with columns `node` (vertex name) and `degree`,
    # sorted by decreasing degree.
    #
    # TRUE is spelled out instead of the reassignable shorthand `T`.
    x = degree(graph_obj, v = V(graph_obj), loops = FALSE)

    # t(t(.)) turns the named vector into a one-column matrix whose row names
    # are the vertex names; setDT(keep.rownames = TRUE) moves them to column `rn`.
    y <- as.data.frame(t(t(sort(x, decreasing = TRUE)))) %>%
        dplyr::rename(degree = V1) %>%
        setDT(., keep.rownames = TRUE) %>%
        dplyr::rename(node = rn)
    return(y)
}

### Read required files

In [None]:
# Load the subsetted gencode annotation table and preview its first rows.
genecode.annot.subsetted <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/subsetted_gencode_annotation.txt"
)
head(genecode.annot.subsetted)

In [None]:
# Load the transcript-ID to gene-ID mapping table and preview its first rows.
trx.genes <- read.table(
  file = "/grehawi/splice-reg-prj/data/transcriptsID-geneID.txt"
)
head(trx.genes)

In [None]:
# Load the gene names/IDs lookup table; show its first rows and dimensions.
genes.ids.names <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/ARACNE/gene_names_ids_table.txt"
)
head(genes.ids.names)
dim(genes.ids.names)

In [None]:
# Load the transcript names/IDs lookup table; show its first rows and dimensions.
tx.ids.names <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/ARACNE/trxs_names_ids_table.txt"
)
head(tx.ids.names)
dim(tx.ids.names)

In [None]:
# Load the combined gene/transcript IDs-to-names map; show its first rows
# and dimensions.
genes.trxs.ids.names.map <- read.table(
  file = "/grehawi/splice-reg-prj/new-data/ARACNE/genes_trxs_ids_names_map.txt"
)
head(genes.trxs.ids.names.map)
dim(genes.trxs.ids.names.map)

In [None]:
# Load the saved up-/down-regulated gene and transcript sets produced by the
# differential analysis (one RDS file per set).
genes_up_names <- readRDS(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/up_reg_ganes.rds"
)
genes_down_names <- readRDS(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/down_reg_ganes.rds"
)
trx_up_names <- readRDS(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/up_reg_trx.rds"
)
trx_down_names <- readRDS(
  file = "/grehawi/splice-reg-prj/new-data/Diff-Analysis/down_reg_trx.rds"
)

In [None]:
# Pool the four up-/down-regulated gene and transcript sets into one vector
# and report its length.
all_disreg_genes_trx <- c(
  genes_up_names,
  genes_down_names,
  trx_up_names,
  trx_down_names
)
length(all_disreg_genes_trx)

In [None]:
#library(readxl)
# Source 1 -- splicing factors from SpliceAid: 67 unique splicing factors
sf.targets.links <- fread(
  "/grehawi/splice-reg-prj/data/splicing-factors.txt",
  sep = "\t"
)

# Source 2 -- the `splicingFactors` data set: a comprehensive collection of
# 277 unique genes involved in pre-mRNA splicing events
# (Sveen et al., Genome Medicine, 2011, 3:32).
data(splicingFactors)
# Keep the gene symbols as plain character strings.
splicingFactors$GeneSymbol <- as.character(splicingFactors$GeneSymbol)

# Source 3 -- 406 splicing factors from Seiler et al 2018:
# https://pubmed.ncbi.nlm.nih.gov/29617667/
SF.Seiler <- fread("/grehawi/splice-reg-prj/data/SF-Seileretal2018.csv")
head(SF.Seiler)
length(unique(SF.Seiler$GeneSymbol))

# Source 4 -- another resource of SF, Anna et al:
# https://www.biorxiv.org/content/10.1101/2020.05.20.107375v1.full
Anna.SF <- c(
  "SRSF1", "SRSF2", "SRSF3", "SRSF5", "SRSF7", "HNRNPA2B1", "HNRNPL",
  "HNRNPLL", "RBFOX2", "RBFOX3", "FUS", "SNRNP70", "TRA2A",
  "TRA2B", "TIA1", "PTBP1", "PTBP2", "RBM10", "RBM5"
)

# Unique gene symbols of the Seiler et al. list.
SF.Seiler.df <- as.data.frame(SF.Seiler)
SF.alone.seiler <- unique(SF.Seiler.df$GeneSymbol)
length(SF.alone.seiler)

# Union of all four sources, de-duplicated.
all.SF <- unique(c(
  sf.targets.links$Gene,
  splicingFactors$GeneSymbol,
  SF.alone.seiler,
  Anna.SF
))
length(all.SF)

In [None]:
# Load the TFLink TF-target interaction table (tab-separated), then inspect it:
# first rows, dimensions, dimensions restricted to complete rows, and the
# number of distinct targets and distinct TFs.
tf.gene.links <- fread(
  "/grehawi/splice-reg-prj/data/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv",
  sep = "\t"
)
head(tf.gene.links)
dim(tf.gene.links)
dim(tf.gene.links[complete.cases(tf.gene.links), ])
length(unique(tf.gene.links$Name.Target))
length(unique(tf.gene.links$Name.TF))

In [None]:
# Combined set of known regulators: TF names from the TF-target table plus all
# collected splicing factors. The union is de-duplicated so that a name present
# in both sources is counted once in the reported length.
all_known_factors <- unique(c(tf.gene.links$Name.TF, all.SF))
length(all_known_factors)

## 1. Read processed networks

In [None]:
# Unthresholded networks (single-edge tables with node names): cases, controls.
annotated.filtered.cases.withNames.singleE <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_cases_withNames_singleEdges.txt'
)
annotated.filtered.controls.withNames.singleE <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_controls_withNames_singleEdges.txt'
)


In [None]:
# Thresholded networks (single-edge tables with node names): cases, controls.
cases.annotated.filtered.single.edges.thr <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_thr_cases_withNames_singleEdges.txt'
)
controls.annotated.filtered.single.edges.thr <- read.table(
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_thr_controls_withNames_singleEdges.txt'
)


In [None]:
# Preview the first rows of the thresholded controls network edge table.
head(controls.annotated.filtered.single.edges.thr)

In [None]:
# Count the edges per `edgeType` in the thresholded controls network, then in
# the thresholded cases network.
table(controls.annotated.filtered.single.edges.thr$edgeType)
table(cases.annotated.filtered.single.edges.thr$edgeType)

In [None]:
# Distinct nodes (appearing as node1 or node2) remaining in each network
# after threshold filtering, followed by their counts.
nodes.controls <- unique(c(
  controls.annotated.filtered.single.edges.thr$node1,
  controls.annotated.filtered.single.edges.thr$node2
))
nodes.cases <- unique(c(
  cases.annotated.filtered.single.edges.thr$node1,
  cases.annotated.filtered.single.edges.thr$node2
))

length(nodes.controls)
length(nodes.cases)

In [None]:
# Number of gene nodes vs. transcript (trx) nodes in the 2 networks.
# Node names are matched against the gene-name table (genes) and the
# transcript-name table (transcripts); output order is:
# controls genes, controls transcripts, cases genes, cases transcripts.
length(intersect(nodes.controls, genes.ids.names$gene_name))
length(intersect(nodes.controls, tx.ids.names$name))
length(intersect(nodes.cases, genes.ids.names$gene_name))
length(intersect(nodes.cases, tx.ids.names$name))

In [None]:
# Node-type composition (Gene vs IR) of the two networks, shown as a grouped
# bar plot.
# NOTE(review): the counts below are hard-coded -- presumably copied from the
# gene/transcript node counts computed earlier; confirm they are still current.
control_network_nodeType_distr <- data.frame(
  NodeType = c('Gene', 'IR'),
  Count = c(3514, 4758),
  Network = rep('Unaffected Individuals Network', 2)
)
cases_network_nodeType_distr <- data.frame(
  NodeType = c('Gene', 'IR'),
  Count = c(4115, 3881),
  Network = rep('Affected Individuals Network', 2)
)
networks_nodeType_distr <- rbind(
  cases_network_nodeType_distr,
  control_network_nodeType_distr
)

ggplot(networks_nodeType_distr, aes(fill = NodeType, y = Count, x = Network)) +
  geom_bar(position = "dodge", stat = "identity") +
  xlab("") +
  theme_classic() +
  theme(
    axis.text = element_text(size = 22),
    axis.title.x = element_text(size = 22),
    axis.title.y = element_text(size = 22),
    legend.text = element_text(size = 24),
    legend.title = element_text(size = 24)
  ) +
  scale_fill_manual(values = c('#E7872B', '#3A68AE'))
#ggsave("/grehawi/splice-reg-prj/Figures/barplot_networks_nodeType.pdf", width = 14, height = 6)

In [None]:
# Reshape the node-type counts into a NodeType x Network table
# (rows: Gene / IR; columns: Affected / Unaffected).
# - dplyr::rename is namespace-qualified, as in the helper functions, so a
#   `rename` from another attached package cannot be picked up by mistake.
# - The result is converted to a plain data.frame before row names are set,
#   because setting row names on a tibble is deprecated.
networks_nodeType_summary_table <- networks_nodeType_distr %>%
  pivot_wider(
    id_cols = NodeType,
    names_from = Network,
    values_from = Count,
    names_prefix = ""
  ) %>%
  dplyr::rename(
    Affected = 'Affected Individuals Network',
    Unaffected = 'Unaffected Individuals Network'
  ) %>%
  as.data.frame()
# Move the node type from a column into the row names, leaving only counts.
row.names(networks_nodeType_summary_table) <- networks_nodeType_summary_table$NodeType
networks_nodeType_summary_table$NodeType <- NULL
networks_nodeType_summary_table

In [None]:
# Chi-square test of independence on the NodeType x Network count table:
# is the node-type distribution similar in the 2 networks?
# The test result is stored in `chisq` and then displayed.
chisq <- chisq.test(networks_nodeType_summary_table)
chisq

In [None]:
# Number of nodes that exist in the cases network but not in the controls
# network (first value) and vice versa (second value).
length(setdiff(nodes.cases, nodes.controls))
length(setdiff(nodes.controls, nodes.cases))

In [None]:
# Nodes present only in the cases network, as a vector and as a one-column
# data frame.
unique_cases <- setdiff(nodes.cases, nodes.controls)
nodes_unique_cases <- data.frame(unique_cases)

# Nodes present only in the controls network, same two forms.
unique_controls <- setdiff(nodes.controls, nodes.cases)
nodes_unique_controls <- data.frame(unique_controls)

## Hub nodes analysis (after threshold filtering)

In [None]:
# Build one graph per thresholded network and compute the per-node degree
# tables, then summarise both degree distributions.
g.controls <- get_igraph_obj(controls.annotated.filtered.single.edges.thr)
g.cases <- get_igraph_obj(cases.annotated.filtered.single.edges.thr)

g.controls.degrees <- get_degree(g.controls)
g.cases.degrees <- get_degree(g.cases)

summary(g.controls.degrees$degree)
summary(g.cases.degrees$degree)

### Analysis of hubs (degree >= 10) in the context of PDs

In [None]:
# Hub nodes = nodes with degree >= 10, selected per network.
hubs.with.at.least.degree.10.controls <-
  g.controls.degrees$node[g.controls.degrees$degree >= 10]
hubs.with.at.least.degree.10.cases <-
  g.cases.degrees$node[g.cases.degrees$degree >= 10]
length(hubs.with.at.least.degree.10.controls)
length(hubs.with.at.least.degree.10.cases)

# Hubs shared by both networks.
hubs_intersect <- intersect(
  hubs.with.at.least.degree.10.controls,
  hubs.with.at.least.degree.10.cases
)
length(hubs_intersect)

In [None]:
# Display the hub nodes shared by the cases and controls networks.
hubs_intersect

In [None]:
# Save the shared hub nodes to disk.
# NOTE(review): write.table is called with its default separator, so the file
# is space-delimited despite the .csv extension -- confirm readers expect that.
hubs_intersect_df <- as.data.frame(hubs_intersect)
write.table(
  hubs_intersect_df,
  "/grehawi/splice-reg-prj/new-data/ARACNE/hubs_intersect.csv",
  row.names = FALSE
)

In [None]:
# Supplementary tables: every edge of each thresholded network that touches
# at least one of the shared hub nodes (as node1 or node2).
common_hubs_cases_connections <- cases.annotated.filtered.single.edges.thr[
  cases.annotated.filtered.single.edges.thr$node1 %in% hubs_intersect |
    cases.annotated.filtered.single.edges.thr$node2 %in% hubs_intersect,
]
common_hubs_controls_connections <- controls.annotated.filtered.single.edges.thr[
  controls.annotated.filtered.single.edges.thr$node1 %in% hubs_intersect |
    controls.annotated.filtered.single.edges.thr$node2 %in% hubs_intersect,
]

write.table(
  common_hubs_cases_connections,
  "/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_comon_hubs_cases_connections.csv",
  row.names = FALSE
)
write.table(
  common_hubs_controls_connections,
  "/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_comon_hubs_controls_connections.csv",
  row.names = FALSE
)


In [None]:
# Hubs (degree >= 10) found only in the cases network; saved to disk.
hubs_cases_unique <- setdiff(
  hubs.with.at.least.degree.10.cases,
  hubs.with.at.least.degree.10.controls
)
write.table(
  hubs_cases_unique,
  "/grehawi/splice-reg-prj/new-data/ARACNE/hubs_unique_cases.csv",
  row.names = FALSE
)

In [None]:
# Hubs (degree >= 10) found only in the controls network; saved to disk.
hubs_controls_unique <- setdiff(
  hubs.with.at.least.degree.10.controls,
  hubs.with.at.least.degree.10.cases
)
write.table(
  hubs_controls_unique,
  "/grehawi/splice-reg-prj/new-data/ARACNE/hubs_unique_controls.csv",
  row.names = FALSE
)

In [None]:
# Supplementary tables: hub nodes (degree >= 10) with their degree in each
# network, annotated via the gene/transcript IDs-to-names map (join on the
# node name), then written to disk.
suppl_table_hubs_cases <- g.cases.degrees[g.cases.degrees$degree >= 10]
suppl_table_hubs_cases <- left_join(
  suppl_table_hubs_cases,
  genes.trxs.ids.names.map,
  by = c("node" = "name")
)

suppl_table_hubs_controls <- g.controls.degrees[g.controls.degrees$degree >= 10]
suppl_table_hubs_controls <- left_join(
  suppl_table_hubs_controls,
  genes.trxs.ids.names.map,
  by = c("node" = "name")
)

write.table(
  suppl_table_hubs_cases,
  "/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_hubs_cases.csv",
  row.names = FALSE
)
write.table(
  suppl_table_hubs_controls,
  "/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_hubs_controls.csv",
  row.names = FALSE
)

In [None]:
# Display the annotated hub table of the cases network.
suppl_table_hubs_cases

In [None]:
# Venn diagram of the degree >= 10 hub sets of the two networks, written as
# an SVG file.
venn.diagram(
  x = list(
    hubs.with.at.least.degree.10.cases,
    hubs.with.at.least.degree.10.controls
  ),
  category.names = c('#Hubs in the AIN', '#Hubs in the UIN'),
  # Output file
  filename = '/grehawi/splice-reg-prj/Figures/venn_hub_nodes_degree10.svg',
  imagetype = "svg",
  output = FALSE,
  height = 32,
  width = 32,
  # Circles
  lwd = 2,
  lty = 'blank',
  fill = c("#F0CF7F", "#5DB9B5"),
  # Text sizes
  cex = 12,
  cat.cex = 12
)

#### Check common and distinct connections of shared hub nodes (nodes with dg >= 10 in both networks)

In [None]:
# Edges touching at least one shared hub node, per thresholded network.
shared_hubs_connections_controls <- controls.annotated.filtered.single.edges.thr[
  controls.annotated.filtered.single.edges.thr$node1 %in% hubs_intersect |
    controls.annotated.filtered.single.edges.thr$node2 %in% hubs_intersect,
]
shared_hubs_connections_cases <- cases.annotated.filtered.single.edges.thr[
  cases.annotated.filtered.single.edges.thr$node1 %in% hubs_intersect |
    cases.annotated.filtered.single.edges.thr$node2 %in% hubs_intersect,
]

dim(shared_hubs_connections_controls)
dim(shared_hubs_connections_cases)

# Which of these connections (first two columns only) are unique to the
# controls network, then which are unique to the cases network.
# NOTE(review): rows are compared as ordered pairs, so the same edge stored as
# (A, B) in one table and (B, A) in the other would be reported -- confirm
# both tables use one orientation.
setdiff(
  shared_hubs_connections_controls[, 1:2],
  shared_hubs_connections_cases[, 1:2]
)
setdiff(
  shared_hubs_connections_cases[, 1:2],
  shared_hubs_connections_controls[, 1:2]
)

In [None]:
# Inspect the thresholded-network connections of the PD-related genes/trx in
# the shared hub set (results from DisGeNet in another notebook).
pds_related_hubs_intersect <- c('KCTD12', 'TNFRSF1B')
hubs_intersect_cases_connections <- cases.annotated.filtered.single.edges.thr[
  cases.annotated.filtered.single.edges.thr$node1 %in% pds_related_hubs_intersect |
    cases.annotated.filtered.single.edges.thr$node2 %in% pds_related_hubs_intersect,
]
hubs_intersect_controls_connections <- controls.annotated.filtered.single.edges.thr[
  controls.annotated.filtered.single.edges.thr$node1 %in% pds_related_hubs_intersect |
    controls.annotated.filtered.single.edges.thr$node2 %in% pds_related_hubs_intersect,
]

# Two panels side by side: cases, then controls.
options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
hubs_intersect_cases <- graph_from_data_frame(
  hubs_intersect_cases_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/cases_hubs_intersect_connections.svg", width = 10, height = 10)
plot(hubs_intersect_cases, vertex.color = '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
hubs_intersect_controls <- graph_from_data_frame(
  hubs_intersect_controls_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/controls_hubs_intersect_connections.svg", width = 10, height = 10)
plot(hubs_intersect_controls, vertex.color = '#E7872B', edge.width = 1.6)
#dev.off()

In [None]:
# Which of the nodes on the c('KCTD12', 'TNFRSF1B') edges are hubs, known TFs
# or known splicing factors in the cases network?
pds_hubs_connections_cases <- unique(c(
  hubs_intersect_cases_connections$node1,
  hubs_intersect_cases_connections$node2
))
intersect(pds_hubs_connections_cases, hubs.with.at.least.degree.10.cases)
#intersect(pds_hubs_connections_cases, all_disreg_genes_trx) -->NONE
intersect(pds_hubs_connections_cases, tf.gene.links$Name.TF)
intersect(pds_hubs_connections_cases, all.SF)

In [None]:
# Which of the nodes on the c('KCTD12', 'TNFRSF1B') edges are hubs or known
# TFs in the controls network?
pds_hubs_connections_controls <- unique(c(
  hubs_intersect_controls_connections$node1,
  hubs_intersect_controls_connections$node2
))
intersect(pds_hubs_connections_controls, hubs.with.at.least.degree.10.controls)
#intersect(pds_hubs_connections_controls, all_disreg_genes_trx) -->NONE
intersect(pds_hubs_connections_controls, tf.gene.links$Name.TF)
#intersect(pds_hubs_connections_controls, all.SF) -->NONE

In [None]:
# Same inspection as for the thresholded networks, but on the UNTHRESHOLDED
# networks: connections of the PD-related genes/trx in the shared hub set
# (results from DisGeNet in another notebook).
# Note: `hubs_intersect_cases` / `hubs_intersect_controls` first hold the edge
# subsets and are then overwritten by the graphs built from them.
pds_related_hubs_intersect <- c('KCTD12', 'TNFRSF1B')
hubs_intersect_cases <- annotated.filtered.cases.withNames.singleE[
  annotated.filtered.cases.withNames.singleE$node1 %in% pds_related_hubs_intersect |
    annotated.filtered.cases.withNames.singleE$node2 %in% pds_related_hubs_intersect,
]
hubs_intersect_controls <- annotated.filtered.controls.withNames.singleE[
  annotated.filtered.controls.withNames.singleE$node1 %in% pds_related_hubs_intersect |
    annotated.filtered.controls.withNames.singleE$node2 %in% pds_related_hubs_intersect,
]

# Two panels side by side: cases, then controls.
options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
hubs_intersect_cases <- graph_from_data_frame(hubs_intersect_cases, directed = FALSE)
#svg("/grehawi/splice-reg-prj/Figures/cases_hubs_intersect_connections_unThrsh_network.svg", width = 10, height = 10)
plot(hubs_intersect_cases, vertex.color = '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
hubs_intersect_controls <- graph_from_data_frame(hubs_intersect_controls, directed = FALSE)
#svg("/grehawi/splice-reg-prj/Figures/controls_hubs_intersect_connections_unThrsh_network.svg", width = 10, height = 10)
plot(hubs_intersect_controls, vertex.color = '#E7872B', edge.width = 1.6)
#dev.off()

In [None]:
# Display every edge involving 'TNFAIP2' (as node1 or node2) in the
# thresholded cases network ...
cases.annotated.filtered.single.edges.thr[cases.annotated.filtered.single.edges.thr$node1 == 'TNFAIP2' | cases.annotated.filtered.single.edges.thr$node2 == 'TNFAIP2', ]

# ... and in the thresholded controls network.
controls.annotated.filtered.single.edges.thr[controls.annotated.filtered.single.edges.thr$node1 == 'TNFAIP2' | controls.annotated.filtered.single.edges.thr$node2 == 'TNFAIP2', ]


### Analysis of hub IR nodes

In [None]:
# Stricter hub definition: nodes with degree >= 15, per network, and the hubs
# shared by both networks.
hubs.with.at.least.degree.15.controls <-
  g.controls.degrees$node[g.controls.degrees$degree >= 15]
hubs.with.at.least.degree.15.cases <-
  g.cases.degrees$node[g.cases.degrees$degree >= 15]
length(hubs.with.at.least.degree.15.controls)
length(hubs.with.at.least.degree.15.cases)

hubs_intersect_degree15 <- intersect(
  hubs.with.at.least.degree.15.controls,
  hubs.with.at.least.degree.15.cases
)
length(hubs_intersect_degree15)

In [None]:
# Strictest hub definition: nodes with degree >= 20, per network, and the hubs
# shared by both networks.
hubs.with.at.least.degree.20.controls <-
  g.controls.degrees$node[g.controls.degrees$degree >= 20]
hubs.with.at.least.degree.20.cases <-
  g.cases.degrees$node[g.cases.degrees$degree >= 20]
length(hubs.with.at.least.degree.20.controls)
length(hubs.with.at.least.degree.20.cases)

hubs_intersect_degree20 <- intersect(
  hubs.with.at.least.degree.20.controls,
  hubs.with.at.least.degree.20.cases
)
length(hubs_intersect_degree20)

#### Check % of trx vs genes in the set of hub nodes

In [None]:
# Break down the hub nodes (degree >= 10) of each network by node type
# (IR vs Gene) and save a grouped bar plot of the counts as a PDF.
# A node is labelled "IR" when its name is listed in tx.ids.names$name,
# otherwise "Gene".

# --- Controls network ---
hubs.with.at.least.degree.10.controls_df <- as.data.frame(hubs.with.at.least.degree.10.controls)
names(hubs.with.at.least.degree.10.controls_df)[1] <- "nodes"

hubs.with.at.least.degree.10.controls_df <- hubs.with.at.least.degree.10.controls_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.10.controls_table <- as.data.frame(table(hubs.with.at.least.degree.10.controls_df$nodeType))
names(hubs.with.at.least.degree.10.controls_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.10.controls_table)[2] <- "Frequency"
hubs.with.at.least.degree.10.controls_table$Network <- "Unaffected Individuals Network"
hubs.with.at.least.degree.10.controls_table

# --- Cases network ---
hubs.with.at.least.degree.10.cases_df <- as.data.frame(hubs.with.at.least.degree.10.cases)
names(hubs.with.at.least.degree.10.cases_df)[1] <- "nodes"

hubs.with.at.least.degree.10.cases_df <- hubs.with.at.least.degree.10.cases_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.10.cases_table <- as.data.frame(table(hubs.with.at.least.degree.10.cases_df$nodeType))
names(hubs.with.at.least.degree.10.cases_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.10.cases_table)[2] <- "Frequency"
hubs.with.at.least.degree.10.cases_table$Network <- "Affected Individuals Network"
hubs.with.at.least.degree.10.cases_table

hubs.with.at.least.degree.10.combined <- rbind(
  hubs.with.at.least.degree.10.controls_table,
  hubs.with.at.least.degree.10.cases_table
)

pdf("/grehawi/splice-reg-prj/Figures/barplot_hubs_dg10_nodeType.pdf", width = 12, height = 6)
# The plot is print()ed explicitly: a ggplot object is only drawn when printed,
# and auto-printing does not happen when the code is run via source() or from
# inside a function/loop, which would leave the PDF empty.
print(
  ggplot(hubs.with.at.least.degree.10.combined, aes(fill = NodeType, y = Frequency, x = Network)) +
    geom_bar(position = "dodge", stat = "identity") +
    theme_classic() +
    xlab("Hubs (dg >= 10)") +
    theme(
      axis.text = element_text(size = 17),
      axis.title.x = element_text(size = 20),
      axis.title.y = element_text(size = 20),
      legend.text = element_text(size = 22),
      legend.title = element_text(size = 22)
    ) +
    scale_fill_manual(values = c('#E7872B', '#3A68AE'))
)
dev.off()

In [None]:
# Break down the hub nodes (degree >= 15) of each network by node type
# (IR vs Gene) and save a grouped bar plot of the counts as a PDF.
# A node is labelled "IR" when its name is listed in tx.ids.names$name,
# otherwise "Gene".

# --- Controls network ---
hubs.with.at.least.degree.15.controls_df <- as.data.frame(hubs.with.at.least.degree.15.controls)
names(hubs.with.at.least.degree.15.controls_df)[1] <- "nodes"

hubs.with.at.least.degree.15.controls_df <- hubs.with.at.least.degree.15.controls_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.15.controls_table <- as.data.frame(table(hubs.with.at.least.degree.15.controls_df$nodeType))
names(hubs.with.at.least.degree.15.controls_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.15.controls_table)[2] <- "Frequency"
hubs.with.at.least.degree.15.controls_table$Network <- "Unaffected Individuals Network"
hubs.with.at.least.degree.15.controls_table

# --- Cases network ---
hubs.with.at.least.degree.15.cases_df <- as.data.frame(hubs.with.at.least.degree.15.cases)
names(hubs.with.at.least.degree.15.cases_df)[1] <- "nodes"

hubs.with.at.least.degree.15.cases_df <- hubs.with.at.least.degree.15.cases_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.15.cases_table <- as.data.frame(table(hubs.with.at.least.degree.15.cases_df$nodeType))
names(hubs.with.at.least.degree.15.cases_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.15.cases_table)[2] <- "Frequency"
hubs.with.at.least.degree.15.cases_table$Network <- "Affected Individuals Network"
hubs.with.at.least.degree.15.cases_table

hubs.with.at.least.degree.15.combined <- rbind(
  hubs.with.at.least.degree.15.controls_table,
  hubs.with.at.least.degree.15.cases_table
)

pdf("/grehawi/splice-reg-prj/Figures/barplot_hubs_dg15_nodeType.pdf", width = 12, height = 6)
# The plot is print()ed explicitly: a ggplot object is only drawn when printed,
# and auto-printing does not happen when the code is run via source() or from
# inside a function/loop, which would leave the PDF empty.
print(
  ggplot(hubs.with.at.least.degree.15.combined, aes(fill = NodeType, y = Frequency, x = Network)) +
    geom_bar(position = "dodge", stat = "identity") +
    theme_classic() +
    xlab("Hubs (dg >= 15)") +
    theme(
      axis.text = element_text(size = 17),
      axis.title.x = element_text(size = 20),
      axis.title.y = element_text(size = 20),
      legend.text = element_text(size = 22),
      legend.title = element_text(size = 22)
    ) +
    scale_fill_manual(values = c('#E7872B', '#3A68AE'))
)
dev.off()

In [None]:
# Break down the hub nodes (degree >= 20) of each network by node type
# (IR vs Gene) and save a grouped bar plot of the counts as a PDF.
# A node is labelled "IR" when its name is listed in tx.ids.names$name,
# otherwise "Gene".

# --- Controls network ---
hubs.with.at.least.degree.20.controls_df <- as.data.frame(hubs.with.at.least.degree.20.controls)
names(hubs.with.at.least.degree.20.controls_df)[1] <- "nodes"

hubs.with.at.least.degree.20.controls_df <- hubs.with.at.least.degree.20.controls_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.20.controls_table <- as.data.frame(table(hubs.with.at.least.degree.20.controls_df$nodeType))
names(hubs.with.at.least.degree.20.controls_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.20.controls_table)[2] <- "Frequency"
hubs.with.at.least.degree.20.controls_table$Network <- "Unaffected Individuals Network"
hubs.with.at.least.degree.20.controls_table

# --- Cases network ---
hubs.with.at.least.degree.20.cases_df <- as.data.frame(hubs.with.at.least.degree.20.cases)
names(hubs.with.at.least.degree.20.cases_df)[1] <- "nodes"

hubs.with.at.least.degree.20.cases_df <- hubs.with.at.least.degree.20.cases_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "IR", "Gene"))

hubs.with.at.least.degree.20.cases_table <- as.data.frame(table(hubs.with.at.least.degree.20.cases_df$nodeType))
names(hubs.with.at.least.degree.20.cases_table)[1] <- "NodeType"
names(hubs.with.at.least.degree.20.cases_table)[2] <- "Frequency"
hubs.with.at.least.degree.20.cases_table$Network <- "Affected Individuals Network"
hubs.with.at.least.degree.20.cases_table

hubs.with.at.least.degree.20.combined <- rbind(
  hubs.with.at.least.degree.20.controls_table,
  hubs.with.at.least.degree.20.cases_table
)

pdf("/grehawi/splice-reg-prj/Figures/barplot_hubs_dg20_nodeType.pdf", width = 12, height = 6)
# The plot is print()ed explicitly: a ggplot object is only drawn when printed,
# and auto-printing does not happen when the code is run via source() or from
# inside a function/loop, which would leave the PDF empty.
print(
  ggplot(hubs.with.at.least.degree.20.combined, aes(fill = NodeType, y = Frequency, x = Network)) +
    geom_bar(position = "dodge", stat = "identity") +
    theme_classic() +
    xlab("Hubs (dg >= 20)") +
    theme(
      axis.text = element_text(size = 17),
      axis.title.x = element_text(size = 20),
      axis.title.y = element_text(size = 20),
      legend.text = element_text(size = 22),
      legend.title = element_text(size = 22)
    ) +
    scale_fill_manual(values = c('#E7872B', '#3A68AE'))
)
dev.off()

In [None]:
# Count transcript (IR) nodes vs gene nodes among the hub nodes that are
# shared by both networks.
hubs_intersect_df <- as.data.frame(hubs_intersect)
names(hubs_intersect_df)[1] <- "nodes"

# Classify by membership in the transcript-name table, the same rule used for
# the per-network hub breakdowns. Testing for a "-" in the name would mislabel
# gene symbols that themselves contain a dash as transcripts.
hubs_intersect_df <- hubs_intersect_df %>%
  mutate(nodeType = ifelse(nodes %in% tx.ids.names$name, "Transcript", "Gene"))

hubs_intersect_table <- as.data.frame(table(hubs_intersect_df$nodeType))
names(hubs_intersect_table)[1] <- "Type"
hubs_intersect_table

### Investigate IR nodes since many hub nodes are IR and because they are mostly connected with other IR nodes

In [None]:
# From the shared hubs, keep the IR nodes (names listed in the
# transcript-name table).
hubs_intersect_IR_Nodes <- intersect(hubs_intersect, tx.ids.names$name)
hubs_intersect_IR_Nodes

# Reduce each IR node name to its gene name, then see which of those genes
# are known splicing factors.
hubs_intersect_IR_Nodes_with_gene_names <- remove_after_dash(hubs_intersect_IR_Nodes)
hubs_intersect_IR_Nodes_with_gene_names_known_factors <- intersect(
  hubs_intersect_IR_Nodes_with_gene_names,
  all.SF
)
hubs_intersect_IR_Nodes_with_gene_names_known_factors

# Chosen for follow-up: HNRNPH1 and its isoform HNRNPH1-227

In [None]:
# Look up the row(s) of the transcript names/IDs table for 'HNRNPH1-227'.
tx.ids.names[tx.ids.names$name == 'HNRNPH1-227', ]

In [None]:
# Edges of the isoform node 'HNRNPH1-227' in the THRESHOLDED networks, and the
# corresponding undirected graphs (plotting calls are currently disabled).
cases_network_HNRNPH1227_connections <- cases.annotated.filtered.single.edges.thr[
  cases.annotated.filtered.single.edges.thr$node1 == 'HNRNPH1-227' |
    cases.annotated.filtered.single.edges.thr$node2 == 'HNRNPH1-227',
]
controls_network_HNRNPH1227_connections <- controls.annotated.filtered.single.edges.thr[
  controls.annotated.filtered.single.edges.thr$node1 == 'HNRNPH1-227' |
    controls.annotated.filtered.single.edges.thr$node2 == 'HNRNPH1-227',
]

options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
cases_network_HNRNPH1227 <- graph_from_data_frame(
  cases_network_HNRNPH1227_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/cases_network_HubIR_HNRNPH1227.svg", width = 10, height = 10)
#plot(cases_network_HNRNPH1227, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
controls_network_HNRNPH1227 <- graph_from_data_frame(
  controls_network_HNRNPH1227_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/controls_network_HubIR_HNRNPH1227.svg", width = 10, height = 10)
#plot(controls_network_HNRNPH1227, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

In [None]:
# Edges of the isoform node 'HNRNPH1-227' in the UNTHRESHOLDED networks, and
# the corresponding undirected graphs (plotting calls are currently disabled).
cases_network_unthrsh_HNRNPH1227_connections <- annotated.filtered.cases.withNames.singleE[
  annotated.filtered.cases.withNames.singleE$node1 == 'HNRNPH1-227' |
    annotated.filtered.cases.withNames.singleE$node2 == 'HNRNPH1-227',
]
controls_network_unthrsh_HNRNPH1227_connections <- annotated.filtered.controls.withNames.singleE[
  annotated.filtered.controls.withNames.singleE$node1 == 'HNRNPH1-227' |
    annotated.filtered.controls.withNames.singleE$node2 == 'HNRNPH1-227',
]

options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
cases_network_unthrsh_HNRNPH1227 <- graph_from_data_frame(
  cases_network_unthrsh_HNRNPH1227_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/cases_network_unthrsh_HubIR_HNRNPH1227.svg", width = 10, height = 10)
#plot(cases_network_unthrsh_HNRNPH1227, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
controls_network_unthrsh_HNRNPH1227 <- graph_from_data_frame(
  controls_network_unthrsh_HNRNPH1227_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/controls_network_unthrsh_HubIR_HNRNPH1227.svg", width = 10, height = 10)
#plot(controls_network_unthrsh_HNRNPH1227, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

In [None]:
# Edges of the gene node 'HNRNPH1' in the UNTHRESHOLDED networks.
# Note: each variable first holds the edge subset and is then overwritten by
# the undirected graph built from it (plotting calls are currently disabled).
cases_network_unthrsh_HNRNPH1 <- annotated.filtered.cases.withNames.singleE[
  annotated.filtered.cases.withNames.singleE$node1 == 'HNRNPH1' |
    annotated.filtered.cases.withNames.singleE$node2 == 'HNRNPH1',
]
controls_network_unthrsh_HNRNPH1 <- annotated.filtered.controls.withNames.singleE[
  annotated.filtered.controls.withNames.singleE$node1 == 'HNRNPH1' |
    annotated.filtered.controls.withNames.singleE$node2 == 'HNRNPH1',
]

options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
cases_network_unthrsh_HNRNPH1 <- graph_from_data_frame(
  cases_network_unthrsh_HNRNPH1,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/cases_network_unthrsh_HubIR_HNRNPH1.svg", width = 10, height = 10)
#plot(cases_network_unthrsh_HNRNPH1, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
controls_network_unthrsh_HNRNPH1 <- graph_from_data_frame(
  controls_network_unthrsh_HNRNPH1,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/controls_network_unthrsh_HubIR_HNRNPH1.svg", width = 10, height = 10)
#plot(controls_network_unthrsh_HNRNPH1, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()


In [None]:
# Edges of the gene node 'HNRNPH1' in the THRESHOLDED networks, and the
# corresponding undirected graphs (plotting calls are currently disabled).
cases_network_HNRNPH1_connections <- cases.annotated.filtered.single.edges.thr[
  cases.annotated.filtered.single.edges.thr$node1 == 'HNRNPH1' |
    cases.annotated.filtered.single.edges.thr$node2 == 'HNRNPH1',
]
controls_network_HNRNPH1_connections <- controls.annotated.filtered.single.edges.thr[
  controls.annotated.filtered.single.edges.thr$node1 == 'HNRNPH1' |
    controls.annotated.filtered.single.edges.thr$node2 == 'HNRNPH1',
]

options(repr.plot.width = 10, repr.plot.height = 9)
par(mfrow = c(1, 2))

# Cases (use directed = TRUE for directed graphs)
cases_network_HNRNPH1 <- graph_from_data_frame(
  cases_network_HNRNPH1_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/cases_network_HubIR_HNRNPH1.svg", width = 10, height = 10)
#plot(cases_network_HNRNPH1, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

# Controls
controls_network_HNRNPH1 <- graph_from_data_frame(
  controls_network_HNRNPH1_connections,
  directed = FALSE
)
#svg("/grehawi/splice-reg-prj/Figures/controls_network_HubIR_HNRNPH1.svg", width = 10, height = 10)
#plot(controls_network_HNRNPH1, vertex.color= '#E7872B', edge.width = 1.6)
#dev.off()

In [None]:
# Annotate the neighbours of the node of interest (HNRNPH1, cases network):
# all endpoints of its edges (the vector still includes HNRNPH1 itself),
# checked against the cases hubs, the known TFs and the known splicing factors.
HNRNPH1_neighbors_cases <- c(
  cases_network_HNRNPH1_connections$node1,
  cases_network_HNRNPH1_connections$node2
)

intersect(HNRNPH1_neighbors_cases, hubs.with.at.least.degree.10.cases)
#intersect(HNRNPH1_neighbors_cases, all_disreg_genes_trx) -->None
intersect(HNRNPH1_neighbors_cases, tf.gene.links$Name.TF)
intersect(HNRNPH1_neighbors_cases, all.SF)

In [None]:
# Same annotation for the controls network: pool both endpoint columns of
# the HNRNPH1 edges (HNRNPH1 itself is still included here).
HNRNPH1_neighbors_controls <- with(controls_network_HNRNPH1_connections, c(node1, node2))

# Overlap with hubs (degree >= 10), transcription factors and splicing factors.
intersect(HNRNPH1_neighbors_controls, hubs.with.at.least.degree.10.controls)
#intersect(HNRNPH1_neighbors_controls, all_disreg_genes_trx) -->NONE
intersect(HNRNPH1_neighbors_controls, tf.gene.links$Name.TF)
intersect(HNRNPH1_neighbors_controls, all.SF)

In [None]:
# Nodes on the edges of the IR node HNRNPH1-227 in the cases network.
HNRNPH1227_neighbors_cases <- with(cases_network_HNRNPH1227_connections, c(node1, node2))
# Gene-level names: strip the "-<isoform>" suffix so they can be matched to gene lists.
HNRNPH1227_neighbors_cases_genes <- remove_after_dash(HNRNPH1227_neighbors_cases)

# Hubs are matched on the full node name; TF / SF lists on the gene name.
intersect(HNRNPH1227_neighbors_cases, hubs.with.at.least.degree.10.cases)
#intersect(HNRNPH1227_neighbors_cases, all_disreg_genes_trx) -->NONE
intersect(HNRNPH1227_neighbors_cases_genes, tf.gene.links$Name.TF)
intersect(HNRNPH1227_neighbors_cases_genes, all.SF)

In [None]:
# Nodes on the edges of the IR node HNRNPH1-227 in the controls network.
HNRNPH1227_neighbors_controls <- with(controls_network_HNRNPH1227_connections, c(node1, node2))
# Gene-level names: strip the "-<isoform>" suffix so they can be matched to gene lists.
HNRNPH1227_neighbors_controls_genes <- remove_after_dash(HNRNPH1227_neighbors_controls)

# Hubs / dysregulated features are matched on the full node name; TF / SF lists on the gene name.
intersect(HNRNPH1227_neighbors_controls, hubs.with.at.least.degree.10.controls)
intersect(HNRNPH1227_neighbors_controls, all_disreg_genes_trx)
intersect(HNRNPH1227_neighbors_controls_genes, tf.gene.links$Name.TF)
intersect(HNRNPH1227_neighbors_controls_genes, all.SF)

#### Check whether the isoform-level nodes (IR nodes) of a gene capture more known protein-protein interactions than its gene-level node

In [None]:
# Load the BioGRID interaction table (tab-separated, with a header row).
biogrid <- fread("/grehawi/splice-reg-prj/new-data/BIOGRID-ALL-4.4.237.tab.txt", sep = '\t', header = TRUE)
# Keep human-human interactions only: BOTH interactors must be Homo sapiens
# (NCBI taxonomy ID 9606). The previous filter tested ORGANISM_A_ID twice,
# so pairs with a non-human interactor B were retained.
biogrid <- biogrid[biogrid$ORGANISM_A_ID == '9606' & biogrid$ORGANISM_B_ID == '9606', ]
head(biogrid)
dim(biogrid)

In [None]:
# Neighbours of the gene-level node of the splicing factor HNRNPH1:
# drop the node itself so that only its partners remain.
HNRNPH1_neighbors_cases <- HNRNPH1_neighbors_cases[HNRNPH1_neighbors_cases != 'HNRNPH1']
HNRNPH1_neighbors_controls <- HNRNPH1_neighbors_controls[HNRNPH1_neighbors_controls != 'HNRNPH1']

# Neighbours of the IR node HNRNPH1-227: drop the node itself, then reduce the
# remaining node names to gene names (text before the first '-').
HNRNPH1227_neighbors_cases <- remove_after_dash(
  HNRNPH1227_neighbors_cases[HNRNPH1227_neighbors_cases != 'HNRNPH1-227']
)
HNRNPH1227_neighbors_controls <- remove_after_dash(
  HNRNPH1227_neighbors_controls[HNRNPH1227_neighbors_controls != 'HNRNPH1-227']
)

In [None]:
# Which node (gene-level or IR) captures more known interactions?
# Known HNRNPH1 partners in BioGRID. An interaction row can list HNRNPH1 as
# either interactor A or interactor B, so partners are collected from both
# columns (previously only rows with HNRNPH1 as interactor A were used).
HNRNPH1_interactions <- unique(c(
  biogrid$OFFICIAL_SYMBOL_B[biogrid$OFFICIAL_SYMBOL_A == 'HNRNPH1'],
  biogrid$OFFICIAL_SYMBOL_A[biogrid$OFFICIAL_SYMBOL_B == 'HNRNPH1']
))

# interactions captured by the gene node HNRNPH1 in the cases network
HNRNPH1_interactions_captured_cases <- intersect(HNRNPH1_neighbors_cases, HNRNPH1_interactions)
# interactions captured by the IR node HNRNPH1-227 in the cases network
HNRNPH1227_interactions_captured_cases <- intersect(HNRNPH1227_neighbors_cases, HNRNPH1_interactions)

# interactions captured by the gene node HNRNPH1 in the controls network
HNRNPH1_interactions_captured_controls <- intersect(HNRNPH1_neighbors_controls, HNRNPH1_interactions)
# interactions captured by the IR node HNRNPH1-227 in the controls network
HNRNPH1227_interactions_captured_controls <- intersect(HNRNPH1227_neighbors_controls, HNRNPH1_interactions)

In [None]:
# Display the known interactors recovered in the cases network:
# first by the gene-level node, then by the IR node.
HNRNPH1_interactions_captured_cases
HNRNPH1227_interactions_captured_cases

In [None]:
# Display the known interactors recovered in the controls network:
# first by the gene-level node, then by the IR node.
HNRNPH1_interactions_captured_controls
HNRNPH1227_interactions_captured_controls

In [None]:
# UpSet plot (cases network): overlap between the known interactors captured
# by the gene node HNRNPH1 and by the IR node HNRNPH1-227.
listInput <- list(
  HNRNPH1     = HNRNPH1_interactions_captured_cases,
  HNRNPH1_227 = HNRNPH1227_interactions_captured_cases
)
# onefile = FALSE as in the original export settings.
pdf("/grehawi/splice-reg-prj/Figures/upsetplot_HNRNPH1_cases.pdf", onefile = FALSE)
upset(
  fromList(listInput),
  order.by = "freq",
  point.size = 4,
  text.scale = 3
)
dev.off()

In [None]:
# UpSet plot (controls network): overlap between the known interactors captured
# by the gene node HNRNPH1 and by the IR node HNRNPH1-227.
listInput <- list(
  HNRNPH1     = HNRNPH1_interactions_captured_controls,
  HNRNPH1_227 = HNRNPH1227_interactions_captured_controls
)
pdf("/grehawi/splice-reg-prj/Figures/upsetplot_HNRNPH1_controls.pdf", onefile = FALSE)
upset(
  fromList(listInput),
  order.by = "freq",
  point.size = 4,
  text.scale = 3
)
dev.off()

### Check nodes with a big shift in degree between the cases and controls networks

Our differential analysis also tests against the unthresholded version of the other network, to make sure that filtering is not responsible for the differential edges. Hence, for the following differential hub analysis, we consider nodes that have degree of at least 10 in the cases network but a substantially smaller degree (at most half) in the UNTHRESHOLDED controls network, and vice versa.

In [None]:
# Build the UNTHRESHOLDED networks and tabulate the degree of every node.
# get_igraph_obj() turns an edge table into an undirected igraph object;
# get_degree() returns a node/degree table sorted by decreasing degree.
g.controls.unthr <- get_igraph_obj(annotated.filtered.controls.withNames.singleE)
g.cases.unthr <- get_igraph_obj(annotated.filtered.cases.withNames.singleE)

g.controls.degrees.unthr <- get_degree(g.controls.unthr)
g.cases.degrees.unthr <- get_degree(g.cases.unthr)

In [None]:
# Master hubs of the cases network.
# Start from the nodes with degree >= 10 in the thresholded cases network
# ('hubs.with.at.least.degree.10.cases') and attach their degree in the
# UNTHRESHOLDED controls network: degree.x = cases, degree.y = controls.
merged_networks_degrees_casesthr_controlsUnth <- merge(
  x = g.cases.degrees[g.cases.degrees$node %in% hubs.with.at.least.degree.10.cases, ],
  y = g.controls.degrees.unthr,
  by = "node"
)
# Keep hubs whose controls degree is at most half of their cases degree.
master_hubs_cases <- merged_networks_degrees_casesthr_controlsUnth[
  with(merged_networks_degrees_casesthr_controlsUnth, degree.y <= 0.5 * degree.x),
]
head(master_hubs_cases)
dim(master_hubs_cases)

In [None]:
# Master hubs of the controls network.
# Start from the nodes with degree >= 10 in the thresholded controls network
# ('hubs.with.at.least.degree.10.controls') and attach their degree in the
# UNTHRESHOLDED cases network: degree.x = controls, degree.y = cases.
merged_networks_degrees_controlsthr_casesUnth <- merge(
  x = g.controls.degrees[g.controls.degrees$node %in% hubs.with.at.least.degree.10.controls, ],
  y = g.cases.degrees.unthr,
  by = "node"
)
# Keep hubs whose cases degree is at most half of their controls degree.
master_hubs_controls <- merged_networks_degrees_controlsthr_casesUnth[
  with(merged_networks_degrees_controlsthr_casesUnth, degree.y <= 0.5 * degree.x),
]
head(master_hubs_controls)
dim(master_hubs_controls)

In [None]:
# How many of the cases master hubs are also dysregulated, TFs, or SFs?
# Dysregulated features are matched on the full node name ...
intersect_masterHubsCases_disreg <- intersect(
  master_hubs_cases$node,
  all_disreg_genes_trx
)
intersect_masterHubsCases_disreg

# ... whereas TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_master_nodes <- remove_after_dash(master_hubs_cases$node)
intersect(names_of_master_nodes, tf.gene.links$Name.TF)
intersect(names_of_master_nodes, all.SF)

In [None]:
# Display the node names of all master hubs of the cases network.
master_hubs_cases$node

In [None]:
# How many IR (transcript-level) nodes are in the list of master hubs of the cases network:
# counts the master-hub node names that also appear in tx.ids.names$name.
length(intersect(master_hubs_cases$node, tx.ids.names$name))

In [None]:
# Long-format table for the heatmap: one row per (network, master hub) with the
# hub's degree in the cases network (degree.x, labelled 'AIN') and in the
# unthresholded controls network (degree.y, labelled 'UIN').
master_nodes_cases_df <- data.frame(
  Network = rep(c("AIN", "UIN"), each = length(master_hubs_cases$node)),
  node    = rep(master_hubs_cases$node, times = 2),
  degree  = c(master_hubs_cases$degree.x, master_hubs_cases$degree.y)
)

# Label decorations:
#   '* ' prefix -> dysregulated node
#   '~'  prefix -> transcription factor (TF) node
#   '^'  prefix -> splicing factor (SF) node
condition <- master_nodes_cases_df$node %in% intersect_masterHubsCases_disreg
master_nodes_cases_df$node[condition] <- paste('*', master_nodes_cases_df$node[condition])

master_nodes_cases_df$node <- local({
  labels <- master_nodes_cases_df$node

  # Hard-coded TF hubs.
  tf_hubs <- c('CCNT2-208', 'CREBL2-201', 'SATB1-201', 'ZNF292-203',
               'HMGB1-201', 'CHD2', 'AFF4-203')
  is_tf <- labels %in% tf_hubs
  labels[is_tf] <- paste0('~', labels[is_tf])

  # Hard-coded SF hubs; NUDT21-201 already carries the '* ' prefix at this point.
  sf_hubs <- c('CRNKL1-207', 'DDX39B-219', '* NUDT21-201', 'SRRM2-201',
               'SRSF6-201', 'ZC3HAV1-204')
  is_sf <- labels %in% sf_hubs
  labels[is_sf] <- paste0('^', labels[is_sf])

  labels
})

head(master_nodes_cases_df)
dim(master_nodes_cases_df)

In [None]:
options(repr.plot.width = 49, repr.plot.height = 4)
# Wide matrix for the heatmap: one row per network (AIN / UIN), one column per
# master hub, cell value = degree.
heatmap_matrix <- reshape2::acast(master_nodes_cases_df, Network ~ node, value.var = 'degree')
# Create the heatmap.
# The extra legend break at 31 labelled "Degree\n" places a title above the colour bar.
heatmap = pheatmap(heatmap_matrix,
         cluster_rows = FALSE,    # keep the network rows in their given order
         cluster_cols = FALSE,    # keep the hub columns in their given order
         main = NA, legend_breaks = c(10, 20, 30, 31), legend_labels = c("10", "20", "30", "Degree\n"),
         # 'cellheight' was previously misspelled 'cellhight', which pheatmap
         # silently absorbed through '...' so the cell height was never applied.
         cellwidth = 40, cellheight = 40, fontsize = 32, angle_col = "45")
heatmap

In [None]:
# Write the pheatmap object created above to a PDF (47 x 6 inches).
ggsave("/grehawi/splice-reg-prj/Figures/pheatmap_master_nodes_cases.pdf", plot = heatmap, width = 47, height = 6)

In [None]:
# Overlap with a published gene list.
# Source given in the original note: Zeng et al
# (https://www.frontiersin.org/journals/genetics/articles/10.3389/fgene.2019.00703/full#supplementary-material)
# NOTE(review): the file and variable are named after Gandal et al. while the
# note cites Zeng et al. -- confirm which publication this list comes from.
genes_Gandal_etal <- read.table(
  '/grehawi/splice-reg-prj/new-data/ARACNE/genes_Gandal_etal.txt',
  header = TRUE
)
dim(genes_Gandal_etal)

# Overlap with the hubs and with the master hubs (gene names) of the cases network.
intersect(genes_Gandal_etal$Genes, hubs_cases_unique)
intersect(genes_Gandal_etal$Genes, names_of_master_nodes)

In [None]:
# Gene lists from the Wittenberg et al. files (down- and up-regulated in MDD,
# per the file names); each file has a header row with a 'Genes' column.
wittenberg_down_genes <- read.table(
  '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_down_MDD_wittenberg_etal.txt',
  header = TRUE
)
head(wittenberg_down_genes)
dim(wittenberg_down_genes)

wittenberg_up_genes <- read.table(
  '/grehawi/splice-reg-prj/new-data/Diff-Analysis/genes_up_MDD_wittenberg_etal.txt',
  header = TRUE
)
head(wittenberg_up_genes)
dim(wittenberg_up_genes)

# Overlap of each list with the gene names of the cases master hubs.
intersect(wittenberg_up_genes$Genes, names_of_master_nodes)
intersect(wittenberg_down_genes$Genes, names_of_master_nodes)

In [None]:
# Show every edge that touches DOCK5 (in either endpoint column),
# first in the thresholded cases network, then in the thresholded controls network.
cases.annotated.filtered.single.edges.thr[
  with(cases.annotated.filtered.single.edges.thr, node1 == 'DOCK5' | node2 == 'DOCK5'),
]
controls.annotated.filtered.single.edges.thr[
  with(controls.annotated.filtered.single.edges.thr, node1 == 'DOCK5' | node2 == 'DOCK5'),
]

### clusterProfiler GO-term enrichment of the master hub nodes in both networks (61 each) and their first-order neighbors

In [None]:
# Accumulators for the names of all nodes in the 1-hop neighbourhoods of the
# master hubs. Names are appended per hub, so a node shared by several hubs
# (or being a hub itself) can appear more than once.
neighborhood_cases_master_nodes =c()
neighborhood_controls_master_nodes =c()

# Define the node of interest

# Cases network: loop over the master hubs of the cases network.
for (node in master_hubs_cases$node){
    # Define the node of interest
    node_of_interest <- node

    # Get the neighborhood of the node with order 1
    # (neighborhood() returns one vertex set per requested node; take the first)
    neighborhood_nodes_cases <- neighborhood(g.cases, order = 1, nodes = node_of_interest)[[1]]

    # Induce the subgraph from the neighborhood nodes
    subgraph_cases <- induced_subgraph(g.cases, vids = neighborhood_nodes_cases)
    # Append the vertex names of that subgraph to the running list
    neighborhood_cases_master_nodes = c(neighborhood_cases_master_nodes, V(subgraph_cases)$name)
}

# Controls network: same procedure for the master hubs of the controls network.
for (node in master_hubs_controls$node){
    # Define the node of interest
    node_of_interest <- node

    # Get the neighborhood of the node with order 1
    neighborhood_nodes_controls <- neighborhood(g.controls, order = 1, nodes = node_of_interest)[[1]]

    # Induce the subgraph from the neighborhood nodes
    subgraph_controls <- induced_subgraph(g.controls, vids = neighborhood_nodes_controls)
    # Append the vertex names of that subgraph to the running list
    neighborhood_controls_master_nodes = c(neighborhood_controls_master_nodes, V(subgraph_controls)$name)
}

In [None]:
# One-column data frames ('gene_name') holding the neighbourhood nodes of the
# master hubs, for the controls and the cases network.
# Nodes that are transcripts (listed in tx.ids.names$name) are replaced by
# their gene name, i.e. the text before the first '-'.
controls_master_nodes_neighborhood_df <- data.frame(neighborhood_controls_master_nodes)
colnames(controls_master_nodes_neighborhood_df) <- 'gene_name'
controls_master_nodes_neighborhood_df <- controls_master_nodes_neighborhood_df %>%
  mutate(
    gene_name = if_else(
      gene_name %in% tx.ids.names$name,
      remove_after_dash(gene_name),
      gene_name
    )
  )

cases_master_nodes_neighborhood_df <- data.frame(neighborhood_cases_master_nodes)
colnames(cases_master_nodes_neighborhood_df) <- 'gene_name'
cases_master_nodes_neighborhood_df <- cases_master_nodes_neighborhood_df %>%
  mutate(
    gene_name = if_else(
      gene_name %in% tx.ids.names$name,
      remove_after_dash(gene_name),
      gene_name
    )
  )

                                                    

In [None]:
# Size of the neighbourhood tables (rows = collected node names, duplicates included).
dim(controls_master_nodes_neighborhood_df)
dim(cases_master_nodes_neighborhood_df)

In [None]:
# Gene sets: master hubs plus first-order neighbours, cases (1) and controls (2).
gene_set1 <- cases_master_nodes_neighborhood_df$gene_name
gene_set2 <- controls_master_nodes_neighborhood_df$gene_name

# GO Biological Process over-representation for both sets. Genes are given as
# symbols, the background is genes.ids.names$gene_name, p-values are
# BH-adjusted, and the cutoffs are deliberately permissive (0.7).
go_enrichment_1 <- enrichGO(
  gene          = gene_set1,
  universe      = genes.ids.names$gene_name,
  OrgDb         = org.Hs.eg.db,
  keyType       = "SYMBOL",
  ont           = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.7,
  qvalueCutoff  = 0.7
)

go_enrichment_2 <- enrichGO(
  gene          = gene_set2,
  universe      = genes.ids.names$gene_name,
  OrgDb         = org.Hs.eg.db,
  keyType       = "SYMBOL",
  ont           = "BP",
  pAdjustMethod = "BH",
  pvalueCutoff  = 0.7,
  qvalueCutoff  = 0.7
)

In [None]:
# Display the enrichment result for the cases (AIN) gene set.
go_enrichment_1

In [None]:
# Display the enrichment result for the controls (UIN) gene set.
go_enrichment_2

In [None]:
# Reduce redundancy among the enriched GO terms with clusterProfiler's
# simplify(): terms whose semantic similarity exceeds the cutoff (0.7) are
# merged, keeping the one with the smallest adjusted p-value.
# Note: with pvalueCutoff and qvalueCutoff set to 1 in enrichGO() this step
# takes very long to compute.
go_enrichment_1 <- simplify(
  go_enrichment_1,
  cutoff = 0.7,
  by = "p.adjust",
  select_fun = min
)
go_enrichment_2 <- simplify(
  go_enrichment_2,
  cutoff = 0.7,
  by = "p.adjust",
  select_fun = min
)

In [None]:
# Dot plot of the most significant GO terms in both gene sets.
options(repr.plot.width = 16, repr.plot.height = 18)

# Enrichment results as plain data frames, tagged with the set they belong to.
go_df_1 <- as.data.frame(go_enrichment_1)
go_df_2 <- as.data.frame(go_enrichment_2)
go_df_1$set <- "Master Hubs Neighbours-AIN"
go_df_2$set <- "Master Hubs Neighbours-UIN"

# Top 10 most significant GO terms per set. head() is used instead of
# `[1:10, ]` so that a set with fewer than 10 terms does not get padded with
# all-NA rows.
go_df_1_sub <- head(go_df_1[order(go_df_1$p.adjust, decreasing = FALSE), ], 10)
go_df_2_sub <- head(go_df_2[order(go_df_2$p.adjust, decreasing = FALSE), ], 10)

combined_go_df_sub <- rbind(go_df_1_sub, go_df_2_sub)

# For every selected term also pull its row from the OTHER set (if present),
# so each term is shown for both sets wherever it was tested.
combined_go_df_sub_go_ids <- combined_go_df_sub$ID
go_df_1_sub_extended <- go_df_1[go_df_1$ID %in% combined_go_df_sub_go_ids, ]
go_df_2_sub_extended <- go_df_2[go_df_2$ID %in% combined_go_df_sub_go_ids, ]

combined_go_df_sub_extended <- rbind(go_df_1_sub_extended, go_df_2_sub_extended)

# Point size = RichFactor column, colour = -log10(adjusted p-value).
ggplot(combined_go_df_sub_extended, aes(x = set, y = Description, size = RichFactor, color = 0 - log10(p.adjust))) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_minimal() +
  labs(title = "GO Enrichment for Gene Sets", 
       x = "Gene Set", 
       y = "GO Term",
       size = "Rich Factor", 
       color = "-log10 Adjusted p-value") +
    theme(axis.text= element_text(size = 18), axis.title.x = element_text(size = 22),
                                                         axis.title.y = element_text(size = 22),
                                                         axis.text.x = element_text(angle = 45, hjust = 1),
                                                         legend.text=element_text(size=20),
                                                         legend.title=element_text(size=22))



In [None]:
# Save the GO dot plot. No `plot` argument is given, so ggsave() writes the
# most recently displayed ggplot -- run this right after the dot plot cell.
ggsave("/grehawi/splice-reg-prj/Figures/dotplot_go_enrch_masterHubs_all.pdf", width = 18, height = 18)

In [None]:
# Supplementary table: all simplified GO terms of both sets.
# The target is a .csv file, so write comma-separated values (write.table()
# defaults to a single space as separator, which CSV readers mis-parse).
write.table(rbind(go_df_1, go_df_2), "/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_GO_all_master_hubs.csv", sep = ",", row.names = FALSE)

### MAGMA analysis of the top 2 master nodes in each network and their 2-hops neighbors

In [None]:
# Select the master hub(s) for the MAGMA analysis of their neighbourhood:
# show, for each network, the row(s) with the largest degree in the hub's own
# thresholded network (degree.x).
master_hubs_cases[
  with(master_hubs_cases, degree.x == max(degree.x)),
]
master_hubs_controls[
  with(master_hubs_controls, degree.x == max(degree.x)),
]

In [None]:
# Second-ranked master hub per network: sort by degree.x (descending) and take
# entry 2 of the sorted table.
# NOTE(review): `x[order(...)][2]` picks the 2nd ROW only if the table is a
# data.table (on a plain data.frame it would index columns) -- presumably it is,
# since get_degree() returns a data.table; confirm.
second_largest_master_cases <- master_hubs_cases[order(master_hubs_cases$degree.x, decreasing = TRUE)][2]
second_largest_master_cases
second_largest_master_controls <- master_hubs_controls[order(master_hubs_controls$degree.x, decreasing = TRUE)][2]
second_largest_master_controls

In [None]:
# Look up the rows of the transcript ID/name table for the three IR hubs of interest.
tx.ids.names[tx.ids.names$name %in% c("RC3H1-202", "SRSF6-201", "SYNCRIP-202"), ]

In [None]:
# Accumulators for the names of all nodes within 2 hops of the selected
# master hubs. Names are appended per hub, so duplicates are possible.
neighborhood_cases =c()
neighborhood_controls =c()

# Define the node of interest

# Cases network: the two selected hubs (hard-coded).
for (node in c("RC3H1-202", "SRSF6-201")){
    # Define the node of interest
    node_of_interest <- node

    # Get the neighborhood of the node with order 2
    # (neighborhood() returns one vertex set per requested node; take the first)
    neighborhood_nodes_cases <- neighborhood(g.cases, order = 2, nodes = node_of_interest)[[1]]

    # Induce the subgraph from the neighborhood nodes
    subgraph_cases <- induced_subgraph(g.cases, vids = neighborhood_nodes_cases)
    # Append the vertex names of that subgraph to the running list
    neighborhood_cases = c(neighborhood_cases, V(subgraph_cases)$name)
}

# Controls network: the two selected hubs (hard-coded).
for (node in c("MEGF9", "SYNCRIP-202")){
    # Define the node of interest
    node_of_interest <- node

    # Get the neighborhood of the node with order 2
    neighborhood_nodes_controls <- neighborhood(g.controls, order = 2, nodes = node_of_interest)[[1]]

    # Induce the subgraph from the neighborhood nodes
    subgraph_controls <- induced_subgraph(g.controls, vids = neighborhood_nodes_controls)
    # Append the vertex names of that subgraph to the running list
    neighborhood_controls = c(neighborhood_controls, V(subgraph_controls)$name)
}

In [None]:
# Number of collected node names (duplicates included) in the cases neighbourhoods.
length(neighborhood_cases)

In [None]:
# Number of collected node names (duplicates included) in the controls neighbourhoods.
length(neighborhood_controls)

In [None]:
# Nodes that occur in both the controls and the cases 2-hop neighbourhoods.
intersect(neighborhood_controls, neighborhood_cases)

In [None]:
# Gene location file used to map gene names (and the genes of transcripts)
# to Entrez IDs (NCBI37.3.gene.loc). The file is read without a header;
# column 1 is named 'entrez_id' and column 6 'gene_name'.
NCBI37_gene_loc <- read.table(
  '/grehawi/splice-reg-prj/new-data/MAGMA/NCBI37.3/NCBI37.3.gene.loc'
)
names(NCBI37_gene_loc)[c(1, 6)] <- c("entrez_id", "gene_name")
head(NCBI37_gene_loc)

In [None]:
# One-column data frames ('gene_name') with the 2-hop neighbourhood nodes of
# the selected hubs, for the controls and the cases network.
# Nodes that are transcripts (listed in tx.ids.names$name) are replaced by
# their gene name, i.e. the text before the first '-'.
neighborhood_controls_df <- data.frame(neighborhood_controls)
colnames(neighborhood_controls_df) <- 'gene_name'
neighborhood_controls_df <- neighborhood_controls_df %>%
  mutate(
    gene_name = if_else(
      gene_name %in% tx.ids.names$name,
      remove_after_dash(gene_name),
      gene_name
    )
  )

neighborhood_cases_df <- data.frame(neighborhood_cases)
colnames(neighborhood_cases_df) <- 'gene_name'
neighborhood_cases_df <- neighborhood_cases_df %>%
  mutate(
    gene_name = if_else(
      gene_name %in% tx.ids.names$name,
      remove_after_dash(gene_name),
      gene_name
    )
  )

                                                    

In [None]:
# Attach Entrez IDs to the neighbourhood genes (left join on gene_name, so
# genes without a match keep an NA entrez_id).
neighborhood_cases_df_tmp <- merge(
  x = neighborhood_cases_df,
  y = NCBI37_gene_loc[, c('entrez_id', 'gene_name')],
  by = "gene_name",
  all.x = TRUE
)
neighborhood_controls_df_tmp <- merge(
  x = neighborhood_controls_df,
  y = NCBI37_gene_loc[, c('entrez_id', 'gene_name')],
  by = "gene_name",
  all.x = TRUE
)

# How many genes could not be mapped?
sum(is.na(neighborhood_cases_df_tmp$entrez_id))
sum(is.na(neighborhood_controls_df_tmp$entrez_id))

# Drop the unmapped genes.
neighborhood_cases_df_tmp <- neighborhood_cases_df_tmp[
  !is.na(neighborhood_cases_df_tmp$entrez_id),
]
neighborhood_controls_df_tmp <- neighborhood_controls_df_tmp[
  !is.na(neighborhood_controls_df_tmp$entrez_id),
]



In [None]:
# Size of the mapped cases neighbourhood table after dropping unmapped genes.
dim(neighborhood_cases_df_tmp)

In [None]:
# Gene-set annotation table for the final MAGMA step: column 1 is the set
# label ('AIN' for cases, 'UIN' for controls), column 2 the Entrez ID.
set_annot_file_networks_neighborhood <- data.frame(
  rep(
    c('AIN', 'UIN'),
    times = c(nrow(neighborhood_cases_df_tmp), nrow(neighborhood_controls_df_tmp))
  ),
  c(neighborhood_cases_df_tmp$entrez_id, neighborhood_controls_df_tmp$entrez_id)
)
# Strip the column names from the table.
names(set_annot_file_networks_neighborhood) <- NULL

In [None]:
# Write the set annotation table as a space-separated text file for MAGMA.
# NOTE(review): write.table() defaults (row.names = TRUE, quote = TRUE) are in
# effect, so the file presumably carries a leading row-name column and quoted
# set labels -- confirm this matches the column layout passed to MAGMA.
write.table(set_annot_file_networks_neighborhood, '/grehawi/splice-reg-prj/new-data/MAGMA/set_annot_file_SuperMasterNode_neighborhood.txt', sep= ' ')


In [None]:
# Read MAGMA results

options(repr.plot.width=12, repr.plot.height=6)
# List of the MAGMA gene-set analysis result files.
# list.files() expects a REGULAR EXPRESSION, not a glob; glob2rx() converts the
# intended glob "*.gsa.out" into an anchored regex so that only files whose
# name ends in ".gsa.out" are picked up.
files <- list.files(path = "/grehawi/splice-reg-prj/new-data/MAGMA/output/gene_set_analysis_SuperMasterNode_neighbors_output", pattern = glob2rx("*.gsa.out"), full.names = TRUE)

# Initialize an empty list to store one result table per phenotype
result_list <- list()

# Loop over each file to load and process the data
for (file in files) {
    # Read each MAGMA result file
    data <- read.table(file, header = TRUE)
    
    # Phenotype label = file name up to the first underscore
    phenotype <- strsplit(basename(file), split = "_")[[1]][1]
    result_list[[phenotype]] <- data
    result_list[[phenotype]]$TYPE = phenotype
    # Effect size magnitude and sign, taken from the BETA column
    result_list[[phenotype]]$abs_beta = abs(data$BETA)
    result_list[[phenotype]]$beta_direction = ifelse(data$BETA >= 0, "Positive", "Negative")
    # BH adjustment of the P column, applied within each file
    result_list[[phenotype]]$adjP_val = p.adjust(data$P, method = "BH")
    result_list[[phenotype]]$`-log10P` = 0 - log10(result_list[[phenotype]]$adjP_val)
}

# Stack the per-phenotype tables row-wise into one data frame
combined_data = do.call(rbind, result_list) 

# Dot plot: point size = |beta|, fill = -log10(adjusted p), shape = sign of beta.
# NOTE(review): the x-axis labels are assigned by position and assume the two
# VARIABLE levels sort as AIN then UIN -- confirm.
pdf("/grehawi/splice-reg-prj/Figures/dotplot_MAGMA_top2_MasterHubs.pdf", width = 18, height = 18)
ggplot(combined_data, aes(x = VARIABLE, y = TYPE)) +
  geom_point(aes(size = abs_beta, fill = `-log10P`, shape = beta_direction)) +
  scale_shape_manual(values = c("Negative" = 25, "Positive" = 21)) +
  scale_fill_gradient(low = "blue", high = "red") + xlab("") + ylab("") + scale_x_discrete(labels=c("Top 2 Master Hubs Neighbours-AIN", "Top 2 Master Hubs Neighbours-UIN")) +
                    theme_bw() + coord_flip() + theme(legend.text=element_text(size=18),
                                       axis.text.x = element_text(angle = 45, hjust = 1),
                                       legend.title=element_text(size=18), axis.text = element_text(size = 22),
                                       axis.title = element_text(size = 22))
dev.off()

In [None]:
# Supplementary table: combined MAGMA results for all phenotypes.
# The target is a .csv file, so write comma-separated values (write.table()
# defaults to a single space as separator, which CSV readers mis-parse).
write.table(combined_data, '/grehawi/splice-reg-prj/new-data/ARACNE/supp_table_MAGMA_top2_MasterHubs.csv', sep = ",", row.names = FALSE)


In [None]:
# Plot the direct connections of a node in the thresholded cases and controls
# networks and save each plot as an SVG in the project's Figures directory
# ("cases_<node_name>.svg" and "controls_<node_name>.svg").
#
# Arguments:
#   node_name  name of the node whose edges are extracted (matched against
#              both endpoint columns, node1 and node2).
#   node_type  'gene' draws vertices in orange ('#E7872B'); any other value
#              (e.g. 'IR') draws them in blue ('#3A68AE').
#
# Returns: list(cases_connections, controls_connections) -- the edge tables
#          that were plotted.
#
# Reads the globals cases.annotated.filtered.single.edges.thr and
# controls.annotated.filtered.single.edges.thr.
plot_graph = function (node_name, node_type='gene'){
    # Rows of an edge table in which either endpoint is the requested node.
    edges_touching_node <- function(edges) {
        edges[edges$node1 %in% node_name | edges$node2 %in% node_name, ]
    }
    cases_connections = edges_touching_node(cases.annotated.filtered.single.edges.thr)
    controls_connections = edges_touching_node(controls.annotated.filtered.single.edges.thr)
    cases_connections_g <- graph_from_data_frame(cases_connections, directed = FALSE)
    controls_connections_g <- graph_from_data_frame(controls_connections, directed = FALSE)

    # The two node types differ only in the vertex colour.
    vertex_colour <- if (node_type == 'gene') '#E7872B' else '#3A68AE'

    # Draw one graph into one SVG file; the device is closed even if plot() fails.
    save_graph_svg <- function(graph, svg_path) {
        svg(svg_path, width=16, height=12)
        on.exit(dev.off(), add = TRUE)
        # igraph only recognises plot parameters carrying the 'vertex.' /
        # 'edge.' prefix; the unprefixed label.family / label.color /
        # arrow.width used before were not applied to the graph.
        plot(graph, vertex.color = vertex_colour, edge.width = 1.6,
             vertex.label.family = 'arial', vertex.label.color = 'black',
             edge.arrow.width = 3)
    }

    # Both files go to the same Figures directory (the controls path of the
    # non-gene branch previously lacked the '/grehawi' prefix).
    save_graph_svg(cases_connections_g,
                   paste0("/grehawi/splice-reg-prj/Figures/cases_", node_name, ".svg"))
    save_graph_svg(controls_connections_g,
                   paste0("/grehawi/splice-reg-prj/Figures/controls_", node_name, ".svg"))

    return(list(cases_connections, controls_connections))   
}

In [None]:
# Plot and collect the direct connections of the four selected master hubs.
# Each result is list(cases edge table, controls edge table).
connections_RC3H1202   <- plot_graph("RC3H1-202", node_type = 'IR')
connections_SRSF6201   <- plot_graph("SRSF6-201", node_type = "IR")
connections_MEGF9      <- plot_graph("MEGF9", node_type = 'gene')
connections_SYNCRIP202 <- plot_graph("SYNCRIP-202", node_type = 'IR')

In [None]:
# Annotate the nodes around RC3H1-202 in the cases network.
# Element [[1]] of the plot_graph() result is the cases edge table.
nodes_of_RC3H1202_cases <- with(connections_RC3H1202[[1]], c(node1, node2))
intersect(nodes_of_RC3H1202_cases, all_disreg_genes_trx)

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes_RC3H1202_cases <- remove_after_dash(nodes_of_RC3H1202_cases)
intersect(names_of_nodes_RC3H1202_cases, tf.gene.links$Name.TF)
intersect(names_of_nodes_RC3H1202_cases, all.SF)

# Hubs are matched on the full node name.
intersect(nodes_of_RC3H1202_cases, hubs.with.at.least.degree.10.cases)

In [None]:
# Annotate the nodes around RC3H1-202 in the controls network.
# Element [[2]] of the plot_graph() result is the controls edge table.
nodes_of_RC3H1202_controls <- with(connections_RC3H1202[[2]], c(node1, node2))
intersect(nodes_of_RC3H1202_controls, all_disreg_genes_trx)

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes_RC3H1202_controls <- remove_after_dash(nodes_of_RC3H1202_controls)
intersect(names_of_nodes_RC3H1202_controls, tf.gene.links$Name.TF)
intersect(names_of_nodes_RC3H1202_controls, all.SF)

# Hubs are matched on the full node name.
intersect(nodes_of_RC3H1202_controls, hubs.with.at.least.degree.10.controls)

In [None]:
# Nodes connected to RC3H1-202 in both networks (the hub itself appears here
# whenever it has edges in both).
intersect(nodes_of_RC3H1202_controls, nodes_of_RC3H1202_cases)

In [None]:
# Annotate the nodes around SRSF6-201 in the cases network.
# Element [[1]] of the plot_graph() result is the cases edge table.
nodes_of_SRSF6201_cases <- with(connections_SRSF6201[[1]], c(node1, node2))
intersect(nodes_of_SRSF6201_cases, all_disreg_genes_trx)

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes <- remove_after_dash(nodes_of_SRSF6201_cases)
intersect(names_of_nodes, tf.gene.links$Name.TF)
intersect(names_of_nodes, all.SF)

# Hubs are matched on the full node name.
intersect(nodes_of_SRSF6201_cases, hubs.with.at.least.degree.10.cases)

In [None]:
# Annotate the nodes around SRSF6-201 in the controls network.
# Element [[2]] of the plot_graph() result is the controls edge table.
nodes_of_SRSF6201_controls <- with(connections_SRSF6201[[2]], c(node1, node2))
intersect(nodes_of_SRSF6201_controls, all_disreg_genes_trx)

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes <- remove_after_dash(nodes_of_SRSF6201_controls)
#intersect(names_of_nodes, tf.gene.links$Name.TF) -->NONE
intersect(names_of_nodes, all.SF)

# Hubs are matched on the full node name.
intersect(nodes_of_SRSF6201_controls, hubs.with.at.least.degree.10.controls)

In [None]:
# Nodes connected to SRSF6-201 in both networks.
intersect(nodes_of_SRSF6201_controls, nodes_of_SRSF6201_cases)

In [None]:
# Annotate the nodes around MEGF9 in the cases network.
# Element [[1]] of the plot_graph() result is the cases edge table.
nodes_of_MEGF9_cases <- with(connections_MEGF9[[1]], c(node1, node2))
#intersect(nodes_of_MEGF9_cases, all_disreg_genes_trx) -->NONE

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes <- remove_after_dash(nodes_of_MEGF9_cases)
intersect(names_of_nodes, tf.gene.links$Name.TF)
#intersect(names_of_nodes, all.SF) -->NONE

# Hubs are matched on the full node name.
intersect(nodes_of_MEGF9_cases, hubs.with.at.least.degree.10.cases)

In [None]:
# Annotate the nodes around MEGF9 in the controls network.
# Element [[2]] of the plot_graph() result is the controls edge table.
nodes_of_MEGF9_controls <- with(connections_MEGF9[[2]], c(node1, node2))
intersect(nodes_of_MEGF9_controls, all_disreg_genes_trx)

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes <- remove_after_dash(nodes_of_MEGF9_controls)
intersect(names_of_nodes, tf.gene.links$Name.TF)
#intersect(names_of_nodes, all.SF) -->NONE

# Hubs are matched on the full node name.
intersect(nodes_of_MEGF9_controls, hubs.with.at.least.degree.10.controls)

In [None]:
# Nodes connected to MEGF9 in both networks.
intersect(nodes_of_MEGF9_controls, nodes_of_MEGF9_cases)

In [None]:
# Annotate the nodes around SYNCRIP-202 in the cases network.
# Element [[1]] of the plot_graph() result is the cases edge table.
nodes_of_SYNCRIP202_cases <- with(connections_SYNCRIP202[[1]], c(node1, node2))
#intersect(nodes_of_SYNCRIP202_cases, all_disreg_genes_trx) -->NONE

# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes_SYNCRIP202_cases <- remove_after_dash(nodes_of_SYNCRIP202_cases)
intersect(names_of_nodes_SYNCRIP202_cases, tf.gene.links$Name.TF)
intersect(names_of_nodes_SYNCRIP202_cases, all.SF)

# Hubs are matched on the full node name.
intersect(nodes_of_SYNCRIP202_cases, hubs.with.at.least.degree.10.cases)

In [None]:
# Annotate the nodes around SYNCRIP-202 in the controls network.
# Element [[2]] of the plot_graph() result is the controls edge table.
nodes_of_SYNCRIP202_controls = c(connections_SYNCRIP202[[2]]$node1, connections_SYNCRIP202[[2]]$node2)
intersect(nodes_of_SYNCRIP202_controls, all_disreg_genes_trx)
# TF / SF lists are matched on the gene name (isoform suffix removed).
names_of_nodes_SYNCRIP202_controls = remove_after_dash(nodes_of_SYNCRIP202_controls)
intersect(names_of_nodes_SYNCRIP202_controls, tf.gene.links$Name.TF)
intersect(names_of_nodes_SYNCRIP202_controls, all.SF)
# Controls neighbours must be compared against the CONTROLS hub list, as in
# every other controls cell (this previously used the cases hub list).
intersect(nodes_of_SYNCRIP202_controls, hubs.with.at.least.degree.10.controls)

In [None]:
# Nodes connected to SYNCRIP-202 in both networks.
intersect(nodes_of_SYNCRIP202_controls, nodes_of_SYNCRIP202_cases)

### Visualizing the big network of cases and controls

In [None]:
# Scratch example: a 100 x 100 lattice graph and its grid layout.
# The layout is kept in its own variable; previously it overwrote the graph `g`.
g <- make_lattice(c(100, 100))
g_layout <- layout_on_grid(g)
# NOTE(review): the original third statement, `g_networx = g.to_networkx()`,
# is Python (python-igraph) method-call syntax. In R it is parsed as a call to
# a function literally named `g.to_networkx`, which does not exist, so the
# cell stopped with an error. It is left disabled here; NetworkX conversion is
# not available from R igraph.
# g_networx = g.to_networkx()

In [None]:
# Full thresholded cases network: plot its largest connected component and
# highlight the nodes KCTD12 and TNFRSF1B (colour #94C47D).
options(repr.plot.width = 24, repr.plot.height = 16)

# Undirected graph from the complete cases edge table.
cases_network_full <- graph_from_data_frame(
  cases.annotated.filtered.single.edges.thr,
  directed = FALSE
)

# Restrict the plot to the largest connected component.
lcc_cases <- largest_component(cases_network_full, mode = "strong")

# Per-vertex colour, stored in the vertex attribute 'type':
#   highlighted nodes -> green, transcript (IR) nodes -> blue, all others -> orange.
cases_lcc_node_annot_df <- as.data.frame(V(lcc_cases)$name)
names(cases_lcc_node_annot_df) <- 'name'
cases_lcc_node_annot_df <- cases_lcc_node_annot_df %>%
  mutate(
    type = case_when(
      name %in% c("TNFRSF1B", "KCTD12") ~ "#94C47D",
      name %in% tx.ids.names$name ~ "#3A68AE",
      TRUE ~ "#E7872B"
    )
  )
V(lcc_cases)$type <- cases_lcc_node_annot_df$type

# Only the two highlighted nodes get a label; vertices with degree >= 10 are drawn larger.
pdf("/grehawi/splice-reg-prj/Figures/cases_network_full_lcc.pdf", width = 18, height = 16)
cases_full_plot <- plot(
  lcc_cases,
  vertex.color = V(lcc_cases)$type,
  vertex.size = ifelse(degree(lcc_cases) >= 10, 10, 2),
  vertex.label = ifelse(V(lcc_cases)$name %in% c("KCTD12", "TNFRSF1B"), V(lcc_cases)$name, NA),
  vertex.label.cex = 1,
  vertex.label.color = 'black',
  edge.width = 1.6
)
dev.off()

In [None]:
# Full thresholded controls network: same plot as for the cases network.
# Undirected graph from the complete controls edge table.
controls_network_full <- graph_from_data_frame(
  controls.annotated.filtered.single.edges.thr,
  directed = FALSE
)

# Restrict the plot to the largest connected component.
lcc_controls <- largest_component(controls_network_full, mode = "strong")

# Per-vertex colour, stored in the vertex attribute 'type':
#   highlighted nodes -> green, transcript (IR) nodes -> blue, all others -> orange.
controls_lcc_node_annot_df <- as.data.frame(V(lcc_controls)$name)
names(controls_lcc_node_annot_df) <- 'name'
controls_lcc_node_annot_df <- controls_lcc_node_annot_df %>%
  mutate(
    type = case_when(
      name %in% c("TNFRSF1B", "KCTD12") ~ "#94C47D",
      name %in% tx.ids.names$name ~ "#3A68AE",
      TRUE ~ "#E7872B"
    )
  )
V(lcc_controls)$type <- controls_lcc_node_annot_df$type

# Only the two highlighted nodes get a label; vertices with degree >= 10 are drawn larger.
pdf("/grehawi/splice-reg-prj/Figures/controls_network_full_lcc.pdf", width = 18, height = 16)
plot(
  lcc_controls,
  vertex.color = V(lcc_controls)$type,
  vertex.size = ifelse(degree(lcc_controls) >= 10, 10, 2),
  vertex.label = ifelse(V(lcc_controls)$name %in% c("KCTD12", "TNFRSF1B"), V(lcc_controls)$name, NA),
  vertex.label.cex = 1,
  vertex.label.color = 'black',
  edge.width = 1.6
)
dev.off()