In [None]:
#library(xlsx)
library(ggplot2)
library(rtracklayer)
library(dplyr)
library(tidyr)
library(data.table)
library(matrixStats)
library(igraph)
library(purrr)
library(stringr)
library(VennDiagram)
library(TIN)
# Set the `repr.plot.width` / `repr.plot.height` options used for plot sizing.
#   x: value stored in `repr.plot.width`  (default 4)
#   y: value stored in `repr.plot.height` (default 5)
# Returns (invisibly) the previous values of both options, as `options()` does.
psize <- function(x = 4, y = 5) {
  options(list(repr.plot.width = x, repr.plot.height = y))
}

### Functions

In [None]:
# Strip the first '-' and everything that follows it from each string.
#   x: character vector
# Returns a character vector of the same length; strings without a '-' are
# returned unchanged.
remove_after_dash <- function(x) {
  sub(pattern = "-.*$", replacement = "", x = x)
}

In [None]:
get_intersection_with_known_TFTarget <- function (df) {
    # Match every (node1, node2) edge of `df` against the TF -> target pairs of
    # the global `tf.gene.links` table (columns Name.TF / Name.Target).
    #   df: edge table with at least the columns `node1` and `node2`
    # Returns the joined rows that contain no NA in any column, i.e. edges with
    # a match in `tf.gene.links` and otherwise complete annotation.
    joined.edges <- left_join(
        df,
        tf.gene.links,
        by = c("node1" = "Name.TF", "node2" = "Name.Target")
    )
    # Unmatched edges carry NA in the joined columns; drop every incomplete row
    is.complete <- complete.cases(joined.edges)
    joined.edges[is.complete, ]
}

In [None]:
get_intersection_with_known_SF <- function (intersection_set, df) {
    # Find which entries of `intersection_set` (e.g. known splicing factors)
    # occur as a node of the network.
    #   intersection_set: vector of names to look for
    #   df: edge table with the columns `node1` and `node2`
    # Returns the distinct network nodes that are also in `intersection_set`.
    network.nodes <- unique(c(df$node1, df$node2))
    intersect(network.nodes, intersection_set)
}

In [None]:
# Min-max rescale a numeric vector: (x - min) / (max - min).
# The result is NaN for every element when all values of `x` are equal, and
# NA throughout when `x` contains NA (min/max are taken without na.rm).
normalize <- function(x) {
  lowest <- min(x)
  highest <- max(x)
  (x - lowest) / (highest - lowest)
}

In [None]:
process_aracne_net <- function(dt) {
    # Convert an ARACNE result table (wide format) into a long edge list.
    #   dt: table whose column `V1` holds the first node and whose remaining
    #       columns hold one value per second node
    #       (NOTE(review): presumably a square adjacency matrix - confirm)
    # Returns a data frame with the columns node1, node2, value that holds the
    # non-zero entries only, sorted by decreasing value.

    # reshape to long format: one row per (V1, column) combination
    net.df <- as.data.frame(dt)
    long.net <- melt(net.df)
    long.net$variable <- as.character(long.net$variable)
    #long.net$value = normalize(long.net$value)

    # a value of 0 means "no edge"
    has.edge <- long.net$value != 0
    long.net <- long.net[has.edge, ]

    # strongest edges first
    by.decreasing.value <- order(-(long.net$value))
    long.net <- long.net[by.decreasing.value, ]

    # name the two endpoint columns
    dplyr::rename(long.net, node1 = V1, node2 = variable)
}

In [None]:
get_network_with_ids <- function(df) {
    # Translate both endpoints of every edge from names to ids using the
    # global lookup table `genes.trxs.ids.names.map` (columns `name`, `id`).
    #   df: edge table with the columns node1, node2 (names) and value
    # Returns a table with the columns node1, node2 (ids) and value; an
    # endpoint without a match in the lookup table becomes NA.
    id.map <- genes.trxs.ids.names.map
    with.first.id <- left_join(df, id.map, by = c("node1" = "name"))
    with.both.ids <- left_join(with.first.id, id.map, by = c("node2" = "name"))
    # id.x / id.y are the ids matched to node1 / node2 respectively
    select(with.both.ids, node1 = id.x, node2 = id.y, value)
}

In [None]:
get_network_with_names <- function(df) {
    # Translate both endpoints of every edge from ids back to names using the
    # global lookup table `genes.trxs.ids.names.map` (columns `id`, `name`).
    #   df: edge table with the columns node1, node2 (ids), value and edgeType
    # Returns a table with the columns node1, node2 (names), value, edgeType;
    # an endpoint without a match in the lookup table becomes NA.
    name.map <- genes.trxs.ids.names.map
    with.first.name <- left_join(df, name.map, by = c("node1" = "id"))
    with.both.names <- left_join(with.first.name, name.map, by = c("node2" = "id"))
    # name.x / name.y are the names matched to node1 / node2 respectively
    select(with.both.names, node1 = name.x, node2 = name.y, value, edgeType)
}

In [None]:
remove_double_edges <- function(df) {
    # Collapse an undirected edge list so that every unordered node pair
    # occurs once: A-B and B-A are treated as the same edge.
    #   df: edge table with the columns `node1` and `node2`
    # Returns the rows of `df` for the first occurrence of each node pair
    # (so if `df` is sorted by decreasing value, the strongest copy is kept).
    #
    # The key is built from the two node columns only, joined with a tab.
    # Pasting all columns without a separator (as done previously) lets
    # distinct pairs collide ("ab","c" vs "a","bc") and makes the result
    # depend on how the numeric value column is formatted as text.
    first.node <- as.character(df$node1)
    second.node <- as.character(df$node2)
    pair.key <- paste(
        pmin(first.node, second.node),
        pmax(first.node, second.node),
        sep = "\t"
    )
    df[!duplicated(pair.key), ]
}

In [None]:
annotate_edges <- function (df) {
    # Add an `edgeType` column describing what kind of features an edge links.
    # A node id containing "ENSG" is treated as a gene (TE) and one containing
    # "ENST" as a transcript (IR):
    #   gene - gene              -> "TE-TE"
    #   transcript - transcript  -> "IR-IR"
    #   gene - transcript        -> "TE-IR"
    #   anything else            -> "IR-TE" (also used when an id matches
    #                               neither pattern, e.g. NA)
    is_gene <- function(ids) grepl("ENSG", ids)
    is_transcript <- function(ids) grepl("ENST", ids)

    edge_type_of <- function(a, b) {
        ifelse(
            is_gene(a) & is_gene(b), "TE-TE",
            ifelse(
                is_transcript(a) & is_transcript(b), "IR-IR",
                ifelse(is_gene(a) & is_transcript(b), "TE-IR", "IR-TE")
            )
        )
    }

    mutate(df, edgeType = edge_type_of(node1, node2))
}



In [None]:
filter_edges <- function(df, trx.genes) {
    # Remove edges that connect two features of the same gene.
    #   df:        edge table with the columns node1, node2 and edgeType
    #              ("IR-IR", "TE-IR", "IR-TE" or "TE-TE"); transcript nodes are
    #              transcript ids, gene nodes are gene ids
    #   trx.genes: mapping table with the columns `transcript_id` and `gene_id`
    # Returns the remaining edges with the same columns as `df`, grouped in the
    # order IR-IR, TE-IR, IR-TE, TE-TE (TE-TE edges are never filtered).
    #
    # The parent gene is looked up with match(), so a transcript listed more
    # than once in `trx.genes` does not multiply edges. An edge whose
    # transcript has no mapping is kept: the comparison is done NA-safely,
    # because an NA in a logical row index would otherwise yield an all-NA row.
    map.transcripts <- as.character(trx.genes$transcript_id)
    map.genes <- as.character(trx.genes$gene_id)

    # parent gene id of each transcript id (NA when the transcript is unmapped)
    gene_of <- function(trx.ids) {
        map.genes[match(as.character(trx.ids), map.transcripts)]
    }
    # TRUE only where both gene ids are known and identical
    same_gene <- function(gene.a, gene.b) {
        !is.na(gene.a) & !is.na(gene.b) & gene.a == gene.b
    }
    # which() ignores rows whose edgeType is NA
    edges_of_type <- function(type) df[which(df$edgeType == type), ]

    # transcript - transcript: compare the parent genes of both ends
    ir.ir <- edges_of_type("IR-IR")
    ir.ir <- ir.ir[!same_gene(gene_of(ir.ir$node1), gene_of(ir.ir$node2)), ]

    # gene - transcript: node1 is the gene, node2 the transcript
    te.ir <- edges_of_type("TE-IR")
    te.ir <- te.ir[!same_gene(as.character(te.ir$node1), gene_of(te.ir$node2)), ]

    # transcript - gene: node1 is the transcript, node2 the gene
    ir.te <- edges_of_type("IR-TE")
    ir.te <- ir.te[!same_gene(gene_of(ir.te$node1), as.character(ir.te$node2)), ]

    te.te <- edges_of_type("TE-TE")

    rbind(ir.ir, te.ir, ir.te, te.te)
}

In [None]:
get_igraph_obj <- function(df){
  # Build an undirected igraph object from an edge table.
  #   df: edge table with the columns node1, node2 and value
  # Returns the graph; `value` is stored as an edge attribute and the vertex
  # set is the union of all node1 / node2 entries.
  edge.list <- data.frame(from=df$node1,
                          to=df$node2,
                          value=df$value)
  node.names <- unique(c(df$node1, df$node2))
  graph_from_data_frame(edge.list, directed=FALSE, vertices=node.names)
}

In [None]:
get_betweenness <- function(graph_obj){
  # Betweenness centrality of every vertex, edge weights ignored.
  #   graph_obj: igraph object
  # Returns a data.table with the columns `node` and `betweenness`, sorted by
  # decreasing betweenness.
  # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
  scores <- betweenness(graph_obj, directed=TRUE, weights=NA)

  # t(t(.)) turns the named vector into a one-column matrix so that the
  # vertex names survive as row names and the column is called V1
  ranked <- as.data.frame(t(t(sort(scores, decreasing = TRUE)))) %>%
    dplyr::rename(betweenness=V1) %>% setDT(., keep.rownames=TRUE) %>%
    dplyr::rename(node=rn)
  ranked
}

In [None]:
get_degree <- function(graph_obj){
    # Degree of every vertex, self-loops not counted.
    #   graph_obj: igraph object
    # Returns a data.table with the columns `node` and `degree`, sorted by
    # decreasing degree.
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
    degrees <- degree(graph_obj, v = V(graph_obj), loops=FALSE)
    # t(t(.)) turns the named vector into a one-column matrix so that the
    # vertex names survive as row names and the column is called V1
    ranked <- as.data.frame(t(t(sort(degrees, decreasing = TRUE)))) %>%
        dplyr::rename(degree=V1) %>% setDT(., keep.rownames=TRUE) %>%
        dplyr::rename(node=rn)
    ranked
}

### Parse all known splicing factors from collected resources

In [None]:
#library(readxl)
# Collect splicing factors (SF) from four resources into one vector, `all.SF`.

# 1) SpliceAid: 67 unique splicing factors
sf.targets.links <- fread("/grehawi/splice-reg-prj/data/splicing-factors.txt", sep="\t")

# 2) `splicingFactors` data set loaded with data(): a comprehensive collection
#    of 277 unique genes involved in pre-mRNA splicing events
#    (Sveen et al., Genome Medicine, 2011, 3:32).
data(splicingFactors)
splicingFactors$GeneSymbol <- as.character(splicingFactors$GeneSymbol)

# 3) 406 splicing factors from Seiler et al. 2018: https://pubmed.ncbi.nlm.nih.gov/29617667/
SF.Seiler <- fread("/grehawi/splice-reg-prj/data/SF-Seileretal2018.csv")
head(SF.Seiler)
length(unique(SF.Seiler$GeneSymbol))

# 4) Another SF resource, Anna et al.: https://www.biorxiv.org/content/10.1101/2020.05.20.107375v1.full
Anna.SF <- c(
  "SRSF1", "SRSF2", "SRSF3", "SRSF5", "SRSF7", "HNRNPA2B1", "HNRNPL", "HNRNPLL",
  "RBFOX2", "RBFOX3", "FUS", "SNRNP70", "TRA2A", "TRA2B", "TIA1", "PTBP1",
  "PTBP2", "RBM10", "RBM5"
)

# unique gene symbols of the Seiler et al. table
SF.Seiler.df <- as.data.frame(SF.Seiler)
SF.alone.seiler <- unique(SF.Seiler.df$GeneSymbol)
length(SF.alone.seiler)

# union of all four resources
all.SF <- unique(c(sf.targets.links$Gene, splicingFactors$GeneSymbol, SF.alone.seiler, Anna.SF))
length(all.SF)

### Parse known TF-target connections from TFLink

In [None]:
# TFLink table of known TF -> target interactions (tab-separated);
# the columns Name.TF / Name.Target are used for matching network edges.
tf.gene.links <- fread("/grehawi/splice-reg-prj/data/TFLink_Homo_sapiens_interactions_All_simpleFormat_v1.0.tsv", sep="\t")
head(tf.gene.links)
# size of the full table, and of the part without any NA
dim(tf.gene.links)
dim(tf.gene.links[complete.cases(tf.gene.links),])
# number of distinct targets and of distinct TFs
length(unique(tf.gene.links$Name.Target))
length(unique(tf.gene.links$Name.TF))

### Read required files

In [None]:
# Transcript -> gene mapping table; filter_edges() reads its columns
# `transcript_id` and `gene_id`.
trx.genes <- read.table("/grehawi/splice-reg-prj/data/transcriptsID-geneID.txt")
head(trx.genes)

In [None]:
# Gene names / ids table (NOTE(review): presumably one row per gene - confirm)
genes.ids.names <- read.table("/grehawi/splice-reg-prj/new-data/ARACNE/gene_names_ids_table.txt")
head(genes.ids.names)
dim(genes.ids.names)

In [None]:
# Transcript names / ids table (NOTE(review): presumably one row per transcript - confirm)
tx.ids.names <- read.table("/grehawi/splice-reg-prj/new-data/ARACNE/trxs_names_ids_table.txt")
head(tx.ids.names)
dim(tx.ids.names)

In [None]:
# Combined lookup table used by get_network_with_ids() / get_network_with_names()
# through its `name` and `id` columns.
genes.trxs.ids.names.map <- read.table("/grehawi/splice-reg-prj/new-data/ARACNE/genes_trxs_ids_names_map.txt")
head(genes.trxs.ids.names.map)
dim(genes.trxs.ids.names.map)

## 1. Read and process networks

In [None]:
# Read the ARACNE output of the controls and of the cases (as data.tables)
aracne.net.controls <- fread(
  file="/grehawi/splice-reg-prj/new-data/ARACNE/aracne_output/aracne-net-controls.txt"
)
aracne.net.cases.all <- fread(
  file="/grehawi/splice-reg-prj/new-data/ARACNE/aracne_output/aracne-net-cases.txt"
)


In [None]:
# Preview the first rows of the raw controls network
head(aracne.net.controls)

In [None]:
# Long-format edge lists (node1, node2, value): non-zero entries only,
# sorted by decreasing value
aracne.net.long.controls <- process_aracne_net(aracne.net.controls)
aracne.net.long.cases.all <- process_aracne_net(aracne.net.cases.all)

In [None]:
# Dimensions (rows x columns) of the long-format networks
dim(aracne.net.long.controls)
dim(aracne.net.long.cases.all)

In [None]:
#Reduce(setdiff,list(aracne.net.controls$V1,genes.trxs.ids.names.map$name))

# Sanity check: TRUE if any cell of the long-format tables is NA
any(is.na(aracne.net.long.controls))
any(is.na(aracne.net.long.cases.all))

In [None]:
# NOTE(review): is.null() tests the object as a whole and returns a single
# FALSE for a data frame, so these two checks cannot detect missing entries.
any(is.null(aracne.net.long.controls))
any(is.null(aracne.net.long.cases.all))

In [None]:
# Preview the strongest edges of the cases network
head(aracne.net.long.cases.all)

In [None]:
# Distribution summary of the edge values in both networks
summary(aracne.net.long.controls$value)
summary(aracne.net.long.cases.all$value)

In [None]:
# Histogram of the edge (mutual information) values of the controls network,
# written to a PDF file.
options(repr.plot.width = 8, repr.plot.height = 6)
pdf("/grehawi/splice-reg-prj/Figures/Hist_MI_Controls.pdf")
# compute the bins without drawing, then draw with custom labels
h <- hist(aracne.net.long.controls$value, plot = FALSE)
plot(
  h,
  xlab = "Mutual Information Values",
  ylab = "Frequency",
  main = "Histogram of The Mutual Information Values In The Controls Network",
  col = "pink"
)
dev.off()

In [None]:
# Histogram of the edge (mutual information) values of the cases network,
# written to a PDF file.
options(repr.plot.width = 8, repr.plot.height = 6)
pdf("/grehawi/splice-reg-prj/Figures/Hist_MI_Cases.pdf")
# compute the bins without drawing, then draw with custom labels
h <- hist(aracne.net.long.cases.all$value, plot = FALSE)
plot(
  h,
  xlab = "Mutual Information Values",
  ylab = "Frequency",
  main = "Histogram of The Mutual Information Values In The Cases Network",
  col = "pink"
)
dev.off()

In [None]:
# Keep one row per undirected edge (A-B and B-A are the same edge)
aracne.net.long.controls.single.edges <- remove_double_edges(aracne.net.long.controls)
aracne.net.long.cases.all.single.edges <- remove_double_edges(aracne.net.long.cases.all)

In [None]:
# Preview the de-duplicated cases network
head(aracne.net.long.cases.all.single.edges)

In [None]:
# Total number of edges (rows) in the networks, before filtering
dim(aracne.net.long.controls.single.edges)
dim(aracne.net.long.cases.all.single.edges)

In [None]:
# Total number of distinct nodes in the networks, before filtering
length(unique(c(aracne.net.long.controls.single.edges$node1, aracne.net.long.controls.single.edges$node2)))
length(unique(c(aracne.net.long.cases.all.single.edges$node1, aracne.net.long.cases.all.single.edges$node2)))


## 1.1 Network annotation and filtering

In [None]:
# Replace the node names of both networks by their ids
controls.withIds <- get_network_with_ids(aracne.net.long.controls.single.edges)
cases.all.withIds <- get_network_with_ids(aracne.net.long.cases.all.single.edges)
dim(controls.withIds)
dim(cases.all.withIds)
head(cases.all.withIds)

In [None]:
# Label every edge with its edge type (TE-TE, IR-IR, TE-IR or IR-TE)
annotated.controls <- annotate_edges(controls.withIds)
annotated.cases <- annotate_edges(cases.all.withIds)

# Remove edges that connect features of the same gene (IR-IR, TE-IR, IR-TE)
annotated.filtered.controls <- filter_edges(annotated.controls, trx.genes)
annotated.filtered.cases <- filter_edges(annotated.cases, trx.genes)

In [None]:
# Total number of distinct nodes in the networks after filtering

length(unique(c(annotated.filtered.controls$node1, annotated.filtered.controls$node2)))
length(unique(c(annotated.filtered.cases$node1, annotated.filtered.cases$node2)))

In [None]:
# Preview the filtered controls network
head(annotated.filtered.controls)

In [None]:
# Number of edges (rows) remaining after filtering
dim(annotated.filtered.controls)
dim(annotated.filtered.cases)

In [None]:
# Number of edges per edge type after filtering
table(annotated.filtered.controls$edgeType)
table(annotated.filtered.cases$edgeType)

In [None]:
# Translate the node ids of the filtered networks back to names
annotated.filtered.cases.withNames.singleE <- get_network_with_names(annotated.filtered.cases)
annotated.filtered.controls.withNames.singleE <- get_network_with_names(annotated.filtered.controls)


In [None]:
# Save the filtered, named, single-edge networks (write.table defaults:
# quoted strings, row names included)
write.table(
  annotated.filtered.cases.withNames.singleE,
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_cases_withNames_singleEdges.txt'
)
write.table(
  annotated.filtered.controls.withNames.singleE,
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_controls_withNames_singleEdges.txt'
)


In [None]:
# Edge MI histograms per edge type for the controls network, after filtering
#pdf("/grehawi/splice-reg-prj/Figures/Edgelevel_Hist_MI_Controls.pdf")

controls.annotated.filtered.single.edges.TETE <- annotated.filtered.controls.withNames.singleE[annotated.filtered.controls.withNames.singleE$edgeType == 'TE-TE',]
# annotate_edges() labels a gene-transcript edge 'TE-IR' or 'IR-TE' depending
# on which end is stored in node1; both labels belong to the mixed class, so
# select both (matching 'IR-TE' alone leaves out any 'TE-IR' edges)
controls.annotated.filtered.single.edges.TEIR <- annotated.filtered.controls.withNames.singleE[annotated.filtered.controls.withNames.singleE$edgeType %in% c('TE-IR', 'IR-TE'),]
controls.annotated.filtered.single.edges.IRIR <- annotated.filtered.controls.withNames.singleE[annotated.filtered.controls.withNames.singleE$edgeType == 'IR-IR',]

# semi-transparent blue for the IR-IR histogram
col1 <- rgb(0,0,1,1/4)

# First histogram (TE-TE); it creates the plot the other two are added to
hist(controls.annotated.filtered.single.edges.TETE$value,
     xlab = "Mutual Information Values",
     col = "skyblue",
     border = "black",
     cex.axis = 1.2,
     cex.lab = 1.5)


# Add the IR-IR histogram
hist(controls.annotated.filtered.single.edges.IRIR$value, add = TRUE, col = col1, border = "black")

# Add the TE-IR histogram on top
hist(controls.annotated.filtered.single.edges.TEIR$value, add = TRUE, col = "salmon", border = "black")

# Create a legend
legend("topright", legend = c("TE-TE", "TE-IR", "IR-IR"),
      fill = c("skyblue", "salmon" ,col1))

#dev.off()

In [None]:
# MI summary of the controls TE-TE edges
summary(controls.annotated.filtered.single.edges.TETE$value)

In [None]:
# MI summary of the controls edges held in the TEIR subset
summary(controls.annotated.filtered.single.edges.TEIR$value)

In [None]:
# MI summary of the controls IR-IR edges
summary(controls.annotated.filtered.single.edges.IRIR$value)

In [None]:
# Edge MI histograms per edge type for the cases network, after filtering
#pdf("/grehawi/splice-reg-prj/Figures/Edgelevel_Hist_MI_Cases.pdf")
cases.annotated.filtered.single.edges.TETE <- annotated.filtered.cases.withNames.singleE[annotated.filtered.cases.withNames.singleE$edgeType == 'TE-TE',]
# annotate_edges() labels a gene-transcript edge 'TE-IR' or 'IR-TE' depending
# on which end is stored in node1; both labels belong to the mixed class, so
# select both (matching 'IR-TE' alone leaves out any 'TE-IR' edges)
cases.annotated.filtered.single.edges.TEIR <- annotated.filtered.cases.withNames.singleE[annotated.filtered.cases.withNames.singleE$edgeType %in% c('TE-IR', 'IR-TE'),]
cases.annotated.filtered.single.edges.IRIR <- annotated.filtered.cases.withNames.singleE[annotated.filtered.cases.withNames.singleE$edgeType == 'IR-IR',]

# semi-transparent blue for the IR-IR histogram
col1 <- rgb(0,0,1,1/4)
# First histogram (TE-TE); it creates the plot the other two are added to
hist(cases.annotated.filtered.single.edges.TETE$value,
     xlab = "Mutual Information Values",
     col = "skyblue",
     border = "black",
     cex.axis = 1.2,
     cex.lab = 1.5)

# Add the IR-IR histogram
hist(cases.annotated.filtered.single.edges.IRIR$value, add = TRUE, col = col1, border = "black")

# Add the TE-IR histogram on top
hist(cases.annotated.filtered.single.edges.TEIR$value, add = TRUE, col = "salmon", border = "black")

# Create a legend
legend("topright", legend = c("TE-TE", "TE-IR", "IR-IR"),
      fill = c("skyblue", "salmon" ,col1))
#dev.off()

In [None]:
# MI summary of the cases TE-TE edges
summary(cases.annotated.filtered.single.edges.TETE$value)

In [None]:
# MI summary of the cases edges held in the TEIR subset
summary(cases.annotated.filtered.single.edges.TEIR$value)

In [None]:
# MI summary of the cases IR-IR edges
summary(cases.annotated.filtered.single.edges.IRIR$value)

## 1.2 Hub nodes analysis (before threshold filtering to compare with previous analysis)

In [None]:
# Build the graphs of the filtered (not yet thresholded) networks and
# compute the degree of every node
g.controls_before_thr <- get_igraph_obj(annotated.filtered.controls.withNames.singleE)
g.controls.degrees.before.thr <- get_degree(g.controls_before_thr)

g.cases_before_thr <- get_igraph_obj(annotated.filtered.cases.withNames.singleE)
g.cases.degrees.before.thr <- get_degree(g.cases_before_thr)

In [None]:
# Nodes with the highest degree in the controls network (table is sorted by
# decreasing degree)
head(g.controls.degrees.before.thr)

In [None]:
# Degree distribution summary of both networks
summary(g.controls.degrees.before.thr$degree)
summary(g.cases.degrees.before.thr$degree)

## 1.3 Put a MI threshold on the edges based on their edge-type

In [None]:
# Thresholds are determined based on the median from the above distribution summary.
# We take the same thresholds for the controls network as for the cases network
# in order to bring the total number of edges as well as the MI distribution per edge-type
# in both networks closer to each other.
#   TE-TE: 0.24, TE-IR: 0.14, IR-IR: 0.38

# keep the edges whose MI value is at least `min.mi`
keep_edges_with_min_mi <- function(edges, min.mi) {
  edges[edges$value >= min.mi, ]
}

# --- cases ---
cases.annotated.filtered.single.edges.TETE <- keep_edges_with_min_mi(cases.annotated.filtered.single.edges.TETE, 0.24)
cases.annotated.filtered.single.edges.TEIR <- keep_edges_with_min_mi(cases.annotated.filtered.single.edges.TEIR, 0.14)
cases.annotated.filtered.single.edges.IRIR <- keep_edges_with_min_mi(cases.annotated.filtered.single.edges.IRIR, 0.38)

cases.annotated.filtered.single.edges.thr <- rbind(
  cases.annotated.filtered.single.edges.TETE,
  cases.annotated.filtered.single.edges.TEIR,
  cases.annotated.filtered.single.edges.IRIR
)

# --- controls ---
controls.annotated.filtered.single.edges.TETE <- keep_edges_with_min_mi(controls.annotated.filtered.single.edges.TETE, 0.24)
controls.annotated.filtered.single.edges.TEIR <- keep_edges_with_min_mi(controls.annotated.filtered.single.edges.TEIR, 0.14)
controls.annotated.filtered.single.edges.IRIR <- keep_edges_with_min_mi(controls.annotated.filtered.single.edges.IRIR, 0.38)

controls.annotated.filtered.single.edges.thr <- rbind(
  controls.annotated.filtered.single.edges.TETE,
  controls.annotated.filtered.single.edges.TEIR,
  controls.annotated.filtered.single.edges.IRIR
)



In [None]:
# Number of edges (rows) left in each network after thresholding
dim(controls.annotated.filtered.single.edges.thr)
dim(cases.annotated.filtered.single.edges.thr)

In [None]:
# Edge MI histograms per edge type for the controls network, after filtering and thresholding
#pdf("/grehawi/splice-reg-prj/Figures/Edgelevel_Hist_MI_Controls_thresholded.pdf")

controls.annotated.filtered.single.edges.TETE.thr <- controls.annotated.filtered.single.edges.thr[controls.annotated.filtered.single.edges.thr$edgeType == 'TE-TE',]
# annotate_edges() labels a gene-transcript edge 'TE-IR' or 'IR-TE' depending
# on which end is stored in node1; both labels belong to the mixed class, so
# select both (matching 'IR-TE' alone leaves out any 'TE-IR' edges)
controls.annotated.filtered.single.edges.TEIR.thr <- controls.annotated.filtered.single.edges.thr[controls.annotated.filtered.single.edges.thr$edgeType %in% c('TE-IR', 'IR-TE'),]
controls.annotated.filtered.single.edges.IRIR.thr <- controls.annotated.filtered.single.edges.thr[controls.annotated.filtered.single.edges.thr$edgeType == 'IR-IR',]

# semi-transparent blue for the IR-IR histogram
col1 <- rgb(0,0,1,1/4)

# First histogram (TE-TE); it creates the plot the other two are added to
hist(controls.annotated.filtered.single.edges.TETE.thr$value,
     xlab = "Mutual Information Values",
     col = "skyblue",
     border = "black",
     cex.axis = 1.2,
     cex.lab = 1.5)


# Add the IR-IR histogram
hist(controls.annotated.filtered.single.edges.IRIR.thr$value, add = TRUE, col = col1, border = "black")

# Add the TE-IR histogram on top
hist(controls.annotated.filtered.single.edges.TEIR.thr$value, add = TRUE, col = "salmon", border = "black")

# Create a legend
legend("topright", legend = c("TE-TE", "TE-IR", "IR-IR"),
      fill = c("skyblue", "salmon" ,col1))

#dev.off()

In [None]:
# Edge MI histograms per edge type for the cases network, after filtering and thresholding
#pdf("/grehawi/splice-reg-prj/Figures/Edgelevel_Hist_MI_Cases_thresholded.pdf")
cases.annotated.filtered.single.edges.TETE.thr <- cases.annotated.filtered.single.edges.thr[cases.annotated.filtered.single.edges.thr$edgeType == 'TE-TE',]
# annotate_edges() labels a gene-transcript edge 'TE-IR' or 'IR-TE' depending
# on which end is stored in node1; both labels belong to the mixed class, so
# select both (matching 'IR-TE' alone leaves out any 'TE-IR' edges)
cases.annotated.filtered.single.edges.TEIR.thr <- cases.annotated.filtered.single.edges.thr[cases.annotated.filtered.single.edges.thr$edgeType %in% c('TE-IR', 'IR-TE'),]
cases.annotated.filtered.single.edges.IRIR.thr <- cases.annotated.filtered.single.edges.thr[cases.annotated.filtered.single.edges.thr$edgeType == 'IR-IR',]

# semi-transparent blue for the IR-IR histogram
col1 <- rgb(0,0,1,1/4)
# First histogram (TE-TE); it creates the plot the other two are added to
hist(cases.annotated.filtered.single.edges.TETE.thr$value,
     xlab = "Mutual Information Values",
     col = "skyblue",
     border = "black",
     cex.axis = 1.2,
     cex.lab = 1.5)

# Add the IR-IR histogram
hist(cases.annotated.filtered.single.edges.IRIR.thr$value, add = TRUE, col = col1, border = "black")

# Add the TE-IR histogram on top
hist(cases.annotated.filtered.single.edges.TEIR.thr$value, add = TRUE, col = "salmon", border = "black")

# Create a legend
legend("topright", legend = c("TE-TE", "TE-IR", "IR-IR"),
      fill = c("skyblue", "salmon" ,col1))
#dev.off()

In [None]:
# MI summaries per edge-type subset of the thresholded networks.
# Cases: TE-TE
summary(cases.annotated.filtered.single.edges.TETE.thr$value)

In [None]:
# Cases: TEIR subset
summary(cases.annotated.filtered.single.edges.TEIR.thr$value)

In [None]:
# Cases: IR-IR
summary(cases.annotated.filtered.single.edges.IRIR.thr$value)

In [None]:
# Controls: TE-TE
summary(controls.annotated.filtered.single.edges.TETE.thr$value)

In [None]:
# Controls: TEIR subset
summary(controls.annotated.filtered.single.edges.TEIR.thr$value)

In [None]:
# Controls: IR-IR
summary(controls.annotated.filtered.single.edges.IRIR.thr$value)

In [None]:
# Save the thresholded networks (write.table defaults: quoted strings,
# row names included)
write.table(
  cases.annotated.filtered.single.edges.thr,
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_thr_cases_withNames_singleEdges.txt'
)
write.table(
  controls.annotated.filtered.single.edges.thr,
  file = '/grehawi/splice-reg-prj/new-data/ARACNE/filtered_thr_controls_withNames_singleEdges.txt'
)
