In [8]:
import networkx as nx
import pandas as pd
from networkx.algorithms import bipartite

data = pd.read_csv('DF-Miner_miner-disease-function.tsv', sep='\t')
DF_graph = nx.Graph()
df_edgedata = list(zip(data['# MESH_ID'], data['GO_ID']))
disease_nodes = list(data['# MESH_ID'])
function_nodes = list(data['GO_ID'])
DF_graph.add_edges_from(df_edgedata)
DF_graph.nodes

NodeView(('MESH:D000037', 'GO:0009257', 'MESH:C536409', 'MESH:D009436', 'MESH:D000860', 'GO:0009258', 'MESH:D008106', 'MESH:D009765', 'MESH:D056486', 'GO:0006653', 'MESH:D015785', 'MESH:D057130', 'MESH:C567636', 'MESH:D029242', 'MESH:D014802', 'MESH:C535306', 'GO:0006103', 'MESH:D000230', 'MESH:C536582', 'MESH:D000592', 'MESH:D001321', 'MESH:D001943', 'MESH:D002277', 'MESH:D044584', 'MESH:D006528', 'MESH:D002285', 'MESH:D018275', 'MESH:D018281', 'MESH:D002812', 'MESH:D018450', 'MESH:D004687', 'MESH:D004827', 'MESH:D005910', 'MESH:D006391', 'MESH:143890', 'MESH:D007119', 'MESH:D008607', 'MESH:D046150', 'MESH:C537871', 'MESH:D007888', 'MESH:D054066', 'MESH:D015470', 'MESH:D008175', 'MESH:D016399', 'MESH:D016410', 'MESH:D016411', 'MESH:D015674', 'MESH:D008325', 'MESH:D008375', 'MESH:D017202', 'MESH:D009421', 'MESH:D009837', 'MESH:D010003', 'MESH:D010024', 'MESH:D011230', 'MESH:D054198', 'MESH:D011471', 'MESH:D012174', 'MESH:C565805', 'MESH:D013274', 'MESH:C536783', 'MESH:D020176', 'MESH:D

In [9]:
import numpy as np

gene_function_data = pd.read_csv('GF-Miner_miner-gene-function.tsv', sep = '\t')
gf_edgedata = list(zip(gene_function_data['# GO_ID'], gene_function_data['Gene']))
gf_graph = nx.Graph()
gf_graph.add_edges_from(gf_edgedata)
deg_cents_gf = nx.degree_centrality(gf_graph)
function_nodes_gf = list(set(gene_function_data['# GO_ID']))

function_cents = [(i, deg_cents_gf[i]) for i in deg_cents_gf.keys() if i in function_nodes_gf]
function_cents = sorted(function_cents, key=lambda x: x[1], reverse = True)

a = np.empty([1, len(function_cents)])
for dc in range(len(function_cents)):
    np.append(a, function_cents[dc][1])

ignore_thr = np.percentile(a, 90)
ign_functions = [i[0] for i in function_cents if i[1] > ignore_thr]      # removing all nodes with higher degree centralities

print(ign_functions)

function_nodes_gf = [i for i in function_nodes_gf if i not in ign_functions]    

print('before pruning',len(df_edgedata))
df_edgedata = [i for i in df_edgedata if i[1] not in ign_functions]
print('after pruning',len(df_edgedata))



['GO:0005509', 'GO:0004672', 'GO:0005524', 'GO:0005634', 'GO:0005737', 'GO:0006468', 'GO:0005615', 'GO:0006629', 'GO:0006869', 'GO:0008047', 'GO:0042627', 'GO:0043085', 'GO:0001932', 'GO:0003677', 'GO:0005671', 'GO:0006357', 'GO:0007067', 'GO:0016573', 'GO:0031063', 'GO:0031647', 'GO:0072686', 'GO:0090043', 'GO:0007399', 'GO:0016021', 'GO:0004871', 'GO:0005834', 'GO:0007186', 'GO:0008270', 'GO:0000166', 'GO:0003676', 'GO:0005654', 'GO:0005829', 'GO:0007026', 'GO:0008017', 'GO:0031175', 'GO:0046785', 'GO:0003950', 'GO:0006302', 'GO:0006471', 'GO:0000445', 'GO:0006397', 'GO:0005212', 'GO:0001525', 'GO:0005021', 'GO:0007411', 'GO:0017154', 'GO:0038084', 'GO:0046872', 'GO:0071526', 'GO:0003723', 'GO:0003735', 'GO:0006412', 'GO:0015935', 'GO:0019843', 'GO:0000213', 'GO:0000214', 'GO:0000379', 'GO:0016829', 'GO:0090502', 'GO:0005730', 'GO:0031965', 'GO:0005525', 'GO:0005769', 'GO:0005811', 'GO:0005886', 'GO:0006897', 'GO:0010008', 'GO:0010886', 'GO:0030139', 'GO:0031095', 'GO:0032456', 'GO:0

before pruning 802760
after pruning 27607


In [10]:
DF_graph = nx.Graph()
DF_graph.add_edges_from(df_edgedata)

In [11]:
# function for getting a weighted projection of the disease gene association network. 
# this will give a specific network of disease nodes with edge weights being the number of genes shared between them 

from itertools import combinations
def get_edge_wt_projection(graph, nodes):
    
    nodes = list(set(nodes))
    wt_prj = nx.Graph()
    combs = combinations(nodes, 2)
    prj = nx.bipartite.projected_graph(graph, nodes)
    wt_prj.edges.data('weight', default=1)
    for comb in combs:
        if comb in prj.edges:
            # count how many nodes are common between the neighbourhoods of the two nodes
            n1 = set(list(graph.neighbors(comb[0]))) 
            n2 = set(list(graph.neighbors(comb[1])))
            intrsct = n1.intersection(n2)
            wt_prj.add_edge(comb[0], comb[1], weight=len(intrsct))     # add weighted edge to the graph 
    
    return wt_prj

disease_projection = get_edge_wt_projection(DF_graph, disease_nodes)


KeyError: 'MESH:D046150'

In [None]:
# finding eigenvector centralities of the nodes
disease_ev_cents = (nx.eigenvector_centrality(disease_projection, max_iter=1000)).items()

# finding degree centralities of the nodes 
disease_dg_cents = (nx.degree_centrality(disease_projection)).items()

In [None]:
# calculating the vertex strength of the nodes in the graph

adj = (nx.adjacency_matrix(disease_projection)).todense()           # todense() is for making it into an actual matrix
n = len(disease_projection.nodes)
vertex_st = {}
nodes = list(disease_projection.nodes)

for i in nodes:    
    
    vertex_st[i] = 0
    for j in nodes:
        # try except block to prevent keyerror from trying to find non-existent edge. 
        try:
            vertex_st[i] += disease_projection[i][j]['weight'] * adj[nodes.index(i), nodes.index(j)]  
        except KeyError:
            pass

vertex_st = vertex_st.items()

In [None]:
# combining all the data into a dataframe

combined_data = pd.DataFrame(columns =['disease ID', 'deg_cent','ev_cent', 'vertex_st'])
combined_data['disease ID'] = [i[0] for i in disease_ev_cents]
combined_data['deg_cent'] = [i[1] for i in disease_dg_cents]
combined_data['ev_cent'] = [i[1] for i in disease_ev_cents]
combined_data['vertex_st'] = [i[1] for i in vertex_st]
combined_data.head()

In [None]:
import matplotlib.pyplot as plt
plt.hist(combined_data['vertex_st'], bins = 100)
plt.xlabel('vertex strength')
plt.show()

# proper power law shizz
# this can also be one of the observations we write
# Think of some inferences to be drawn from the disease projection network being a power law network. 
# this also means we might get properly decent clusters (nice.)

In [None]:
# sort the edges according to the weights
sortededges = sorted(disease_projection.edges(data = True), key = lambda x: x[2]['weight'], reverse=True)
top5 = sortededges[:5]
top5

In [None]:

bottom10 = sortededges[-10:]
bottom10

# this has a lot of diseases that are not completely of genetic origin
# we need to filter out those diseases that are purely genetic in origin and the remaining ones will be the ones that are genetically inversely comorbid

In [None]:
sorted_data = pd.DataFrame(sortededges, columns = ['Disease1', 'Disease2', 'Weight'])
sorted_data.loc[sorted_data['Disease1'] == 'C0030567']