In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import dgl

Using backend: pytorch


In [2]:
# Node file
gse = pd.read_csv('../GSE_full.csv', sep=',')

# Edge file
ppi = pd.read_csv('../PPI_net.txt', sep=' ')

In [3]:
gse.head(5)

Unnamed: 0.1,Unnamed: 0,5'nucleotidase increased,AGEP,AIDS dementia complex,AION,AML progression,ANA increased,Abasia,Abdominal abscess,Abdominal adhesions,...,Xerophthalmia,Xerosis,Yawning,Yeast infection,Yellow nail syndrome,Yellow skin,Zinc deficiency,Zollinger-Ellison syndrome,pH urine decreased,pH urine increased
0,9606.ENSP00000000233,0,1,1,1,0,0,1,0,0,...,10,2,19,0,0,2,0,0,0,0
1,9606.ENSP00000000412,0,2,1,3,1,1,1,1,0,...,12,4,32,0,0,4,0,0,0,1
2,9606.ENSP00000001008,0,2,1,3,1,1,1,1,0,...,16,4,33,0,0,4,0,0,0,1
3,9606.ENSP00000001146,0,0,0,0,0,0,0,0,0,...,1,0,3,0,0,0,0,0,0,0
4,9606.ENSP00000002125,0,0,0,1,0,0,0,0,0,...,3,0,6,0,0,0,0,0,0,0


In [4]:
ppi = ppi[['protein1', 'protein2', 'neighborhood']]
ppi['neighborhood'] = 1
ppi.head(5)

Unnamed: 0,protein1,protein2,neighborhood
0,9606.ENSP00000000233,9606.ENSP00000272298,1
1,9606.ENSP00000000233,9606.ENSP00000253401,1
2,9606.ENSP00000000233,9606.ENSP00000401445,1
3,9606.ENSP00000000233,9606.ENSP00000418915,1
4,9606.ENSP00000000233,9606.ENSP00000327801,1


In [8]:
#Nodes need to be numerical and ordered corectly, so we make a id dic
genes = gse['Unnamed: 0'].tolist()
gene_ids = {}
for i, gene in enumerate(genes):
    gene_ids[gene] = i+1
gene_ids   

{'9606.ENSP00000000233': 1,
 '9606.ENSP00000000412': 2,
 '9606.ENSP00000001008': 3,
 '9606.ENSP00000001146': 4,
 '9606.ENSP00000002125': 5,
 '9606.ENSP00000002165': 6,
 '9606.ENSP00000002596': 7,
 '9606.ENSP00000002829': 8,
 '9606.ENSP00000003084': 9,
 '9606.ENSP00000003100': 10,
 '9606.ENSP00000003302': 11,
 '9606.ENSP00000004531': 12,
 '9606.ENSP00000005178': 13,
 '9606.ENSP00000005226': 14,
 '9606.ENSP00000005257': 15,
 '9606.ENSP00000005260': 16,
 '9606.ENSP00000005284': 17,
 '9606.ENSP00000005286': 18,
 '9606.ENSP00000005340': 19,
 '9606.ENSP00000005386': 20,
 '9606.ENSP00000005587': 21,
 '9606.ENSP00000005995': 22,
 '9606.ENSP00000006015': 23,
 '9606.ENSP00000006053': 24,
 '9606.ENSP00000006275': 25,
 '9606.ENSP00000006526': 26,
 '9606.ENSP00000006658': 27,
 '9606.ENSP00000006724': 28,
 '9606.ENSP00000006777': 29,
 '9606.ENSP00000007390': 30,
 '9606.ENSP00000007414': 31,
 '9606.ENSP00000007699': 32,
 '9606.ENSP00000007722': 33,
 '9606.ENSP00000007735': 34,
 '9606.ENSP00000008391'

In [17]:
#There are 11 genes that are in GSE, so we remove them.
drop_list =[] 

for gene in ppi['protein1'].unique().tolist():
    if gene not in gene_ids:
        drop_list.append(gene)
    else: pass
drop_list

for gene in drop_list: 
    ppi.drop(ppi.loc[ppi['protein1'] == gene].index, inplace=True)
    ppi.drop(ppi.loc[ppi['protein2'] == gene].index, inplace=True)


In [18]:
#converting to gene ids
%%time
def gene2id (gene):
    return(gene_ids[gene])

gse['Unnamed: 0'] = gse.apply(lambda x: gene2id(x['Unnamed: 0']), axis=1)
ppi['protein1'] = ppi.apply(lambda x: gene2id(x['protein1']), axis=1)
ppi['protein2'] = ppi.apply(lambda x: gene2id(x['protein2']), axis=1)

Wall time: 3min 26s


In [19]:
#introducing edges
src = ppi['protein1'].to_numpy()
dst = ppi['protein2'].to_numpy()

# Create a DGL graph from a pair of numpy arrays
g = dgl.graph((src, dst))

# Print a graph gives some meta information such as number of nodes and edges.
print(g)

Graph(num_nodes=19556, num_edges=11738330,
      ndata_schemes={}
      edata_schemes={})


In [20]:
print('#Nodes', g.number_of_nodes())
print('#Edges', g.number_of_edges())

#Nodes 19556
#Edges 11738330


In [None]:
%%time
# Since the actual graph is undirected, we convert it for visualization purpose.
nx_g = g.to_networkx().to_undirected()
# Kamada-Kawaii layout usually looks pretty for arbitrary graphs
pos = nx.kamada_kawai_layout(nx_g)
nx.draw(nx_g, pos, with_labels=True, node_color=[[.7, .7, .7]])

In [None]:
'''
decagon = pd.read_csv('../original data/bio-decagon-combo.csv', sep=',')
decagon_se = decagon['Side Effect Name'].unique().tolist()
len(decagon_se)
pse = gse.columns.values.tolist()
decagon_se_lower = []
final_pse = []

for se in decagon_se:
    decagon_se_lower.append(se.lower())
    
for i,se in enumerate(pse):
    if se.lower() in decagon_se_lower:
        final_pse.append(se)
        print(i,se)
    else:
        #print(i,se)
        pass
#final_pse    
'''
#ppi.to_csv('PPI-net-clean.csv', index=False, sep = ',')