In [1]:
import networkx as nx
import pandas as pd

In [9]:
class Node:
    def __init__(self, name, id, type):
        self.name = name
        self.id = id
        self.type = type 
    
    def _get_name(self):
        return self.name
    
    def _get_type(self):
        return self.type
    
    def _get_id(self):
        return self.id

In [10]:
# load raw nodes 
df_raw_nodes = pd.read_csv("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/raw_nodes.txt", sep="\t", header=None)
df_raw_nodes.rename(columns={0: "node_type", 1:"name"}, inplace=True)
df_raw_nodes.drop_duplicates("name", inplace=True)

In [11]:
df_raw_nodes["node_type"].value_counts()
# 213 extra proteins
# 16 fewer complexes 

complex     7797
protein     7324
family      1574
abstract     586
rna           52
miRNA         15
Name: node_type, dtype: int64

In [12]:
# create unique identifiers for all components (pathway identifiers are added later on in this document)
identifiers = dict(zip(df_raw_nodes.name, df_raw_nodes.index))
len(identifiers)

17348

In [11]:
dict_of_nodes = {}
for idx in df_raw_nodes.index:
    dict_of_nodes[idx] = Node(name=df_raw_nodes.loc[idx, "name"], id=identifiers[df_raw_nodes.loc[idx, "name"]], type=df_raw_nodes.loc[idx, "node_type"])

import pickle
save_dir = "/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/nodes.pickle"
with open(save_dir, 'wb') as handle:
    pickle.dump(dict_of_nodes, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [28]:
# save the identifiers dictionary 
import pickle
save_dir = "/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/node_identifiers.pickle"
with open(save_dir, 'wb') as handle:
    pickle.dump(identifiers, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [97]:
# define the proteins -> complexes edges 
df_compositions = pd.read_csv("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/edges/edges_raw.txt", sep='\t', header=None)
df_compositions.rename(columns={0: "parent", 1:"child", 2: "type"}, inplace=True)
df_compositions["type"].value_counts()

component>    24129
member>        7170
-a>            5337
-a|            3266
-t>            2120
-t|             397
Name: type, dtype: int64

In [98]:
edge_types = {
    "component>":"component",
    "member>":"component", 
    "-a>":"activation", 
    "-a|":"inhibition", 
    "-t>":"transcriptional_activation",
    "-t|":"transcriptional_inhibition",
    }

edge_attr_dict_raw = {
    "component":False,
    "activation":False,
    "inhibition":False,
    "transcriptional_activation":False,
    "transcriptional_inhibition":False
    }

In [99]:
df_compositions["type"] = df_compositions["type"].apply(lambda x : edge_types[x])
df_compositions["type"].value_counts()

component                     31299
activation                     5337
inhibition                     3266
transcriptional_activation     2120
transcriptional_inhibition      397
Name: type, dtype: int64

In [41]:
class Edge:
    def __init__(self, source_id, target_id, attr_dict):
        self.source_id = source_id
        self.target_id = target_id
        self.attr_dict = attr_dict
    
    def _get_source(self):
        return self.source_id
    
    def _get_target(self):
        return self.target_id
    
    def _get_attr(self):
        return self.attr_dict
    
    def __eq__(self, other):
        return (self._get_source() == other._get_source()) and (self._get_target() == other._get_target())
    
    def _update_dict(self, other):
        final_dict = {}
        for key in self._get_attr():
            final_dict[key] = self._get_attr()[key] or other._get_attr()[key]
        return final_dict

In [105]:
list_of_edges = []
for edge_type in edge_types:
    
    edge_name = edge_types[edge_type]
    df_curr = df_compositions[df_compositions["type"] == edge_type]
    ignore = []
    
    for parent_name in df_curr["parent"]:
        if not (parent_name in ignore):
            parent_id = identifiers[parent_name]
            df_temp = df_curr[df_curr["parent"] == parent_name]

            for i in df_temp.index:
                child_name = df_temp.loc[i, "child"]
                child_id = identifiers[child_name]
                attr_dict = edge_attr_dict_raw.copy()
                attr_dict[edge_name] = True
                list_of_edges.append(Edge(source_id=parent_id, target_id=child_id, attr_dict=attr_dict))
            ignore.append(parent_name)

In [103]:
len(list_of_edges)

0

In [87]:
# define proteins -> families edges
df_family_comps = pd.read_csv("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/edges/family_components.txt", sep='\t', header=None)
df_family_comps.drop(2, axis=1, inplace=True)
df_family_comps.rename(columns={0: "parent", 1:"child"}, inplace=True)

In [88]:
ignore = []
list_of_tuples = []
for parent_name in df_family_comps["parent"]:
    if not (parent_name in ignore):
        parent_id = identifiers[parent_name]
        df_temp = df_family_comps[df_family_comps["parent"] == parent_name]

        for i in df_temp.index:
            child_name = df_temp.loc[i, "child"]
            child_id = identifiers[child_name]
            list_of_tuples.append((parent_id, child_id))
        ignore.append(parent_name)

In [89]:
# write protein->family edges to a \t file 
f = open("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/edges/protein_family_edges.txt", "w")
for edge in list_of_tuples:
    str_to_write = "{}\t{}".format(edge[0], edge[1])
    f.write(str_to_write)
    f.write("\n")
f.close()

### Pathway stuff

In [32]:
# pathway stuff 
f = open("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/pathways_metadata/pid_120912_genesets.gmt")
lines = f.readlines()
f.close()

In [33]:
# put pathways into a dictionary 
count = 17364
pathway_identifiers = {}
pathway_compositions_by_ids = {}
for i in lines:
    pathway_name = i.split("\t")[0]
    gene_set = i.split("\t")[1:]
    pathway_identifiers[pathway_name] = count 
    
    gene_set_by_ids = []

    # currently pathway composition is name-based. But we want to convert those names in node ids from identifiers 
    for item in gene_set:
        item = item.strip()
        if not ("smallMolecule" in item):
            id = identifiers.get(item)
            if id:
                gene_set_by_ids.append(id)
    pathway_compositions_by_ids[count] = gene_set_by_ids

    count += 1


In [34]:
# add pathway identifiers to identifiers
identifiers = {**identifiers, **pathway_identifiers}

import pickle
save_dir = "/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/node_identifiers.pickle"
with open(save_dir, 'wb') as handle:
    pickle.dump(identifiers, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# load the nodes that already exist and add the pathway nodes to it 
with open("/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/nodes.pickle", "rb") as input_file:
     nodes_dict = pickle.load(input_file)

In [36]:
for pathway in pathway_identifiers.keys():
    name = pathway
    id = pathway_identifiers[name]
    type = "pathway"
    nodes_dict[id] = Node(name=name, id=id, type=type)

In [37]:
# save new nodes dict 
import pickle
save_dir = "/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/nodes/nodes.pickle"
with open(save_dir, 'wb') as handle:
    pickle.dump(nodes_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [38]:
len(nodes_dict)

18725

In [42]:
# create a dict of edges to pathways 
edge_attr_dict_raw = {
    "component":True,
    "activation":False,
    "inhibition":False,
    "transcriptional_activation":False,
    "transcriptional_inhibition":False
    }
count = 0
edges_to_pathways_dict = {}
for pathway_node_id in pathway_compositions_by_ids:
    list_of_constituents = pathway_compositions_by_ids[pathway_node_id]
    target_id = pathway_node_id
    for source_id in list_of_constituents:
        edges_to_pathways_dict[count] = Edge(source_id=source_id, target_id=target_id, attr_dict=edge_attr_dict_raw)
        count += 1

In [43]:
save_dir = "/media/hdd1/Anurag/Projects/Code/reactome_gnn/superpathway/data/edges/edges_to_pathways.pickle"
with open(save_dir, 'wb') as handle: 
    pickle.dump(edges_to_pathways_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
