In [35]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging
from Bio.KEGG.KGML.KGML_parser import read

In [36]:
# Organisms known to this notebook: organism code -> descriptive name.
organismDict = dict(hsa="Homo sapiens (human)")

# Root directory containing one subdirectory of pathway XML files per organism.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"
# Directory holding the pathway table ("KEGG-pathways.txt").
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"
# NOTE(review): not used in the visible cells; presumably where results are written.
outputDirectory = "/Users/boldi/Desktop/pw/"

In [37]:
# Load the pathway table. The file's own header row is consumed (header=0) and
# replaced by the three column names below; every column is read as a string
# and empty fields stay empty strings rather than becoming NA.
pws = pd.read_csv(
    os.path.join(inputDirectory, "KEGG-pathways.txt"),
    names=["Pathway ID", "Pathway type", "Pathway name"],
    header=0,
    dtype="string",
    keep_default_na=False,
)


In [38]:
# Maps a superpathway name to its list of (pathway ID, pathway name) pairs.
# The single superpathway "Involved" gathers every row of the pathway table.
superpathwayDict = {
    "Involved": list(zip(pws["Pathway ID"], pws["Pathway name"])),
}

In [39]:
# Selection analysed by the rest of the notebook:
# the organism code (also the name of its data subdirectory) ...
organism = "hsa"
# ... and the key of superpathwayDict whose pathways are loaded.
superpathway = "Involved"

In [40]:
# makesafe is not defined in this notebook (presumably it comes from one of the star imports).
# NOTE(review): it appears to produce a sanitized form of the superpathway name — confirm against its definition.
safeSuperpathway = makesafe(superpathway)

In [41]:
def words2set(words):
    """
        Splits a whitespace-separated string into its words and returns the
        distinct words as a set (an empty or blank string gives an empty set).
    """
    tokens = words.split()
    return {*tokens}

In [42]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        A relation between pathway components, stored as an immutable record:
        - relType: the relation type (e.g. "PPRel")
        - relSubtypes: the list of subtypes
          (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - source: the set of source components
        - target: a list of sets of target components
    """
    relType: str
    relSubtypes: list
    source: set
    target: list

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Alternate constructor: builds a Relation from a KEGG relation.

            The source is the set of words in the name of relation.entry1.
            When relation.entry2 is named "undefined", the target holds one set
            per component of entry2, each taken from the name of the pathway
            entry with that component's id; otherwise the target is a single
            set made of the words in entry2's own name.
        """
        # NOTE(review): only entry2 is checked for the "undefined" name; entry1's
        # name is always split as-is — confirm that sources are never "undefined".
        second = relation.entry2
        if second.name == "undefined":
            targets = [
                words2set(pathway.entries[part.id].name)
                for part in second.components
            ]
        else:
            targets = [words2set(second.name)]
        return cls(
            relation.type,
            relation.subtypes,
            words2set(relation.entry1.name),
            targets,
        )
        

In [43]:
def relations(pathway):
    """
        Converts every relation of a KEGG pathway into a Relation and returns
        them as a list, in the order they appear in pathway.relations.
    """
    converted = []
    for keggRelation in pathway.relations:
        converted.append(Relation.fromKEGG(keggRelation, pathway))
    return converted
    

In [44]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Reads one pathway file and returns the list of its relations.

        The file is located at dataDirectory/organism/[organism+pathwayID].xml, where
        - dataDirectory is the root data directory,
        - organism is the name of the organism subdirectory (and the file-name prefix),
        - pathwayID is the pathway identifier completing the file name.

        The file is opened in a with-block so that the handle is closed as soon as
        parsing ends, even when parsing raises.
    """
    path = os.path.join(dataDirectory, organism, organism + pathwayID + ".xml")
    # NOTE(review): KGML_parser is not imported by name in this notebook; it is
    # presumably made available by one of the star imports.
    with open(path, "r") as handle:
        pathway = KGML_parser.read(handle)
    return relations(pathway)

In [45]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        Loads the relations of every pathway belonging to one superpathway.

        superpathwayDict maps superpathway names to lists of pairs
        (pathwayID, pathwayName). For each pair listed under the given
        superpathway, the corresponding pathway file is read; the result is a
        dictionary from pathwayID to the list of relations of that pathway.
    """
    return {
        pathwayID: relationsFromFile(dataDirectory, organism, pathwayID)
        for pathwayID, _pathwayName in superpathwayDict[superpathway]
    }

In [46]:
def subtypes(rel):
    """
        Numbers the distinct subtype lists found in a map of relations.

        rel is a map whose values are lists of relations. The string form of
        each relation's relSubtypes is collected; the distinct strings are
        sorted and numbered from 0 in that order. The result maps each subtype
        string to its number.
    """
    distinct = {
        str(relation.relSubtypes)
        for group in rel.values()
        for relation in group
    }
    return {name: number for number, name in enumerate(sorted(distinct))}

In [47]:
# Read every pathway file of the selected superpathway: rel maps pathway ID -> list of Relation.
rel = relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway)

In [52]:
# Number every distinct subtype list; that number is used as the edge color.
subtype2color = subtypes(rel)

# Build a directed graph with one edge from each source component to each
# component of each target set, colored by the relation's subtype list.
#
# The per-pathway list is deliberately NOT bound to the name `relations`: at
# module level such a loop variable would rebind the global name and destroy
# the relations() function defined earlier, so that reading pathway files
# again afterwards would fail with "'list' object is not callable".
#
# NOTE(review): nx is not imported by name in this notebook; it is presumably
# provided by one of the star imports. A DiGraph holds a single edge per
# (x, y) pair, so when several relations connect the same pair the color of
# the last one processed is the one kept — confirm this is intended.
G = nx.DiGraph()
for pathwayRelations in rel.values():
    for relation in pathwayRelations:
        # The color depends only on the relation, so look it up once per relation.
        color = subtype2color[str(relation.relSubtypes)]
        for x in relation.source:
            for t in relation.target:
                for y in t:
                    G.add_edge(x, y, color=color)

In [53]:
# cardon_crochemore_colored is not defined in this notebook (presumably it comes from one of the star imports).
# The cells below use its result as a mapping from graph node to a value.
# NOTE(review): the "color" edge attribute is presumably what makes the computation "colored" — confirm.
cc=cardon_crochemore_colored(G)

In [56]:
# Number of nodes of the graph (notebook expression; its value is shown as the cell output).
G.number_of_nodes()

634

In [58]:
# Number of distinct values that cc assigns to the nodes.
len(set(cc.values()))

144

In [59]:
# Print each distinct value of cc followed by the list of keys mapped to it.
# The keys are grouped in a single pass over cc rather than rescanning the
# whole mapping once per value; both the order of the printed lines and the
# order of the keys inside each list are the same as with a per-value rescan.
members = {}
for k in cc:
    members.setdefault(cc[k], []).append(k)
for x in set(cc.values()):
    print(x, members[x])

0 ['hsa:10672']
1 ['hsa:5747']
2 ['hsa:6714']
3 ['hsa:2266', 'hsa:2243', 'hsa:2244']
4 ['hsa:5578']
5 ['hsa:5582', 'hsa:5579']
6 ['hsa:859', 'hsa:858', 'hsa:857']
7 ['hsa:894', 'hsa:896']
8 ['hsa:595']
9 ['hsa:329', 'hsa:330', 'hsa:331']
10 ['hsa:2885']
11 ['hsa:3667']
12 ['hsa:6774']
13 ['hsa:5290', 'hsa:5291', 'hsa:8503', 'hsa:5293', 'hsa:5296', 'hsa:5295']
14 ['hsa:362']
15 ['hsa:3783', 'hsa:3778', 'hsa:54831']
16 ['hsa:808', 'hsa:51806', 'hsa:801', 'hsa:805', 'hsa:810', 'hsa:163688', 'hsa:91860', 'hsa:815', 'hsa:818', 'hsa:817', 'hsa:816']
17 ['hsa:340156', 'hsa:91807', 'hsa:85366', 'hsa:4638']
18 ['hsa:10235', 'hsa:10125']
19 ['hsa:2475']
20 ['hsa:64223', 'hsa:57521']
21 ['hsa:5970', 'hsa:4790']
22 ['hsa:2776']
23 ['hsa:2778']
24 ['hsa:5879']
25 ['hsa:5336']
26 ['hsa:5335', 'hsa:6915', 'hsa:2771', 'hsa:2773', 'hsa:2770']
27 ['cpd:C05981']
28 ['hsa:7248', 'hsa:7249']
29 ['hsa:4067']
30 ['hsa:2207']
31 ['hsa:2534']
32 ['hsa:3091']
33 ['hsa:347688', 'hsa:7280', 'hsa:203068', 'hsa:347