In [88]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging
from Bio.KEGG.KGML.KGML_parser import read

In [89]:
# Organism codes handled by this notebook, mapped to the label used in page titles.
organismDict = dict(hsa="Homo sapiens (human)")

# Input locations, relative to the notebook's working directory:
# - dataDirectory is the root handed to relationsForSuperpathway (one subdirectory per organism);
# - inputDirectory is where the pathway list "KEGG-pathways.txt" is read from.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"

# Root of everything the notebook writes (HTML pages, indexes, data.pkl).
# NOTE(review): absolute, machine-specific path; adjust before running on another machine.
outputDirectory = "/Users/boldi/Desktop/pw/"

In [90]:
# Load the pathway catalogue from <inputDirectory>/KEGG-pathways.txt.
# - header=0 together with names=[...] drops the file's own first row and uses the three
#   column names given here instead.
# - dtype="string" and keep_default_na=False keep every cell as text, so pathway IDs such as
#   "04510" keep their leading zero and empty/"NA" cells are not converted to missing values.
# NOTE(review): no sep= is given, so the file is parsed as comma-separated despite its .txt
# extension — confirm that matches the file.
pws = pd.read_csv(os.path.join(inputDirectory, "KEGG-pathways.txt"), header=0, 
            names=["Pathway ID", "Pathway type", "Pathway name"], dtype="string", keep_default_na=False)


In [91]:
# Maps each superpathway name to its list of (pathway ID, pathway name) pairs.
# A single superpathway, "Involved", collects every pathway listed in pws.
superpathwayDict = {"Involved": list(zip(pws["Pathway ID"], pws["Pathway name"]))}

In [92]:
# Selection processed by the rest of the notebook:
# superpathway is a key of superpathwayDict, organism a key of organismDict.
superpathway, organism = "Involved", "hsa"

In [93]:
# makesafe is not defined in this notebook (it comes from one of the star imports).
# Presumably it rewrites the superpathway name into a form that is safe inside file names —
# the result is used below as a directory name and as a file-name prefix. TODO confirm.
safeSuperpathway = makesafe(superpathway)

In [94]:
def words2set(words):
    """
        Splits a whitespace-separated string and returns its distinct words as a set.
        Repeated words collapse into one element; an empty or all-blank string gives
        the empty set.
    """
    return {word for word in words.split()}

In [95]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        A KEGG relation reduced to plain Python values. Fields:
        - relType: the relation type (e.g. "PPrel")
        - relSubtypes: the list of subtypes, as pairs
          (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - source: the set of names of the first entry of the relation
        - target: a list of sets of names, built from the second entry of the relation
          (one set, or one set per component when that entry is a group)
    """
    relType: str
    relSubtypes: list
    source: set
    target: list

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Alternative constructor: builds a Relation from a KEGG relation object.

            - relation: object exposing .type, .subtypes, .entry1 and .entry2
            - pathway: the pathway the relation belongs to; only its .entries mapping
              is used, to look up the components of the second entry by id

            When the second entry is named "undefined", the target holds one set per
            component of that entry (each taken from the name of the pathway entry with
            the component's id); otherwise the target is a single set taken from the
            second entry's own name.
        """
        # NOTE(review): only entry2 is expanded into components; an entry1 named
        # "undefined" ends up as the source {"undefined"} — confirm this is intended.
        head = relation.entry2
        if head.name != "undefined":
            targetSets = [words2set(head.name)]
        else:
            targetSets = [
                words2set(pathway.entries[member.id].name)
                for member in head.components
            ]
        return cls(
            relType=relation.type,
            relSubtypes=relation.subtypes,
            source=words2set(relation.entry1.name),
            target=targetSets,
        )
        

In [96]:
def relations(pathway):
    """
        Converts every relation of a KEGG pathway into a Relation and returns them
        as a list, in the iteration order of pathway.relations.
    """
    converted = []
    for keggRelation in pathway.relations:
        converted.append(Relation.fromKEGG(keggRelation, pathway))
    return converted
    

In [105]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Reads one KGML pathway file and returns the list of its relations (as built by
        relations()).

        The file read is dataDirectory/organism/<organism><pathwayID>.xml, for instance
        dataDirectory/hsa/hsa04510.xml.

        - dataDirectory: root directory, containing one subdirectory per organism
        - organism: organism code; used both as subdirectory name and as file-name prefix
        - pathwayID: pathway identifier (e.g. "04510")

        Errors raised by open() (e.g. a missing file) or by the parser are propagated.
    """
    path = os.path.join(dataDirectory, organism, organism + pathwayID + ".xml")
    # Close the file as soon as parsing is done instead of leaking one open handle per
    # pathway. This assumes KGML_parser.read() consumes the whole handle before returning.
    # NOTE(review): KGML_parser is not imported by name in the notebook's import cell;
    # presumably it is provided by one of the star imports — confirm.
    with open(path, "r") as handle:
        pathway = KGML_parser.read(handle)
    return relations(pathway)

In [106]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        Reads every pathway file of one superpathway.

        superpathwayDict maps superpathway names to lists of (pathwayID, pathwayName)
        pairs. For each pair listed under the given superpathway name, the pathway file
        is read with relationsFromFile(dataDirectory, organism, pathwayID).

        Returns a dictionary whose keys are the pathwayIDs and whose values are the
        corresponding lists of relations. The pathway names are not used.
    """
    return {
        pathwayID: relationsFromFile(dataDirectory, organism, pathwayID)
        for pathwayID, _pathwayName in superpathwayDict[superpathway]
    }

In [108]:
# Parse every pathway of the selected superpathway and display the resulting dictionary
# (pathway ID -> list of Relation).
# NOTE(review): the result is only displayed, not bound to a name, and the full repr is
# very long; consider assigning it and showing a summary instead.
relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway)

{'04510': [Relation(relType='PPrel', relSubtypes=[('inhibition', '--|'), ('phosphorylation', '+p')], source={'hsa:5894'}, target=[{'hsa:572'}]),
  Relation(relType='PPrel', relSubtypes=[('binding/association', '---')], source={'hsa:2013'}, target=[{'hsa:3693', 'hsa:3696', 'hsa:3695', 'hsa:3688', 'hsa:3691', 'hsa:3694', 'hsa:3690'}]),
  Relation(relType='PPrel', relSubtypes=[('activation', '-->')], source={'hsa:1793'}, target=[{'hsa:5881', 'hsa:5880', 'hsa:5879'}]),
  Relation(relType='PPrel', relSubtypes=[('inhibition', '--|'), ('dephosphorylation', '-p')], source={'hsa:4660', 'hsa:4659', 'hsa:5501', 'hsa:5499', 'hsa:54776', 'hsa:5500'}, target=[{'hsa:29895', 'hsa:58498', 'hsa:93408', 'hsa:4633', 'hsa:4636', 'hsa:10398', 'hsa:10627', 'hsa:103910'}]),
  Relation(relType='PPrel', relSubtypes=[('activation', '-->'), ('phosphorylation', '+p')], source={'hsa:340156', 'hsa:91807', 'hsa:85366', 'hsa:4638'}, target=[{'hsa:29895', 'hsa:58498', 'hsa:93408', 'hsa:4633', 'hsa:4636', 'hsa:10398', '

In [12]:
# NOTE(review): `dd` is not defined in any of the cells shown above, and this cell's
# execution count precedes theirs — it relies on state from another cell or session.
# writeKEGGHtml comes from the star imports; presumably it writes the "Reactions group"
# HTML pages under <outputDirectory>/<organism>/<safeSuperpathway> with the file-name
# prefix <safeSuperpathway>R — TODO confirm against its definition.
# NOTE(review): prefOnly is "r" here but "rn" in the mapKeys call that groups reactions
# further down — confirm both values are intended.
writeKEGGHtml(dd, os.path.join(outputDirectory, organism, safeSuperpathway), safeSuperpathway + "R", 
              f"{organismDict[organism]}, {superpathway}, Reactions group",
              onlyNonTrivial=True, prefOnly="r", cutPref="rn")
# Start the list of (file name, label) pairs that is later passed to writeIndex.
index = [(f"{safeSuperpathway}R-cont-0.html", "Reaction groups")]

In [13]:
# Same call as for the reaction groups, with the "C" file-name prefix, prefOnly="C" and an
# empty cutPref; presumably it writes the "Compound group" HTML pages — TODO confirm.
# NOTE(review): `dd` is not defined in any of the cells shown above.
writeKEGGHtml(dd, os.path.join(outputDirectory, organism, safeSuperpathway), safeSuperpathway + "C", 
              f"{organismDict[organism]}, {superpathway}, Compound group",
              onlyNonTrivial=True, prefOnly="C", cutPref="")
# Add the compound page to the index entries.
# NOTE: `+=` extends the existing list, so re-running this cell alone adds a duplicate entry.
index += [(f"{safeSuperpathway}C-cont-0.html", "Compound groups")]

In [14]:
# mapKeys comes from the star imports; it is iterated here as a sequence of groups, each
# group being an iterable of reactions (presumably the keys of dd selected by the "rn"
# prefix — TODO confirm).
groupedReactions = mapKeys(dd, prefOnly="rn")
# r2gr:    reaction -> index of the group that contains it
# r2grall: reaction -> the whole group (the same object for every member of the group)
r2gr = {}
r2grall = {}
for i, reactions in enumerate(groupedReactions):
    for reaction in reactions:
        # If a reaction occurs in several groups, the last group seen wins.
        r2gr[reaction] = i
        r2grall[reaction] = reactions

In [15]:
# Same grouping as for reactions, this time with prefOnly="C" (compounds).
groupedCompounds = mapKeys(dd, prefOnly="C")
# c2gc:    compound -> group number
# c2gcall: compound -> the whole group (the same object for every member of the group)
c2gc = {}
c2gcall = {}
for i, compounds in enumerate(groupedCompounds):
    for compound in compounds:
        # Compound group numbers continue after the reaction group numbers:
        # the first compound group gets number len(groupedReactions).
        c2gc[compound] = i + len(groupedReactions)
        c2gcall[compound] = compounds

In [16]:
# writeKEGGpdf comes from the star imports. Its return value is appended to index, so it
# presumably returns a list of (file name, label) pairs like the entries already there —
# TODO confirm.
# NOTE(review): it receives the raw `superpathway` name, not `safeSuperpathway`; check
# that it sanitises the name itself. Re-running this cell alone extends index again.
index += writeKEGGpdf(outputDirectory, organism, superpathway, superpathwayDict, r2gr, r2grall, c2gc, c2gcall)

In [17]:
# writeIndex comes from the star imports; it is called with a directory, a list of
# (relative link, label) pairs and a title. Presumably it writes an index page for the
# collected entries into <outputDirectory>/<organism>/<safeSuperpathway> — TODO confirm.
writeIndex(os.path.join(outputDirectory, organism, safeSuperpathway), index, 
           f"{organismDict[organism]}, {superpathway}")

In [18]:
# Save the working data as a five-element list in
# <outputDirectory>/<organism>/<safeSuperpathway>/data.pkl; the order of the elements must
# match the unpacking used when the file is loaded back.
# NOTE(review): dd, sr2r, r2ss, rid2rname and mid2mname are not defined in any of the
# cells shown above, and `pickle` is not imported by name in the import cell (it is
# presumably provided by a star import) — confirm before a fresh-kernel run.
with open(os.path.join(outputDirectory, organism, safeSuperpathway, "data.pkl"), "wb") as file:
    pickle.dump([dd, sr2r, r2ss, rid2rname, mid2mname], file)

In [19]:
# Organism-level index: one entry per superpathway, linking to
# <safe superpathway name>/index.html and labelled with the original superpathway name.
writeIndex(
    os.path.join(outputDirectory, organism),
    [(f"{makesafe(name)}/index.html", name) for name in superpathwayDict],
    f"{organismDict[organism]}",
)

In [20]:
# Top-level index: one entry per organism, linking to <organism code>/index.html and
# labelled with the organism's display name.
writeIndex(
    outputDirectory,
    [(f"{code}/index.html", label) for code, label in organismDict.items()],
    "Pathways",
)

In [21]:
# Reload the five objects from the data.pkl written by the pickle.dump cell (same path,
# same order).
# SECURITY: pickle.load can execute arbitrary code contained in the file; only load a
# data.pkl that this notebook itself produced.
with open(os.path.join(outputDirectory, organism, safeSuperpathway, "data.pkl"), "rb") as file:
    dd, sr2r, r2ss, rid2rname, mid2mname = pickle.load(file)