In [1]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging
from Bio.KEGG.KGML.KGML_parser import read

In [2]:
# Root of the KEGG pathway data; relationsFromFile looks for
# <dataDirectory>/<organism>/<organism><pathwayID>.xml below it.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"
# Directory holding the tabular inputs read by this notebook
# (KEGG-pathways.txt, genes.csv).
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"
# NOTE(review): absolute, machine-specific path — make it configurable before sharing.
outputDirectory = "/Users/boldi/Desktop/pw/"
# KEGG organism code -> human-readable organism name.
organismDict = dict(hsa="Homo sapiens (human)")

In [3]:
# Table of the pathways to analyse. The first row of the file is treated as a
# header (header=0) and replaced by the explicit column names below; every
# column is read as a string and the default NA markers are not converted to
# missing values (keep_default_na=False).
pws = pd.read_csv(
    os.path.join(inputDirectory, "KEGG-pathways.txt"),
    header=0,
    names=["Pathway ID", "Pathway type", "Pathway name"],
    dtype="string",
    keep_default_na=False,
)


In [4]:
# Superpathway name -> list of (pathway ID, pathway name) pairs.
# The single superpathway "Involved" covers every row of `pws`.
superpathwayDict = {
    "Involved": list(zip(pws["Pathway ID"], pws["Pathway name"])),
}

In [5]:
# KEGG organism code (a key of organismDict) whose pathway files are read.
organism = "hsa"
# Key of superpathwayDict selecting the group of pathways to load.
superpathway = "Involved"

In [6]:
# makesafe is not defined in this notebook (presumably provided by one of the
# star imports above); assumed to return a sanitised form of the name — TODO confirm.
safeSuperpathway = makesafe(superpathway)

In [7]:
def words2set(words):
    """
        Splits a whitespace-separated string into words and returns the distinct words as a set.
        An empty (or all-whitespace) string gives the empty set.
    """
    tokens = words.split()
    return set(tokens)

In [8]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        A relation between pathway components, stored as four fields:
        - relType: the relation type (e.g. "PPRel")
        - relSubtypes: the list of subtypes (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - source: the set of source components
        - target: a list of sets of target components
    """
    relType: str
    relSubtypes: list
    source: set
    target: list

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Alternative constructor: builds a Relation from a KEGG relation.

            The source is the set of words in the name of relation.entry1.
            If relation.entry2 is named "undefined", the target holds one set per component of
            entry2 (the name of each component is looked up in pathway.entries by the component id);
            otherwise the target is a one-element list with the set of words in the name of entry2.
        """
        second = relation.entry2
        if second.name != "undefined":
            targets = [words2set(second.name)]
        else:
            # One set of names per component of the "undefined" entry.
            targets = [words2set(pathway.entries[part.id].name) for part in second.components]
        # NOTE(review): entry1 gets no such expansion, so a source entry named "undefined"
        # yields the source {"undefined"} — confirm this is intended.
        return cls(
            relType=relation.type,
            relSubtypes=relation.subtypes,
            source=words2set(relation.entry1.name),
            target=targets,
        )
        

In [9]:
def relationsFromKEGGpathway(pathway):
    """
        Converts every relation of a KEGG pathway into a Relation and returns them as a list,
        in the order of pathway.relations.
    """
    converted = []
    for keggRelation in pathway.relations:
        converted.append(Relation.fromKEGG(keggRelation, pathway))
    return converted
    

In [10]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Reads one KGML pathway file and returns the list of its relations (as Relation).

        The file is dataDirectory/organism/<organism><pathwayID>.xml: dataDirectory is the root
        directory, organism names both the subdirectory and the file-name prefix, and pathwayID
        is the rest of the file name.

        The file is opened in a `with` block so the handle is closed as soon as parsing finishes,
        also when the parser raises.
    """
    path = os.path.join(dataDirectory, organism, organism + pathwayID + ".xml")
    # NOTE(review): KGML_parser is not imported by name in this notebook (only `read` is);
    # presumably it comes from one of the star imports — confirm.
    with open(path, "r") as handle:
        pathway = KGML_parser.read(handle)
    return relationsFromKEGGpathway(pathway)

In [11]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        Loads the relations of every pathway belonging to one superpathway.

        superpathwayDict maps superpathway names to lists of (pathwayID, pathwayName) pairs;
        for each pair listed under `superpathway`, the pathway file is read with relationsFromFile.
        Returns a dictionary whose keys are the pathwayID's and whose values are the lists of relations.
    """
    return {
        pathwayID: relationsFromFile(dataDirectory, organism, pathwayID)
        for pathwayID, _pathwayName in superpathwayDict[superpathway]
    }

In [12]:
def subtypes(rel):
    """
        Numbers the subtype lists occurring in a map whose values are lists of relations.

        Each relation contributes the string form of its relSubtypes; the distinct strings are
        sorted and numbered 0, 1, 2, ... in that order.
        The result is a map from subtype string to number.
    """
    labels = {str(relation.relSubtypes) for relations in rel.values() for relation in relations}
    return {label: number for number, label in enumerate(sorted(labels))}

In [78]:
# Relations of every pathway of the selected superpathway: pathway ID -> list of Relation.
rel = relationsForSuperpathway(
    dataDirectory=dataDirectory,
    organism=organism,
    superpathwayDict=superpathwayDict,
    superpathway=superpathway,
)

In [79]:
# Number the distinct subtype lists; the number is used as the edge colour.
subtype2color = subtypes(rel)

# Directed graph over components: for every relation, one edge from each source
# component to each component of each target set.
# NOTE(review): `nx` is not imported by name in this notebook (presumably networkx,
# reaching the namespace through a star import — confirm). If so, a DiGraph keeps a
# single edge per (x, y) pair, so a later relation between the same pair overwrites
# the colour stored by an earlier one.
G = nx.DiGraph()
for pathwayID, relations in rel.items():
    for relation in relations:
        # The colour depends only on the relation, so look it up once per relation.
        edgeColor = subtype2color[str(relation.relSubtypes)]
        for x in relation.source:
            for t in relation.target:
                for y in t:
                    G.add_edge(x, y, color=edgeColor)

In [80]:
from qf.cc import cardon_crochemore_colored, cardon_crochemore

# The result is used by the following cells as a mapping from the nodes of G to a
# class label (cc.values(), cc[k]).
# NOTE(review): the exact semantics of cardon_crochemore_colored are defined in qf.cc — confirm there.
cc=cardon_crochemore_colored(G)

In [81]:
# Number of nodes of G (components that are an endpoint of at least one edge added above).
G.number_of_nodes()

634

In [82]:
# Number of distinct values in cc.
len(set(cc.values()))

144

In [83]:
# For every class of cc, report each relation whose source set is exactly that class.
# The members of each class are collected in a single pass over cc, instead of
# rescanning all of cc once per class value.
groups = {}
for k in cc:
    groups.setdefault(cc[k], set()).add(k)

for x in set(cc.values()):
    group = groups[x]
    for pathwayID, relations in rel.items():
        for relation in relations:
            if relation.source == group:
                # One line per matching relation, so the same class may be printed several times.
                print(x, "S", group)

0 S {'hsa:10672'}
0 S {'hsa:10672'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
1 S {'hsa:5747'}
2 S {'hsa:6714'}
2 S {'hsa:6714'}
2 S {'hsa:6714'}
6 S {'hsa:857', 'hsa:859', 'hsa:858'}
10 S {'hsa:2885'}
10 S {'hsa:2885'}
11 S {'hsa:3667'}
12 S {'hsa:6774'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:5291', 'hsa:5296', 'hsa:5295', 'hsa:8503', 'hsa:5290'}
13 S {'hsa:5293', 'hsa:529

In [85]:
# Table read from genes.csv; in the frame displayed below each row carries a GSM... identifier
# in "name" and the remaining columns are named after genes (presumably expression values — confirm).
# NOTE(review): the "Unnamed: 0.1"/"Unnamed: 0" columns look like index columns that were written
# back into the CSV — confirm and consider dropping them.
genes = pd.read_csv(os.path.join(inputDirectory, "genes.csv"))

In [86]:
# Show the table (the rendering below is abbreviated, see the "..." row and column).
genes

Unnamed: 0.1,Unnamed: 0,name,A1BG,ADA,CDH2,AKT3,100008588,100008589,100009676,MED6,...,THOC1,REC8,RCE1,HNRNPDL,DMTF1,PPP4R1,SLC12A6,PTBP3,DGCR2,SCO2
0,0,GSM1574423,5.513040,8.105445,5.538153,5.367775,11.146230,11.589834,5.901013,8.079373,...,7.198096,6.152714,6.291999,7.194089,7.440856,7.791873,5.683778,8.471226,7.106159,7.440375
1,1,GSM1574424,5.238013,7.019328,5.188991,5.401298,11.728307,10.464450,6.265463,8.179442,...,7.045319,6.022443,6.045065,7.297708,7.821551,7.663454,5.675778,8.700451,6.962812,7.005557
2,2,GSM1574425,5.708308,7.230270,5.507846,5.397361,12.047896,11.318551,6.224461,8.074019,...,7.213119,6.178724,5.782921,6.998591,7.510197,7.663987,5.664503,8.643570,7.205886,7.787051
3,3,GSM1574426,5.740369,7.396263,5.442095,5.296547,11.650462,11.303002,6.022263,8.177189,...,7.098039,6.058239,5.673125,7.204400,7.893798,7.376775,5.750436,8.928652,6.890701,7.143114
4,4,GSM1574427,5.932976,7.238479,6.143618,5.671180,11.718869,10.859258,6.023132,8.228727,...,6.648952,6.114541,5.814955,7.399123,7.605499,7.356980,5.543969,8.511676,6.663802,7.748388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,59,GSM1574482,5.884727,7.484607,5.790493,5.582019,11.553017,10.963190,5.950991,8.646998,...,7.091413,6.117294,5.996777,7.323042,7.529939,7.391352,5.715396,8.527401,7.160782,7.783217
60,60,GSM1574483,5.571254,7.993836,5.530246,5.470003,10.881541,10.127892,6.354627,8.418685,...,6.993226,6.276569,6.158704,7.178020,7.428225,7.736230,5.452297,8.670611,7.251531,8.328839
61,61,GSM1574484,5.606271,7.757515,5.618650,5.494501,11.678308,11.502444,5.985153,8.202394,...,7.030866,6.064598,5.797958,7.296972,7.784769,7.516425,5.852640,8.801789,6.655766,7.288676
62,62,GSM1574485,5.808886,7.268714,5.222689,5.519814,12.081060,10.591896,6.233185,8.026523,...,7.125517,6.292087,5.787027,6.969160,7.399395,7.505281,5.655602,9.048492,7.017329,8.029034
