In [178]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging
from Bio.KEGG.KGML.KGML_parser import read
from ast import literal_eval
from urllib.request import urlopen
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import requests
from Levenshtein import ratio

In [2]:
# Configuration shared by the cells below.
# Maps a KEGG organism code to its display name.
organismDict = {"hsa": "Homo sapiens (human)"}
# NOTE(review): absolute, user-specific path; it will not exist on another machine.
outputDirectory = "/Users/boldi/Desktop/pw/"
# Directory of the tabular inputs read below (KEGG-pathways.txt, genes.csv,
# classification.csv) and of the name2kegg.pkl cache.
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"
# Root directory passed to relationsForSuperpathway; the pathway XML files are looked up
# in its <organism> subdirectory.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"

In [3]:
# Table of pathways. The file's own header row is replaced by the three column names
# below, every field is read as a string, and no value is parsed as missing.
pws = pd.read_csv(
    os.path.join(inputDirectory, "KEGG-pathways.txt"),
    header=0,
    names=["Pathway ID", "Pathway type", "Pathway name"],
    dtype="string",
    keep_default_na=False,
)


In [4]:
# Each superpathway name maps to its list of (pathway ID, pathway name) pairs;
# here a single superpathway gathers every pathway listed in pws.
superpathwayDict = {"Involved": list(zip(pws["Pathway ID"], pws["Pathway name"]))}

In [5]:
# Key of superpathwayDict selecting which list of pathways is processed.
superpathway = "Involved"
# KEGG organism code; relationsFromFile uses it both as subdirectory name and as
# file-name prefix of the pathway XML files.
organism = "hsa"

In [6]:
# makesafe is not defined in this notebook (presumably provided by one of the star imports).
# NOTE(review): it looks like it produces a file-system-safe version of the name; confirm.
safeSuperpathway = makesafe(superpathway)

In [7]:
def words2set(words):
    """
        Splits a string on runs of whitespace and returns the distinct words as a set
        (the empty set for an empty or all-blank string).
    """
    return {word for word in words.split()}

In [8]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        A relation between pathway components, stored as a 4-tuple:
        - relType: the relation type (e.g. "PPRel")
        - relSubtypes: the list of subtypes (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - source: the set of component names of the source entry
        - target: a list of sets of component names, one set per target element
    """
    relType: str
    relSubtypes: list
    source: set
    target: list

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Alternative constructor: converts a KEGG relation into a Relation.

            The source is the set of words in the name of relation.entry1.
            If relation.entry2 has a proper name, the target is a single set with the words of
            that name. If its name is "undefined" (presumably a group entry; confirm), there is
            one target set per component of entry2, each built from the name of the pathway
            entry having the component's id.
        """
        target_entry = relation.entry2
        if target_entry.name != "undefined":
            targets = [words2set(target_entry.name)]
        else:
            # Resolve every member through the pathway's entry table.
            targets = [
                words2set(pathway.entries[member.id].name)
                for member in target_entry.components
            ]
        return cls(relation.type, relation.subtypes, words2set(relation.entry1.name), targets)
        

In [9]:
def relationsFromKEGGpathway(pathway):
    """
        Converts every relation of a KEGG pathway into a Relation and returns them as a list,
        in the order in which pathway.relations yields them.
    """
    converted = []
    for kegg_relation in pathway.relations:
        converted.append(Relation.fromKEGG(kegg_relation, pathway))
    return converted
    

In [10]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Given a pathway file (specified by a root dataDirectory, the name of the organism subdirectory, and the name
        of the file [organism+pathwayID.xml]), reads it and returns the list of its relations.

        dataDirectory: root directory of the pathway files.
        organism: organism code, used both as subdirectory and as file-name prefix.
        pathwayID: pathway identifier, appended to the organism code to form the file name.

        Returns the list of Relation objects produced by relationsFromKEGGpathway.
        Raises whatever open() raises if the file cannot be opened.
    """
    filename = os.path.join(dataDirectory, organism, organism+pathwayID+".xml")
    # NOTE(review): KGML_parser is not imported by name in this notebook (only `read` is);
    # presumably it is provided by one of the star imports.
    # The context manager closes the file even when parsing fails; the relations are
    # built before leaving it, so nothing reads from the handle afterwards.
    with open(filename, "r") as handle:
        pathway = KGML_parser.read(handle)
        return relationsFromKEGGpathway(pathway)

In [11]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        Reads all the pathway files of one superpathway.

        superpathwayDict maps superpathway names to lists of pairs (pathwayID, pathwayName);
        only the list stored under the key superpathway is used, and only the IDs are needed.
        Returns a dictionary from pathwayID to the list of relations read from its file.
    """
    return {
        pathway_id: relationsFromFile(dataDirectory, organism, pathway_id)
        for pathway_id, _pathway_name in superpathwayDict[superpathway]
    }

In [12]:
def subtypes(rel):
    """
        Collects the subtype signature (the string form of relSubtypes) of every relation found
        in the values of rel, and numbers the distinct signatures from 0 in sorted order.
        Returns a dictionary from signature string to its number.
    """
    signatures = {
        str(relation.relSubtypes)
        for relations in rel.values()
        for relation in relations
    }
    return {signature: color for color, signature in enumerate(sorted(signatures))}

In [168]:
def set2can(s):
    """
        Canonical string form of a set: the string representation of the sorted list
        of its members, so that equal sets always give the same string.
    """
    ordered = sorted(s)
    return str(ordered)

def can2set(c):
    """
        Inverse of set2can: parses the canonical string as a Python literal
        and returns its members as a set.
    """
    members = literal_eval(c)
    return set(members)

In [171]:
def indices(rel):
    """
        Scans all the relations contained in the values of rel (lists of relations) and numbers,
        in order of first appearance and starting from 0:
        - every element, i.e. every set of components used as the source or as one of the
          targets of a relation;
        - every block, i.e. the set of element ids forming the target of a relation.
        Elements are keyed by the string representation of a sorted list of strings,
        blocks by the string representation of a sorted list of ints.
        Returns the four dictionaries element2id, id2element, block2id, id2block.
    """
    element2id = {}
    block2id = {}

    for relations in rel.values():
        for relation in relations:
            # setdefault only assigns the next free id when the key is new.
            element2id.setdefault(set2can(relation.source), len(element2id))
            target_ids = set()
            for component_set in relation.target:
                element_key = set2can(component_set)
                element2id.setdefault(element_key, len(element2id))
                target_ids.add(element2id[element_key])
            block2id.setdefault(set2can(target_ids), len(block2id))

    id2element = {number: key for key, number in element2id.items()}
    id2block = {number: key for key, number in block2id.items()}
    return element2id, id2element, block2id, id2block


In [179]:
def search_gene_KEGG(geneID, timeout=30):
    """
        Search for gene on KEGG.

        Fetches https://www.kegg.jp/entry/<geneID> (without following redirects) and reads the
        first table cell whose class is "td11 defd"; its stripped text is split on ", ".

        geneID: KEGG entry identifier, appended to the URL as is.
        timeout: seconds to wait for the server before requests gives up.

        Returns list of names.
        Raises IndexError if the page contains no such cell.
    """
    url1 = "https://www.kegg.jp/entry/"+geneID
    # Without a timeout a stalled connection would block the caller indefinitely.
    response1 = requests.get(url1, allow_redirects=False, timeout=timeout)
    soup = BeautifulSoup(response1.text, "html.parser")
    name_cell = soup.find("td", {"class": "td11 defd"})
    if name_cell is None:
        # Same exception type as indexing an empty result list, but with a usable message.
        raise IndexError(
            f"no 'td11 defd' cell found in {url1} (HTTP status {response1.status_code})"
        )
    return name_cell.getText().strip().split(", ")

In [202]:
# Parse every pathway of the selected superpathway and index what was found.
rel = relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway)
# element2id must be bound here: it is used on the last line of this cell and again
# when the graph G is built.
element2id, id2element, block2id, id2block = indices(rel)
subtype2color = subtypes(rel)
# All distinct components occurring in any element.
elements = list(set.union(*[can2set(k) for k in element2id.keys()]))

In [231]:
# On-disk cache of the gene-name -> KEGG-identifier map, so that KEGG is queried only
# when the cache file does not exist yet.
name2keggFilename = os.path.join(inputDirectory, "name2kegg.pkl")

if os.path.exists(name2keggFilename):
    print("Reading gene names")
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache file you produced yourself.
    # NOTE(review): `pickle` is not imported by name in this notebook; presumably it comes from a star import.
    with open(name2keggFilename, "rb") as handle:
        name2gene = pickle.load(handle)
else:
    print("Producing gene names")
    d = {}
    nelements = len(elements)
    for count,element in enumerate(elements):
        # Progress report every 10 elements.
        if count % 10 == 0:
            print(f"{count}/{nelements}")
        # Only identifiers starting with "hsa" are looked up on KEGG (one HTTP request each).
        if element.startswith("hsa"):
            names = search_gene_KEGG(element)
            # If the same name is returned for several elements, the last one wins.
            for name in names:
                d[name] = element
    with open(name2keggFilename, "wb") as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)
    name2gene = d

Reading gene names


In [232]:
# Directed graph over element ids: one edge from the source of each relation to each of its
# targets, carrying the number of the relation's subtype signature as the `color` attribute.
# NOTE(review): `nx` is not imported by name in this notebook; presumably networkx via a star
# import. If so, a repeated (source, target) pair keeps only the last color assigned.
G = nx.DiGraph()
for relations in rel.values():
    for relation in relations:
        source = element2id[set2can(relation.source)]
        for targ in relation.target:
            target = element2id[set2can(targ)]
            G.add_edge(source, target, color=subtype2color[str(relation.relSubtypes)])

In [233]:
from qf.cc import cardon_crochemore_colored, cardon_crochemore

# cc is used below as a mapping node -> class label (cc[node], cc.values()).
# NOTE(review): presumably the classes take the edge `color` attribute into account; confirm against qf.cc.
cc=cardon_crochemore_colored(G)

In [258]:
# Expression table: one row per sample, with the sample identifier in the "name" column
# and one column per gene.
genes = pd.read_csv(os.path.join(inputDirectory, "genes.csv"))
# Sample identifiers, in file order.
patients = genes["name"].values.tolist()

In [281]:
# Label table; the "name" and "classification" columns are the ones used here.
classification = pd.read_csv(os.path.join(inputDirectory, "classification.csv"))
# For each sample, the classification of the first row with that name; a sample that is
# missing from classification.csv makes the [0] lookup raise IndexError.
patient2class = {patient: classification.loc[classification["name"]==patient]["classification"].values.flatten().tolist()[0] for patient in patients}

In [282]:
# Display the expression table. The two leading "Unnamed" columns appear to be row indices
# saved with the CSV (TODO confirm).
genes

Unnamed: 0.1,Unnamed: 0,name,A1BG,ADA,CDH2,AKT3,100008588,100008589,100009676,MED6,...,THOC1,REC8,RCE1,HNRNPDL,DMTF1,PPP4R1,SLC12A6,PTBP3,DGCR2,SCO2
0,0,GSM1574423,5.513040,8.105445,5.538153,5.367775,11.146230,11.589834,5.901013,8.079373,...,7.198096,6.152714,6.291999,7.194089,7.440856,7.791873,5.683778,8.471226,7.106159,7.440375
1,1,GSM1574424,5.238013,7.019328,5.188991,5.401298,11.728307,10.464450,6.265463,8.179442,...,7.045319,6.022443,6.045065,7.297708,7.821551,7.663454,5.675778,8.700451,6.962812,7.005557
2,2,GSM1574425,5.708308,7.230270,5.507846,5.397361,12.047896,11.318551,6.224461,8.074019,...,7.213119,6.178724,5.782921,6.998591,7.510197,7.663987,5.664503,8.643570,7.205886,7.787051
3,3,GSM1574426,5.740369,7.396263,5.442095,5.296547,11.650462,11.303002,6.022263,8.177189,...,7.098039,6.058239,5.673125,7.204400,7.893798,7.376775,5.750436,8.928652,6.890701,7.143114
4,4,GSM1574427,5.932976,7.238479,6.143618,5.671180,11.718869,10.859258,6.023132,8.228727,...,6.648952,6.114541,5.814955,7.399123,7.605499,7.356980,5.543969,8.511676,6.663802,7.748388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,59,GSM1574482,5.884727,7.484607,5.790493,5.582019,11.553017,10.963190,5.950991,8.646998,...,7.091413,6.117294,5.996777,7.323042,7.529939,7.391352,5.715396,8.527401,7.160782,7.783217
60,60,GSM1574483,5.571254,7.993836,5.530246,5.470003,10.881541,10.127892,6.354627,8.418685,...,6.993226,6.276569,6.158704,7.178020,7.428225,7.736230,5.452297,8.670611,7.251531,8.328839
61,61,GSM1574484,5.606271,7.757515,5.618650,5.494501,11.678308,11.502444,5.985153,8.202394,...,7.030866,6.064598,5.797958,7.296972,7.784769,7.516425,5.852640,8.801789,6.655766,7.288676
62,62,GSM1574485,5.808886,7.268714,5.222689,5.519814,12.081060,10.591896,6.233185,8.026523,...,7.125517,6.292087,5.787027,6.969160,7.399395,7.505281,5.655602,9.048492,7.017329,8.029034


In [283]:
# For every node of G, collect the columns of the expression table (gene names) that
# correspond to the KEGG genes contained in the node's element.
namedGenes = set(name2gene.values())
columns = set(genes.columns.values.tolist()[2:])
# Invert name2gene once (gene -> its names that are also columns, in name2gene order)
# instead of rescanning the whole dictionary for every gene of every node.
gene2columns = {}
for k,v in name2gene.items():
    if k in columns:
        gene2columns.setdefault(v, []).append(k)
node2columns = {}
for node in G.nodes():
    nodeGenes = can2set(id2element[node])
    s = []
    for gene in nodeGenes & namedGenes:
        s += gene2columns.get(gene, [])
    node2columns[node]=s

In [286]:
# For every class in cc, print for each node of that class the expression values (columns
# mapped to the node; empty list when none) of the first 10 patients, followed by the
# patient's classification.
for klass in set(cc.values()):
    for node in G.nodes():
        if cc[node] == klass:
            for patient in patients[:10]:
                print(node, 
                      "->", 
                      genes.loc[genes["name"] == patient][node2columns[node]].values.flatten().tolist(),
                      patient2class[patient]
                     )
    # Blank line separates consecutive classes.
    print()

268 -> [6.510771] healthy
268 -> [6.73014733333] healthy
268 -> [6.57660666667] healthy
268 -> [6.88865366667] diseased
268 -> [6.64805633333] healthy
268 -> [6.79450333333] diseased
268 -> [6.51031366667] diseased
268 -> [6.91681633333] healthy
268 -> [7.53295966667] diseased
268 -> [7.001618] healthy

25 -> [9.0069465] healthy
25 -> [8.799019] healthy
25 -> [9.07799] healthy
25 -> [8.6117495] diseased
25 -> [8.6472875] healthy
25 -> [8.407615] diseased
25 -> [8.638943] diseased
25 -> [8.7527485] healthy
25 -> [8.8694285] diseased
25 -> [8.8882625] healthy

134 -> [] healthy
134 -> [] healthy
134 -> [] healthy
134 -> [] diseased
134 -> [] healthy
134 -> [] diseased
134 -> [] diseased
134 -> [] healthy
134 -> [] diseased
134 -> [] healthy
223 -> [10.867582, 11.518238, 12.319121] healthy
223 -> [10.348589, 11.3404475, 11.965034] healthy
223 -> [8.866663, 10.208119, 11.279551] healthy
223 -> [10.554719, 11.883176, 12.286764] diseased
223 -> [9.387803, 10.6325765, 11.267876] healthy
223 -

In [251]:
# Spot check: values of two gene columns for a single sample.
genes.loc[genes["name"] == "GSM1574423"][["AKT3", "ADA"]].values.flatten().tolist()

[5.36777466667, 8.105445]

In [257]:
# List all sample identifiers (same expression that defines `patients`).
genes["name"].values.tolist()

['GSM1574423',
 'GSM1574424',
 'GSM1574425',
 'GSM1574426',
 'GSM1574427',
 'GSM1574428',
 'GSM1574429',
 'GSM1574430',
 'GSM1574431',
 'GSM1574432',
 'GSM1574433',
 'GSM1574434',
 'GSM1574435',
 'GSM1574436',
 'GSM1574437',
 'GSM1574438',
 'GSM1574439',
 'GSM1574440',
 'GSM1574441',
 'GSM1574442',
 'GSM1574443',
 'GSM1574444',
 'GSM1574445',
 'GSM1574446',
 'GSM1574447',
 'GSM1574448',
 'GSM1574449',
 'GSM1574450',
 'GSM1574451',
 'GSM1574452',
 'GSM1574453',
 'GSM1574454',
 'GSM1574455',
 'GSM1574456',
 'GSM1574457',
 'GSM1574458',
 'GSM1574459',
 'GSM1574460',
 'GSM1574461',
 'GSM1574462',
 'GSM1574463',
 'GSM1574464',
 'GSM1574465',
 'GSM1574466',
 'GSM1574467',
 'GSM1574468',
 'GSM1574469',
 'GSM1574470',
 'GSM1574471',
 'GSM1574472',
 'GSM1574473',
 'GSM1574474',
 'GSM1574475',
 'GSM1574476',
 'GSM1574477',
 'GSM1574478',
 'GSM1574479',
 'GSM1574480',
 'GSM1574481',
 'GSM1574482',
 'GSM1574483',
 'GSM1574484',
 'GSM1574485',
 'GSM1574486']

In [280]:
# Same expression that defines patient2class, evaluated here only to display the mapping.
{patient: classification.loc[classification["name"]==patient]["classification"].values.flatten().tolist()[0] for patient in patients}

{'GSM1574423': 'healthy',
 'GSM1574424': 'healthy',
 'GSM1574425': 'healthy',
 'GSM1574426': 'diseased',
 'GSM1574427': 'healthy',
 'GSM1574428': 'diseased',
 'GSM1574429': 'diseased',
 'GSM1574430': 'healthy',
 'GSM1574431': 'diseased',
 'GSM1574432': 'healthy',
 'GSM1574433': 'diseased',
 'GSM1574434': 'diseased',
 'GSM1574435': 'diseased',
 'GSM1574436': 'healthy',
 'GSM1574437': 'healthy',
 'GSM1574438': 'diseased',
 'GSM1574439': 'healthy',
 'GSM1574440': 'diseased',
 'GSM1574441': 'diseased',
 'GSM1574442': 'diseased',
 'GSM1574443': 'diseased',
 'GSM1574444': 'diseased',
 'GSM1574445': 'diseased',
 'GSM1574446': 'healthy',
 'GSM1574447': 'diseased',
 'GSM1574448': 'diseased',
 'GSM1574449': 'diseased',
 'GSM1574450': 'diseased',
 'GSM1574451': 'healthy',
 'GSM1574452': 'diseased',
 'GSM1574453': 'diseased',
 'GSM1574454': 'diseased',
 'GSM1574455': 'diseased',
 'GSM1574456': 'healthy',
 'GSM1574457': 'diseased',
 'GSM1574458': 'healthy',
 'GSM1574459': 'healthy',
 'GSM1574460': 