In [1]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging

from Bio.KEGG.KGML.KGML_parser import read
from ast import literal_eval
from urllib.request import urlopen
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import requests
from Levenshtein import ratio
import scipy.stats
import pathlib
from qf.cc import cardon_crochemore_colored, cardon_crochemore
import random

In [2]:
# Map from KEGG organism code to a human-readable organism name.
# NOTE(review): not referenced by any other cell visible in this notebook.
organismDict = {"hsa": "Homo sapiens (human)"}
# NOTE(review): absolute, user-specific path; no other visible cell reads this variable
# (the export cell below builds its own "/Users/boldi/Desktop/" path) — confirm it is still needed.
outputDirectory = "/Users/boldi/Desktop/pw/"
# Directory of the input tables read below (KEGG-pathways.txt, name2kegg.pkl, genes.csv, classification.csv).
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"
# Root directory under which the KGML files are looked up as <organism>/<organism><pathwayID>.xml.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"

In [3]:
# Table of the pathways to analyse, with every column read as a string.
# header=0 together with names=[...] replaces the header row of the file with these column names;
# keep_default_na=False keeps empty fields as empty strings instead of turning them into NaN.
pws = pd.read_csv(os.path.join(inputDirectory, "KEGG-pathways.txt"), header=0, 
            names=["Pathway ID", "Pathway type", "Pathway name"], dtype="string", keep_default_na=False)


In [4]:
# A single superpathway, "Involved", made of every (pathway ID, pathway name) pair of the table.
superpathwayDict = {"Involved": list(zip(pws["Pathway ID"], pws["Pathway name"]))}

In [5]:
# Superpathway (a key of superpathwayDict) and KEGG organism code used by all the cells below.
superpathway = "Involved"
organism = "hsa"

In [6]:
# NOTE(review): makesafe is not defined in this notebook (presumably it comes from one of the
# star imports — confirm). Its result is only used below as a directory name.
safeSuperpathway = makesafe(superpathway)

In [7]:
def words2set(words):
    """
        Splits a whitespace-separated string of words and returns the set of distinct words.
    """
    return {word for word in words.split()}

In [8]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        A relation of a pathway. Its fields are:
        - relType: the type (e.g. "PPRel")
        - relSubtypes: the list of subtypes (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - source: a set of components
        - target: a list of sets of components
    """
    relType: str
    relSubtypes: list
    source: set
    target: list

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Alternative constructor: builds a Relation from a KEGG relation of the given pathway.
            If the second entry of the relation is named "undefined", the target contains one set
            for each of its components (the names are looked up in pathway.entries); otherwise the
            target is the single set of names of the second entry.
        """
        source = words2set(relation.entry1.name)
        if relation.entry2.name != "undefined":
            target = [words2set(relation.entry2.name)]
        else:
            target = [words2set(pathway.entries[component.id].name)
                      for component in relation.entry2.components]
        return cls(relation.type, relation.subtypes, source, target)
        

In [9]:
def relationsFromKEGGpathway(pathway):
    """
        Converts every relation of a KEGG pathway into a Relation and returns the resulting list
        (in the iteration order of pathway.relations).
    """
    relations = []
    for keggRelation in pathway.relations:
        relations.append(Relation.fromKEGG(keggRelation, pathway))
    return relations
    

In [10]:
def pathwayFromFile(dataDirectory, organism, pathwayID):
    """
        Reads the pathway file specified by a root dataDirectory, the name of the organism subdirectory,
        and the name of the file [organism+pathwayID.xml], and returns what KGML_parser.read produces for it.
        The file is closed as soon as parsing is over, also when parsing fails.
    """
    filename = os.path.join(dataDirectory, organism, organism + pathwayID + ".xml")
    # NOTE(review): assumes KGML_parser.read consumes the whole file before returning
    # (true for Biopython's reader) — confirm for the KGML_parser that is in scope here.
    with open(filename, "r") as handle:
        return KGML_parser.read(handle)

In [11]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Loads the pathway file [organism+pathwayID.xml] found in the organism subdirectory of the
        root dataDirectory and returns the list of the relations of that pathway.
    """
    return relationsFromKEGGpathway(pathwayFromFile(dataDirectory, organism, pathwayID))

In [12]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        superpathwayDict maps each superpathway name to a list of pairs (pathwayID, pathwayName).
        Reads the pathway file of every pathway listed for the given superpathway and returns
        a dictionary from pathwayID to the list of relations of that pathway.
    """
    return {
        pathwayID: relationsFromFile(dataDirectory, organism, pathwayID)
        for pathwayID, _pathwayName in superpathwayDict[superpathway]
    }

In [13]:
def subtypes(rel):
    """
        Given a map whose values are lists of relations, collects the string form of the
        relSubtypes of every relation and numbers the distinct strings 0, 1, ... in sorted order.
        The result is the map from subtype string to its number.
    """
    distinct = {str(relation.relSubtypes) for relations in rel.values() for relation in relations}
    return {subtype: number for number, subtype in enumerate(sorted(distinct))}

In [14]:
def set2can(s):
    """
        Canonical string form of a set: the string representation of the sorted list of its elements.
    """
    canonical = list(s)
    canonical.sort()
    return str(canonical)

def can2set(c):
    """
        Inverse of set2can: parses a canonical string and returns the set of its elements.
    """
    elements = literal_eval(c)
    return set(elements)

In [15]:
def indices(rel):
    """
        Given a map whose values are lists of relations, scans all the relations in order and gives
        a progressive id (0, 1, ...) to every element and to every block, at their first appearance:
        - an element is a set of components appearing as source or in the target of some relation;
        - a block is the set of element ids appearing in the target of some relation.
        Elements are represented as strings (the canonical form of a set of strings),
        blocks as strings (the canonical form of a set of ints).
        Returns the four dictionaries element2id, id2element, block2id, id2block.
    """
    element2id = {}
    block2id = {}

    for relations in rel.values():
        for relation in relations:
            # setdefault evaluates len(...) before inserting, so an unseen key gets the next free id
            element2id.setdefault(set2can(relation.source), len(element2id))
            targetIds = set()
            for targ in relation.target:
                targetIds.add(element2id.setdefault(set2can(targ), len(element2id)))
            block2id.setdefault(set2can(targetIds), len(block2id))

    id2element = {number: element for element, number in element2id.items()}
    id2block = {number: block for block, number in block2id.items()}
    return element2id, id2element, block2id, id2block


In [16]:
def search_gene_KEGG(geneID, timeout=30):
    """
        Search for gene on KEGG.

        Fetches https://www.kegg.jp/entry/<geneID> (redirects are not followed) and takes the text
        of the first table cell of class "td11 defd", split on ", ".

        timeout is the number of seconds requests waits for the server; without it a stalled
        connection would block the caller forever.

        Returns list of names (an empty list if the page contains no such cell).
    """
    url = "https://www.kegg.jp/entry/" + geneID
    response = requests.get(url, allow_redirects=False, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    cell = soup.find("td", {"class": "td11 defd"})
    if cell is None:
        # e.g. unknown gene or a redirect response: there is nothing to parse
        return []
    return cell.getText().strip().split(", ")

In [17]:
# Read all the relations of the chosen superpathway, then derive from them the element/block
# indices, the numbering of the subtypes and the list of all distinct names appearing in some element.
# NOTE(review): set.union(*[...]) raises TypeError when element2id is empty (no relation was read).
rel = relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway)
element2id, id2element, block2id, id2block = indices(rel)
subtype2color = subtypes(rel)
elements = list(set.union(*[can2set(k) for k in element2id.keys()]))

In [18]:
# name2gene is a dictionary mapping names to gene IDs (hsa:xxxx)
# e.g. KCNE3, BRGDA6 etc are all mapped to hsa:10008 (see https://www.kegg.jp/entry/hsa:10008)
# The map is produced by scraping KEGG only when the pkl file does not exist yet;
# in that case it is also saved, so that every later run just reads the pkl file.
# NOTE(review): pickle is not imported explicitly in the import cell — presumably it arrives
# through one of the star imports; confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load a cache file you produced yourself.

name2keggFilename = os.path.join(inputDirectory, "name2kegg.pkl")

if os.path.exists(name2keggFilename):
    print("Reading gene names")
    with open(name2keggFilename, "rb") as handle:
        name2gene = pickle.load(handle)
else:
    print("Producing gene names")
    d = {}
    nelements = len(elements)
    for count,element in enumerate(elements):
        # progress report, one line every 10 elements
        if count % 10 == 0:
            print(f"{count}/{nelements}")
        # only elements starting with "hsa" are looked up (hard-coded, not derived from organism)
        if element.startswith("hsa"):
            names = search_gene_KEGG(element)
            # if the same name is returned for several elements, the last one scanned wins
            for name in names:
                d[name] = element
    with open(name2keggFilename, "wb") as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)
    name2gene = d

Reading gene names


In [19]:
# Read patient data
# genes.csv has a "name" column identifying the patient (presumably one row per patient — confirm);
# classification.csv has a "name" and a "classification" column.
print("Reading patient data")
genes = pd.read_csv(os.path.join(inputDirectory, "genes.csv"))
patients = genes["name"].values.tolist()
classification = pd.read_csv(os.path.join(inputDirectory, "classification.csv"))

# For every patient, the classification found in the first row of classification.csv with that name
# (IndexError if there is no such row). Each lookup scans the whole classification table.
patient2class = {patient: classification.loc[classification["name"]==patient]["classification"].values.flatten().tolist()[0] for patient in patients}

Reading patient data


In [20]:
# Split the genes frame in two by the class of the patient of each row:
# one frame for healthy individuals and one for diseased individuals.
patientClass = genes["name"].apply(lambda name: patient2class[name])
genesHealthy = genes.loc[patientClass == "healthy"]
genesDiseased = genes.loc[patientClass == "diseased"]

In [21]:
# Building the graph of all relations contained in rel values
# Nodes are IDs, and node x corresponds to the set of genes (and/or compound) id2element[x]
# Every edge carries a "color" attribute: the number of the subtypes of the relation it comes from.
# NOTE(review): nx and matplotlib are not imported explicitly in the import cell — presumably they
# arrive through one of the star imports; confirm.
# NOTE(review): assuming nx is networkx, G is a simple digraph, so when several relations connect
# the same (source, target) pair only the color of the last one is kept.
print("Building the relation graph")
G = nx.DiGraph() 
for relations in rel.values():
    for relation in relations:
        source = element2id[set2can(relation.source)]
        for targ in relation.target:
            target = element2id[set2can(targ)]
            G.add_edge(source, target, color=subtype2color[str(relation.relSubtypes)])
            

# Computing the minimum fibres
# cc maps every node to its class; the nodes with the same class form a fibre
cc=cardon_crochemore_colored(G)

colors = list(matplotlib.colors.CSS4_COLORS.values())
usedColors = 0

cc2fibresize = {}          # class -> number of nodes of its fibre
cc2fibre = {}              # class -> list of the nodes of its fibre
ccnontrivial = {}
nontrivialclass2col = {}   # class -> color assigned to it

fibres = []                # list of all fibres
node2fibre = {}            # node -> the fibre (list of nodes) containing it
for v in set(cc.values()):
    # the whole node list is scanned once for every class
    fibre = [x for x in G.nodes() if cc[x] == v]
    fibres += [fibre]
    for x in fibre:
        node2fibre[x] = fibre
    cc2fibre[v] = fibre
    cc2fibresize[v] = len(fibre)
    # NOTE(review): this test looks always true (a class presumably has at least one node), so every
    # class gets a color; the names suggest that "> 1" was meant — confirm before changing.
    if len(fibre) > 0:
        # NOTE(review): v is a class, but cc is indexed by node here — confirm what was intended.
        ccnontrivial[v] = cc[v]
        # NOTE(review): IndexError if there are more colored classes than entries in colors.
        nontrivialclass2col[v] = colors[usedColors]
        usedColors += 1

Building the relation graph


In [22]:
# Build node2columns, mapping each node of G to the list of genes corresponding to it
# for which we know a value (in the genes dataframe), sorted
#
# name2gene is inverted only once (gene ID -> names that are columns of the genes dataframe),
# instead of scanning the whole of name2gene for every gene of every node.

print("Mapping genes to columns")
namedGenes = set(name2gene.values())
# the first two columns of the genes dataframe are skipped (presumably they are not genes — confirm)
columns = set(genes.columns.values.tolist()[2:])

gene2columns = {}
for name, gene in name2gene.items():
    if name in columns:
        gene2columns.setdefault(gene, []).append(name)

node2columns = {}
for node in G.nodes():
    nodeGenes = can2set(id2element[node])
    # genes without any usable name are simply absent from gene2columns
    node2columns[node] = sorted(name for gene in nodeGenes for name in gene2columns.get(gene, ()))

Mapping genes to columns


In [23]:
# Color the entries of every pathway according to the class of the corresponding node, draw each
# pathway to a PDF file and write a tab-separated summary with one line per colored entry
# (pathway ID, node, class, name of the first graphics of the entry, element).
# NOTE(review): the destination is a hard-coded absolute path, distinct from outputDirectory.
outDir = os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway)
# created once, before the loop, so that all.tsv can be written even if there is no pathway
pathlib.Path(outDir).mkdir(parents=True, exist_ok=True)

rows = []
for pair in superpathwayDict[superpathway]:
    pathwayID = pair[0]
    pathway = pathwayFromFile(dataDirectory, organism, pathwayID)
    canvas = KGMLCanvas(pathway)
    for k in pathway.entries:
        entry = pathway.entries[k]
        canonicalName = set2can(words2set(entry.name))
        if canonicalName not in element2id:
            # the entry does not appear in any relation: its look is left untouched
            continue
        node = element2id[canonicalName]
        klass = cc[node]
        # classes without an assigned color are drawn in white
        # (the original white branch misspelled pathway.entries as pathway.enries)
        entryColor = nontrivialclass2col.get(klass, "#FFFFFF")
        entry.graphics[0].bgcolor = entryColor
        entry.graphics[0].fgcolor = entryColor
        rows.append("\t".join([pathwayID, str(node), str(klass), entry.graphics[0].name, canonicalName]) + "\n")
    canvas.import_imagemap = True
    pdfName = organism + pathwayID + ".pdf"
    canvas.draw(os.path.join(outDir, pdfName))
with open(os.path.join(outDir, "all.tsv"), "wt") as file:
    file.write("".join(rows))

In [24]:
def columns2nodes(columns, node2columns, ignoreEmpty = True):
    """
        Inverts node2columns on the given columns: returns a map from each column to the set of
        nodes whose column list contains it. Columns appearing in no node are left out,
        unless ignoreEmpty is False (in which case they are mapped to the empty set).
    """
    res = {}
    for column in columns:
        nodes = {node for node, nodeColumns in node2columns.items() if column in nodeColumns}
        if nodes or not ignoreEmpty:
            res[column] = nodes
    return res

# Map from every column appearing in at least one node to the set of nodes where it appears.
c2n = columns2nodes(columns, node2columns)
# NOTE(review): columns is a set, so the order of usableColumns follows its iteration order.
usableColumns = list(c2n.keys())

In [25]:
def areRelated(column1, column2, column2nodes, node2color, inDegThreshold = 0):
    """
        Tells whether two columns are related. They are if some node of column1 and some node of
        column2 (according to column2nodes) have the same color (according to node2color) and,
        among the colors they share, the largest in-degree of the first node of the corresponding
        fibre is at least inDegThreshold.
        In-degrees are read from the global graph G and fibres from the global map cc2fibre.
    """
    colors1 = {node2color[node] for node in column2nodes[column1]}
    colors2 = {node2color[node] for node in column2nodes[column2]}
    sharedColors = colors1.intersection(colors2)
    if not sharedColors:
        return False
    return max(G.in_degree(cc2fibre[color][0]) for color in sharedColors) >= inDegThreshold

In [63]:
(node2columns[249],node2columns[253])

(['VWF'], ['GP9'])

In [68]:
# For every edge of G whose color is the number of a subtype containing "activation", compute
# Kendall's tau between each gene of the source node and each gene of the target node, separately
# on healthy and diseased individuals. Then print, for each such edge, mean±std of the taus in the
# two groups and the difference of the two means, sorted by decreasing absolute difference.
activationColors = {color for subtype, color in subtype2color.items() if "activation" in subtype}

out = []
for s, t, d in G.edges(data=True):
    if d["color"] not in activationColors:
        continue
    tausH = []
    tausD = []
    for sg in node2columns[s]:
        healthySource = genesHealthy.loc[:, sg].values.tolist()
        diseasedSource = genesDiseased.loc[:, sg].values.tolist()
        for tg in node2columns[t]:
            healthyTarget = genesHealthy.loc[:, tg].values.tolist()
            diseasedTarget = genesDiseased.loc[:, tg].values.tolist()
            tausH.append(scipy.stats.kendalltau(healthySource, healthyTarget).statistic)
            tausD.append(scipy.stats.kendalltau(diseasedSource, diseasedTarget).statistic)
    if not tausH:
        # no pair of genes with known values on this edge
        continue
    meanH = pd.DataFrame(tausH).mean().values[0]
    meanD = pd.DataFrame(tausD).mean().values[0]
    if len(tausH) > 1:
        stdH = pd.DataFrame(tausH).std().values[0]
        stdD = pd.DataFrame(tausD).std().values[0]
    else:
        # a single pair: the standard deviation is reported as 0
        stdH = 0
        stdD = 0
    line = "{:4d}\t{:4d}\t{:5.2f}±{:5.2f}\t{:5.2f}±{:5.2f}".format(s, t, meanH, stdH, meanD, stdD)
    out.append((line, meanH - meanD))

out.sort(key=lambda pair: abs(pair[1]), reverse=True)
for o, v in out:
    print(o, "\t", v)

 249	 253	-0.16± 0.00	 0.30± 0.00 	 -0.4622268537416605
  77	 112	 0.51± 0.00	 0.14± 0.00 	 0.36765584214504116
  98	  96	 0.17± 0.05	-0.19± 0.06 	 0.35986670530281073
  13	  46	-0.16± 0.00	 0.13± 0.00 	 -0.28861199652274705
 227	  10	-0.01± 0.00	-0.30± 0.00 	 0.2860620110113011
 198	 173	-0.06± 0.00	 0.20± 0.00 	 -0.25911330049261083
 249	 252	-0.11± 0.00	 0.14± 0.00 	 -0.2514994493423496
  25	 231	 0.05± 0.01	 0.28± 0.18 	 -0.23309185743262822
  23	  11	-0.27± 0.11	-0.04± 0.12 	 -0.2326137351492321
  62	   7	 0.00± 0.08	-0.22± 0.04 	 0.21926977687626775
 265	  49	 0.24± 0.11	 0.02± 0.10 	 0.21706751666183713
  27	  34	-0.14± 0.12	-0.34± 0.10 	 0.20588240172417813
  26	  27	 0.18± 0.01	 0.38± 0.07 	 -0.19842893363646308
 226	 227	-0.06± 0.06	 0.13± 0.13 	 -0.19681251811069256
 133	 134	-0.15± 0.14	 0.04± 0.18 	 -0.19248526997005694
 249	 251	 0.14± 0.00	 0.33± 0.00 	 -0.18984279040123606
 266	 226	-0.08± 0.00	 0.10± 0.16 	 -0.1813387423935091
 258	 227	 0.15± 0.34	-0.03± 0.27 	 0.1797

In [84]:
# Draw random pairs of distinct usable columns and compute Kendall's tau of their values on healthy
# individuals, until SAMPLE_SIZE taus have been collected both for related pairs (reltaus) and for
# unrelated pairs (unreltaus); relatedness is areRelated with in-degree threshold 1.
# NOTE(review): the loop never ends if one of the two groups contains no pair at all.
SAMPLE_SIZE = 300
# fixed seed and sorted candidates, so that Restart & Run All draws the same pairs
# (the order of usableColumns itself comes from a set and may change between runs)
random.seed(42)
candidateColumns = sorted(usableColumns)

reltaus = []
unreltaus = []

while len(reltaus) < SAMPLE_SIZE or len(unreltaus) < SAMPLE_SIZE:
    gene1 = random.choice(candidateColumns)
    gene2 = random.choice(candidateColumns)
    if gene1 == gene2:
        continue
    bucket = reltaus if areRelated(gene1, gene2, c2n, cc, 1) else unreltaus
    if len(bucket) >= SAMPLE_SIZE:
        # this group is already complete: do not compute a tau that would be thrown away
        continue
    expr1 = genesHealthy.loc[:,gene1].values.tolist()
    expr2 = genesHealthy.loc[:,gene2].values.tolist()
    bucket.append(scipy.stats.kendalltau(expr1, expr2).statistic)


In [64]:
len(reltaus)

300

In [65]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="two-sided")

KstestResult(statistic=0.06666666666666667, pvalue=0.5182685170344619, statistic_location=0.029556650246305414, statistic_sign=1)

In [66]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="less")

KstestResult(statistic=0.03, pvalue=0.7636919236655174, statistic_location=-0.08374384236453201, statistic_sign=-1)

In [67]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="greater")

KstestResult(statistic=0.06666666666666667, pvalue=0.26392295576403385, statistic_location=0.029556650246305414, statistic_sign=1)

In [70]:
pd.DataFrame(unreltaus).describe()

Unnamed: 0,0
count,300.0
mean,0.006053
std,0.181016
min,-0.458128
25%,-0.1133
50%,0.009852
75%,0.1133
max,0.541872


In [69]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="two-sided")

KstestResult(statistic=0.06666666666666667, pvalue=0.5182685170344619, statistic_location=0.029556650246305414, statistic_sign=1)

In [32]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="less")

KstestResult(statistic=0.030089771789858444, pvalue=8.370404893436451e-08, statistic_location=-0.06419753086419754, statistic_sign=-1)

In [33]:
scipy.stats.ks_2samp(reltaus, unreltaus, alternative="greater")

KstestResult(statistic=0.00322026084251692, pvalue=0.8279876070229103, statistic_location=0.1728400330446471, statistic_sign=1)

In [34]:
G.number_of_edges()

376

In [35]:
pd.DataFrame(reltaus).describe()

Unnamed: 0,0
count,9984.0
mean,0.027991
std,0.210601
min,-0.561576
25%,-0.098522
50%,0.014815
75%,0.128079
max,1.0


In [36]:
pd.DataFrame(unreltaus).describe()

Unnamed: 0,0
count,90016.0
mean,0.011352
std,0.180171
min,-0.630542
25%,-0.1133
50%,0.009852
75%,0.128079
max,0.753695


In [55]:
fibres

[[225],
 [260],
 [13],
 [7],
 [14],
 [17],
 [170],
 [29],
 [23, 26],
 [33],
 [139],
 [230],
 [129],
 [151],
 [64],
 [74],
 [123],
 [96, 105, 124],
 [86, 87],
 [67],
 [68],
 [62],
 [44],
 [61, 70, 78, 126],
 [8],
 [102],
 [41],
 [21],
 [0],
 [45],
 [22],
 [232],
 [236],
 [10],
 [138, 192],
 [130, 134],
 [133],
 [120],
 [149],
 [43],
 [229],
 [51],
 [217],
 [241, 242],
 [48],
 [50],
 [5],
 [104],
 [101],
 [11],
 [4],
 [30],
 [9, 20],
 [27],
 [93, 137, 169, 178, 189, 223, 228],
 [147],
 [100],
 [19],
 [71],
 [246],
 [231],
 [18],
 [153],
 [150, 152, 154],
 [156],
 [221],
 [103],
 [12],
 [47],
 [166],
 [90, 91],
 [224],
 [113],
 [59],
 [85, 88, 92, 94, 95],
 [25,
  66,
  98,
  110,
  111,
  132,
  136,
  145,
  155,
  159,
  177,
  193,
  198,
  201,
  202,
  208,
  214,
  227,
  235,
  237,
  239,
  240,
  251,
  254,
  258,
  259,
  262,
  263,
  264,
  265,
  267],
 [83, 253],
 [162],
 [37],
 [3],
 [220, 222],
 [54],
 [160,
  163,
  165,
  168,
  172,
  173,
  174,
  175,
  176,
  179,


In [60]:
cc2fibresize

{0: 1,
 1: 1,
 2: 1,
 3: 1,
 4: 1,
 5: 1,
 6: 1,
 7: 1,
 8: 2,
 9: 1,
 10: 1,
 11: 1,
 12: 1,
 13: 1,
 14: 1,
 15: 1,
 16: 1,
 17: 3,
 18: 2,
 19: 1,
 20: 1,
 21: 1,
 22: 1,
 23: 4,
 24: 1,
 25: 1,
 26: 1,
 27: 1,
 28: 1,
 29: 1,
 30: 1,
 31: 1,
 32: 1,
 33: 1,
 34: 2,
 35: 2,
 36: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 41: 1,
 42: 1,
 43: 2,
 44: 1,
 45: 1,
 46: 1,
 47: 1,
 48: 1,
 49: 1,
 50: 1,
 51: 1,
 52: 2,
 53: 1,
 54: 7,
 55: 1,
 56: 1,
 57: 1,
 58: 1,
 59: 1,
 60: 1,
 61: 1,
 62: 1,
 63: 3,
 64: 1,
 65: 1,
 66: 1,
 67: 1,
 68: 1,
 69: 1,
 70: 2,
 71: 1,
 72: 1,
 73: 1,
 74: 5,
 75: 31,
 76: 2,
 77: 1,
 78: 1,
 79: 1,
 80: 2,
 81: 1,
 82: 26,
 83: 2,
 84: 1,
 85: 52,
 86: 1,
 87: 1,
 88: 3,
 89: 1,
 90: 1,
 91: 1,
 92: 1,
 93: 1,
 94: 1,
 95: 1,
 96: 1,
 97: 1,
 98: 2,
 99: 1,
 100: 1,
 101: 2,
 102: 1,
 103: 1,
 104: 1,
 105: 3,
 106: 1,
 107: 1,
 108: 1,
 109: 5,
 110: 2,
 111: 1,
 112: 1,
 113: 2,
 114: 1,
 115: 1,
 116: 1,
 117: 1,
 118: 2,
 119: 1,
 120: 1,
 121: 2}

In [58]:
print(len(set(cc.values())))

122


In [26]:
# NOTE(review): relation is not defined by this cell; it is whatever value the last executed loop
# over relations left behind, so the result depends on the execution history of the kernel.
subtype2color[str(relation.relSubtypes)]

3

In [27]:
subtype2color

{"[('activation', '-->'), ('dephosphorylation', '-p')]": 0,
 "[('activation', '-->'), ('indirect effect', '..>')]": 1,
 "[('activation', '-->'), ('phosphorylation', '+p')]": 2,
 "[('activation', '-->')]": 3,
 "[('binding/association', '---')]": 4,
 "[('compound', 170), ('activation', '-->'), ('indirect effect', '..>')]": 5,
 "[('compound', 24)]": 6,
 "[('compound', 25)]": 7,
 "[('compound', 277), ('activation', '-->')]": 8,
 "[('expression', '-->')]": 9,
 "[('indirect effect', '..>')]": 10,
 "[('inhibition', '--|'), ('dephosphorylation', '-p')]": 11,
 "[('inhibition', '--|'), ('indirect effect', '..>')]": 12,
 "[('inhibition', '--|'), ('phosphorylation', '+p')]": 13,
 "[('inhibition', '--|'), ('ubiquitination', '+u')]": 14,
 "[('inhibition', '--|')]": 15,
 "[('phosphorylation', '+p'), ('indirect effect', '..>')]": 16,
 "[('phosphorylation', '+p')]": 17,
 "[('state change', '...')]": 18,
 '[]': 19}

In [None]:
for relation in 