In [1]:
from hypergraphs import *
from kegg import *

In [26]:
import csv
import logging
import os
import pathlib
from urllib.parse import quote_plus
from urllib.parse import urljoin
from urllib.request import urlopen

import requests
import scipy
import scipy.stats
from Bio.KEGG.KGML.KGML_parser import read
from bs4 import BeautifulSoup
from chemspipy import ChemSpider
from Levenshtein import ratio

In [3]:
# Superpathway name -> list of (pathway number, pathway label) pairs.
# The pathway number is appended to an organism code (e.g. "hsa" + "00010") to build
# pathway identifiers and file names; the label is only used for display.
superpathwayDict = {
    "Carbohydrate metabolism": [
        ("00010", "Glycolysis / Gluconeogenesis"),
        ("00020", "Citrate cycle (TCA cycle)"),
        ("00030", "Pentose phosphate pathway"),
        ("00040", "Pentose and glucuronate interconversions"),
        ("00051", "Fructose and mannose metabolism"),
        ("00052", "Galactose metabolism"),
        ("00053", "Ascorbate and aldarate metabolism"),
        ("00500", "Starch and sucrose metabolism"),
        ("00520", "Amino sugar and nucleotide sugar metabolism"),
        ("00620", "Pyruvate metabolism"),
        ("00630", "Glyoxylate and dicarboxylate metabolism"),
        ("00640", "Propanoate metabolism"),
        ("00650", "Butanoate metabolism"),
        ("00562", "Inositol phosphate metabolism"),
    ],
    "Energy metabolism": [
        ("00190", "Oxidative phosphorylation"),
        ("00910", "Nitrogen metabolism"),
        ("00920", "Sulfur metabolism"),
    ],
    "Lipid metabolism": [
        ("00061", "Fatty acid biosynthesis"),
        ("00062", "Fatty acid elongation"),
        ("00071", "Fatty acid degradation"),
        ("00100", "Steroid biosynthesis"),
        ("00120", "Primary bile acid biosynthesis"),
        ("00140", "Steroid hormone biosynthesis"),
        ("00561", "Glycerolipid metabolism"),
        ("00564", "Glycerophospholipid metabolism"),
        ("00565", "Ether lipid metabolism"),
        ("00600", "Sphingolipid metabolism"),
        ("00590", "Arachidonic acid metabolism"),
        ("00591", "Linoleic acid metabolism"),
        ("00592", "alpha-Linolenic acid metabolism"),
        ("01040", "Biosynthesis of unsaturated fatty acids"),
    ],
    "Nucleotide metabolism": [
        ("00230", "Purine metabolism"),
        ("00240", "Pyrimidine metabolism"),
    ],
    "Amino acid metabolism": [
        ("00250", "Alanine, aspartate and glutamate metabolism"),
        ("00260", "Glycine, serine and threonine metabolism"),
        ("00270", "Cysteine and methionine metabolism"),
        ("00280", "Valine, leucine and isoleucine degradation"),
        ("00290", "Valine, leucine and isoleucine biosynthesis"),
        ("00310", "Lysine degradation"),
        ("00220", "Arginine biosynthesis"),
        ("00330", "Arginine and proline metabolism"),
        ("00340", "Histidine metabolism"),
        ("00350", "Tyrosine metabolism"),
        ("00360", "Phenylalanine metabolism"),
        ("00380", "Tryptophan metabolism"),
        ("00400", "Phenylalanine, tyrosine and tryptophan biosynthesis"),
    ],
    "Metabolism of other amino acids": [
        ("00410", "beta-Alanine metabolism"),
        ("00430", "Taurine and hypotaurine metabolism"),
        ("00440", "Phosphonate and phosphinate metabolism"),
        ("00450", "Selenocompound metabolism"),
        ("00470", "D-Amino acid metabolism"),
        ("00480", "Glutathione metabolism"),
    ],
    "Glycan biosynthesis and metabolism": [
        ("00510", "N-Glycan biosynthesis"),
        ("00513", "Various types of N-glycan biosynthesis"),
        ("00512", "Mucin type O-glycan biosynthesis"),
        ("00515", "Mannose type O-glycan biosynthesis"),
        ("00514", "Other types of O-glycan biosynthesis"),
        ("00532", "Glycosaminoglycan biosynthesis - chondroitin sulfate / dermatan sulfate"),
        ("00534", "Glycosaminoglycan biosynthesis - heparan sulfate / heparin"),
        ("00533", "Glycosaminoglycan biosynthesis - keratan sulfate"),
        ("00531", "Glycosaminoglycan degradation"),
        ("00563", "Glycosylphosphatidylinositol (GPI)-anchor biosynthesis"),
        ("00601", "Glycosphingolipid biosynthesis - lacto and neolacto series"),
        ("00603", "Glycosphingolipid biosynthesis - globo and isoglobo series"),
        ("00604", "Glycosphingolipid biosynthesis - ganglio series"),
        ("00511", "Other glycan degradation"),
    ],
    "Metabolism of cofactors and vitamins": [
        ("00730", "Thiamine metabolism"),
        ("00740", "Riboflavin metabolism"),
        ("00750", "Vitamin B6 metabolism"),
        ("00760", "Nicotinate and nicotinamide metabolism"),
        ("00770", "Pantothenate and CoA biosynthesis"),
        ("00780", "Biotin metabolism"),
        ("00785", "Lipoic acid metabolism"),
        ("00790", "Folate biosynthesis"),
        ("00670", "One carbon pool by folate"),
        ("00830", "Retinol metabolism"),
        ("00860", "Porphyrin metabolism"),
        ("00130", "Ubiquinone and other terpenoid-quinone biosynthesis"),
    ],
    "Metabolism of terpenoids and polyketides": [
        ("00900", "Terpenoid backbone biosynthesis"),
    ],
    "Biosynthesis of other secondary metabolites": [
        ("00232", "Caffeine metabolism"),
        ("00524", "Neomycin, kanamycin and gentamicin biosynthesis"),
    ],
    "Xenobiotics biodegradation and metabolism": [
        ("00980", "Metabolism of xenobiotics by cytochrome P450"),
        ("00982", "Drug metabolism - cytochrome P450"),
        ("00983", "Drug metabolism - other enzymes"),
    ],
}

In [4]:
# Organisms to process: organism code -> human-readable name.
# The code is used to build pathway identifiers and directory names; the name appears in index titles.
organismDict = {"hsa": "Homo sapiens (human)"}

In [5]:
# Root directory under which the generated PDFs and index.html files are written.
# NOTE(review): absolute, machine-specific path -- adjust before running on another machine.
outputDirectory = "/Users/boldi/Desktop/pw/"
# Root directory of the local pathway data; per-organism KGML/SBML files are read from
# <dataDirectory>/<organism>/.
dataDirectory = "../../LaTeX/Data/KEGG-Pathways/"

In [6]:
def lratio(needle, haystack):
    """
        Given a string needle and a collection of strings haystack, returns the maximum
        Levenshtein ratio between needle and the elements of haystack (case insensitive match).
        This value ranges from 1 (the needle is present in the haystack) to 0.

        A haystack that is itself a single string is treated as a one-element collection
        (otherwise the needle would be compared against its individual characters).
        An empty haystack yields 0.0 instead of raising.
    """
    if isinstance(haystack, str):
        haystack = [haystack]
    lowered_needle = needle.lower()
    # default=0.0 covers the empty haystack: "no candidate" means "no similarity"
    return max((ratio(lowered_needle, hay.lower()) for hay in haystack), default=0.0)

In [7]:
def search_compound_KEGG(compound_name, compound_formula=None):
    """
        Search for compound on KEGG. If compound_formula is provided, the formula is used
        as the query term instead of the name.

        Whatever the query term, candidates are always ranked by the Levenshtein ratio
        between compound_name and the names KEGG lists for each candidate.

        Returns ID, list of names as from KEGG, best Levenshtein ratio (w.r.t. the names).
        If nothing is found, IDs and list are both None, and ratio is 0.0.
    """
    # The two search modes differ only in the query term.
    query = compound_name if compound_formula is None else compound_formula
    search_url = ("https://www.kegg.jp/kegg-bin/search_ligand?query=" + quote_plus(query)
                  + "&column=entry%2Bname%2Bformula&DATABASE=compound&STEP=1000")
    first_response = requests.get(search_url, allow_redirects=False)
    location = first_response.headers.get("Location")
    if location is None:
        # No redirect was issued: parse whatever came back directly instead of failing
        # with a KeyError; a page without result cells simply yields "nothing found".
        logging.info(f"KEGG search for {query} did not redirect (status {first_response.status_code})")
        result_page = first_response
    else:
        # urljoin resolves relative targets against kegg-bin/ and leaves absolute URLs intact.
        result_page = requests.get(urljoin("https://www.kegg.jp/kegg-bin/", location))
    soup = BeautifulSoup(result_page.text, "html.parser")
    cells = soup.find_all("td", {"class": "data1"})
    # Result cells are consumed in groups of five: index 1 holds the compound ID,
    # index 3 the names (one per text fragment).
    candidates = {}
    for i in range(len(cells) // 5):
        compound_id = cells[5 * i + 1].getText()
        candidates[compound_id] = cells[5 * i + 3].getText(strip=True, separator="|").split("|")
    if not candidates:
        return None, None, 0.0
    # Score every candidate once; max() keeps the first one on ties.
    scored = [(lratio(compound_name, names), cid, names) for cid, names in candidates.items()]
    best_ratio, best_id, best_names = max(scored, key=lambda entry: entry[0])
    return best_id, best_names, best_ratio

In [8]:
# NOTE(review): the default below is a hard-coded API key committed with the notebook.
# It is kept only so existing calls keep working; prefer passing the key explicitly
# (e.g. read from an environment variable) and consider revoking this one.
def search_compound_chemspider(compound_name, chemspider_api_key="ZOoKYwWPa9AIpeSu3f90LWZ9NHn5TY2A"):
    """
        Search for compound on ChemSpider, using a valid API key.

        Returns common name, molecular formula, best Levenshtein ratio (w.r.t. the common names
        of the results). If nothing is found, the search does not complete, or
        chemspider_api_key is None, the first two entries are both None and the ratio is 0.0.
    """

    if chemspider_api_key is None:
        return None, None, 0.0
    cs = ChemSpider(chemspider_api_key)
    results = cs.search(compound_name)
    results.wait()
    if results.status != "Complete":
        logging.info(f"Failed search from ChemSpider, status: {results.status}")
        logging.info(f"Message: {results.message}")
        return None, None, 0.0
    if len(results) == 0:
        return None, None, 0.0
    best_result, best_ratio = None, -1.0
    for i in range(len(results)):
        candidate = results[i]
        # The common name is wrapped in a list: handing the bare string to lratio would
        # compare the query against the name's individual characters.
        # `or ""` guards against a missing name -- presumably possible; TODO confirm.
        candidate_ratio = lratio(compound_name, [candidate.common_name or ""])
        if candidate_ratio > best_ratio:  # strict '>' keeps the first result on ties
            best_result, best_ratio = candidate, candidate_ratio
    return best_result.common_name, best_result.molecular_formula, best_ratio

In [9]:
def search_compound(compound_name):
    """
        Look a compound up on KEGG by name. When the best name match is poor (ratio below 0.5)
        or missing, ask ChemSpider for a molecular formula and query KEGG again with it.

        Returns ID, list of names as from KEGG, best Levenshtein ratio (w.r.t. the names),
        taken from whichever of the two KEGG queries scored higher.
        If nothing is found, IDs and list are both None, and ratio is 0.0.
    """
    logging.info(f"Looking for {compound_name} on KEGG")
    cid, cnames, keggratio = search_compound_KEGG(compound_name)
    by_name = (cid, cnames, keggratio)
    logging.info(f"Best match {cnames} with ratio {keggratio}")

    # Good enough by name: no need for the ChemSpider detour.
    if not keggratio < 0.5:
        return by_name

    logging.info("Insufficient! Trying ChemSpider")
    spname, spformula, spratio = search_compound_chemspider(compound_name)
    logging.info(f"ChemSpider best match {spname} with ratio {spratio} and formula {spformula}")
    if spformula is None:
        return by_name

    # Strip the markup characters '_', '{' and '}' from the formula before querying KEGG.
    markup_remover = str.maketrans("", "", "_{}")
    compound_formula = spformula.translate(markup_remover)
    logging.info(f"Looking for {compound_formula} on KEGG")
    by_formula = search_compound_KEGG(compound_name, compound_formula)
    sid, snames, skeggratio = by_formula
    logging.info(f"Best match {snames} with ratio {skeggratio}")

    # The formula-based hit only wins when it is strictly better than the name-based one.
    return (sid, snames, skeggratio) if skeggratio > keggratio else by_name

In [10]:
def convert_list_to_csv(input_filename, output_filename):
    """
        Read compound names from input_filename (one per line, blank lines skipped), look each
        one up with search_compound, and write one CSV row per compound to output_filename:

            <KEGG ID>,"<KEGG names joined by ' | '>","<original name>",<Levenshtein ratio>

        Compounds with no match get '?' in the first two columns.
        Double quotes inside quoted fields are doubled so the file can be read back
        with csv.reader.
    """
    def quoted(text):
        # CSV escaping: an embedded double quote is written as two double quotes
        return '"' + text.replace('"', '""') + '"'

    with open(input_filename) as f:
        compounds = f.read().splitlines()
    with open(output_filename, "w") as f:
        for compound in compounds:
            if not compound.strip():
                continue
            cid, cnames, lr = search_compound(compound)
            if cid is not None:
                jnames = " | ".join(cnames)
                f.write(f"{cid},{quoted(jnames)},{quoted(compound)},{lr}\n")
            else:
                f.write(f"?,?,{quoted(compound)},{lr}\n")
            # Lookups are slow network calls: flush so partial results survive an interruption
            f.flush()
In [11]:
def read_compound_csv(filename, min_levenshtein=0.9):
    """
        Read a CSV file with rows of the form (ID, names, original name, Levenshtein ratio),
        as written by convert_list_to_csv.

        Returns a list of (ID, original name) pairs for the rows whose ratio (4th column)
        is at least min_levenshtein. Blank lines are ignored.
    """
    res = []
    # newline="" lets the csv module handle line endings itself (recommended by the csv docs)
    with open(filename, "r", newline="") as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            if float(row[3]) < min_levenshtein:
                continue
            res.append((row[0], row[2]))
    return res

In [12]:
# Colours assigned to the compounds of list A and list B; also used as keys when
# counting how many coloured compounds each pathway contains.
valA = matplotlib.colors.cnames["red"]
valB = matplotlib.colors.cnames["blue"]

# Colour given to reactant compounds that have no entry in the compound-to-colour mapping.
yellow = matplotlib.colors.cnames["yellow"]

def writeKEGGpdfcolors(outputDirectory, organism, superpathway, superpathwayDict, compound2color):
    """
        Download the KGML of every pathway of the given superpathway for the given organism,
        colour its reactant compounds and draw each pathway to
        <outputDirectory>/<organism>/<safe superpathway name>/<organism><pathway number>.pdf.

        Reactant compounds found in compound2color get that colour (background and foreground);
        all other reactant compounds are drawn in yellow. Pathways that cannot be downloaded
        or parsed are reported and skipped.

        Returns a list of tuples (pdf file name, pathway label, number of distinct compounds
        coloured valA, number of distinct compounds coloured valB), one per drawn pathway.
    """
    index = []
    safeSuperpathway = makesafe(superpathway)
    # Created up front (not only after a successful download) so that the directory exists
    # even when every pathway of the superpathway is skipped.
    targetDirectory = os.path.join(outputDirectory, organism, safeSuperpathway)
    pathlib.Path(targetDirectory).mkdir(parents=True, exist_ok=True)
    for pw in superpathwayDict[superpathway]:
        # colour -> set of distinct compounds drawn in that colour within this pathway
        countColor = {color: set() for color in set(compound2color.values())}
        try:
            pathway = KGML_parser.read(kegg_get(organism + pw[0], "kgml"))
        except Exception:
            # Best effort: a missing pathway must not abort the whole run
            # (but Ctrl-C / SystemExit are no longer swallowed).
            print("Pathway", pw, "could not be downloaded: ignoring")
            continue
        canvas = KGMLCanvas(pathway)
        for entry in pathway.entries.values():
            if entry.type == "compound" and entry.is_reactant:
                compound = entry.name[4:]  # drops the first four characters (presumably a "cpd:" prefix -- TODO confirm)
                if compound in compound2color:
                    color = compound2color[compound]
                    countColor[color].add(compound)
                else:
                    color = yellow
                # Background and foreground get the same colour in both cases
                # (the unmatched branch used to set bgcolor twice and never fgcolor).
                entry.graphics[0].bgcolor = color
                entry.graphics[0].fgcolor = color
        canvas.import_imagemap = True
        pdfName = organism + pw[0] + ".pdf"
        canvas.draw(os.path.join(targetDirectory, pdfName))
        # .get(): a colour that no compound maps to has no bucket and counts as zero
        index.append((pdfName, pw[1], len(countColor.get(valA, ())), len(countColor.get(valB, ()))))
    return index

In [13]:
def writeIndex(directory, index, title):
    """
        Write <directory>/index.html: a heading with the given title followed by a table
        with one row per element of index.

        Each element of index is a 4-tuple (link target, link text, count for list A,
        count for list B). Values are inserted into the HTML as they are, without escaping.
    """
    preamble = (
        "<!DOCTYPE html>\n<html>\n",
        f"\t<h1>{title}</h1>\n",
        "\t<table border=1>\n",
        "\t\t<thead><tr><th>Pathway<th>ListA<th>ListB</thead>\n",
        "\t\t<tbody>\n",
    )
    row_template = "\t\t<tr><td><a href=\"{}\">{}</a><td>{}<td>{}\n"
    closing = (
        "\t</tbody>\n",
        "\t</table>\n",
        "</html>\n",
    )
    with open(os.path.join(directory, "index.html"), "w") as file:
        file.writelines(preamble)
        file.writelines(row_template.format(link, anchor, cA, cB) for link, anchor, cA, cB in index)
        file.writelines(closing)


In [14]:
# Compounds of the two lists, as (ID, original name) pairs with a sufficiently good match.
tA = read_compound_csv("../../LaTeX/Data/KEGG-Pathways/List-of-compounds/listA-complete.csv")
tB = read_compound_csv("../../LaTeX/Data/KEGG-Pathways/List-of-compounds/listB-complete.csv")

# Compound ID -> colour. List B is applied after list A, so a compound present in both
# lists ends up with B's colour.
compound2color = {cid: valA for cid, _name in tA}
compound2color.update((cid, valB) for cid, _name in tB)

In [15]:
### PRODUCE all KEGG pdfs and index.html
# Currently disabled: the loop runs over an empty list. Iterate over organismDict.keys()
# instead to regenerate the PDFs and the index pages.
for organism in []:  # organismDict.keys()
    superIndex = []
    for superpathway in superpathwayDict:
        safeSuperpathway = makesafe(superpathway)
        index = writeKEGGpdfcolors(outputDirectory, organism, superpathway, superpathwayDict, compound2color)
        # Per-superpathway totals: columns 2 and 3 of each index row are the A and B counts
        sumA = sum(row[2] for row in index)
        sumB = sum(row[3] for row in index)
        superpathwayTitle = f"{organismDict[organism]}, {superpathway} (A: {sumA}, B: {sumB})"
        writeIndex(os.path.join(outputDirectory, organism, safeSuperpathway), index, superpathwayTitle)
        superIndex.append((safeSuperpathway, superpathway, sumA, sumB))
    # Per-organism totals, linking to the superpathway directories
    sumA = sum(row[2] for row in superIndex)
    sumB = sum(row[3] for row in superIndex)
    organismTitle = f"{organismDict[organism]} (A: {sumA}, B: {sumB})"
    writeIndex(os.path.join(outputDirectory, organism), superIndex, organismTitle)

In [16]:
### PRODUCE a dictionary compounds with the set of compounds appearing in the whole organism, in each
### superpathway and in each pathway
# Keys are organism codes, superpathway names and pathway numbers, all in one dictionary.
# NOTE: superpathway and pathway entries are reset for every organism, so with more than
# one organism they would only hold the data of the last organism processed.
compounds = {}
for organism in organismDict.keys():
    compounds[organism] = set()
    for superpathway in superpathwayDict.keys():
        compounds[superpathway] = set()
        for pw in superpathwayDict[superpathway]:
            kgmlPath = os.path.join(dataDirectory, organism, f"{organism}{pw[0]}.xml")
            # Context manager: the KGML file is closed as soon as it has been parsed
            with open(kgmlPath, "r") as kgmlFile:
                pathway = read(kgmlFile)
            # Only reactant compounds are kept; the first four characters of the name are dropped
            compounds[pw[0]] = {x.name[4:] for x in pathway.compounds if x.is_reactant}
            compounds[superpathway] |= compounds[pw[0]]
            compounds[organism] |= compounds[pw[0]]

In [17]:

def printLine(key, compoundDict, listsOfInterest, label):
    """
        Print one CSV-style line describing the set stored in compoundDict under key:
        the quoted key, the quoted label, the size of the set and then, for every value of
        listsOfInterest (in dictionary order), the size of its intersection with the set.
    """
    members = compoundDict[key]
    overlapSizes = [str(len(members & candidates)) for candidates in listsOfInterest.values()]
    intersections = ",".join(overlapSizes)
    print(f'"{key}","{label}",{len(members)},{intersections}')
            
# Re-read the two compound lists so this cell does not depend on the colouring cells.
tA = read_compound_csv("../../LaTeX/Data/KEGG-Pathways/List-of-compounds/listA-complete.csv")
tB = read_compound_csv("../../LaTeX/Data/KEGG-Pathways/List-of-compounds/listB-complete.csv")
setA = {cid for cid, _name in tA}
setB = {cid for cid, _name in tB}

# Same pair of lists for every printed row, so it is built once.
listsOfInterest = {'A': setA, 'B': setB}

# One row per organism, then per superpathway, then per pathway:
# key, label, number of compounds, overlap with A, overlap with B.
for organism in organismDict.keys():
    # The organism row is labelled with the organism code itself rather than a literal "hsa",
    # so the label stays correct if organismDict gains further entries.
    printLine(organism, compounds, listsOfInterest, organism)
    for superpathway in superpathwayDict.keys():
        printLine(superpathway, compounds, listsOfInterest, superpathway)
        for pw in superpathwayDict[superpathway]:
            printLine(pw[0], compounds, listsOfInterest, pw[1])

"hsa","hsa",1563,39,12
"Carbohydrate metabolism","Carbohydrate metabolism",218,13,5
"00010","Glycolysis / Gluconeogenesis",26,3,0
"00020","Citrate cycle (TCA cycle)",20,0,0
"00030","Pentose phosphate pathway",22,3,0
"00040","Pentose and glucuronate interconversions",19,1,1
"00051","Fructose and mannose metabolism",20,1,0
"00052","Galactose metabolism",27,0,0
"00053","Ascorbate and aldarate metabolism",9,0,1
"00500","Starch and sucrose metabolism",18,0,2
"00520","Amino sugar and nucleotide sugar metabolism",42,3,1
"00620","Pyruvate metabolism",23,0,0
"00630","Glyoxylate and dicarboxylate metaboliscm",32,0,1
"00640","Propanoate metabolism",22,2,1
"00650","Butanoate metabolism",15,2,1
"00562","Inositol phosphate metabolism",30,1,0
"Energy metabolism","Energy metabolism",14,2,0
"00190","Oxidative phosphorylation",0,0,0
"00910","Nitrogen metabolism",6,0,0
"00920","Sulfur metabolism",8,2,0
"Lipid metabolism","Lipid metabolism",442,7,2
"00061","Fatty acid biosynthesis",48,1,0
"00062","Fatty a

In [18]:
# Build the hypergraph from the SBML files of every pathway of every superpathway.
# NOTE: `organism` is not assigned in this cell; it is whatever value an earlier loop
# left in the kernel.
H, sr2r, r2ss, rid2rname, mid2mname = read_hg_from_SBML(
    [
        os.path.join(dataDirectory, organism, f"{organism}{pw[0]}-sbml.xml")
        for spw in superpathwayDict.values()
        for pw in spw
    ]
)



In [19]:
# Turn the hypergraph into a graph and make sure it is a networkx directed graph.
G, dd = hg2g(H)
G = nx.DiGraph(G)
# Keep only the nodes whose identifier starts with 'C'
# (presumably the compound nodes -- TODO confirm against hg2g).
nodes = [node for node in G.nodes() if node.startswith('C')]

In [20]:
# Centrality measures to evaluate: display name -> networkx function.
# Each function is later called with the graph as its only argument, and the result is
# indexed by node.
centralities = {
    "closeness": nx.closeness_centrality,
    "katz": nx.katz_centrality,
    "betweenness": nx.betweenness_centrality,
    "harmonic": nx.harmonic_centrality
}


In [21]:
# Compare the average centrality of the compounds of lists A and B with the average
# centrality of random node subsets of the same size.
RANDOM_SEED = 0        # fixed seed so the random baselines are reproducible across runs
N_RANDOM_DRAWS = 1000  # number of random subsets averaged for each baseline


def _random_baseline(centrality, nodes, size, rng, draws=N_RANDOM_DRAWS):
    """Mean, over `draws` random subsets of `nodes` with `size` distinct elements, of the subset's average centrality."""
    return np.average([
        np.average([centrality[x] for x in rng.choice(nodes, size, replace=False)])
        for _ in range(draws)
    ])


rng = np.random.default_rng(RANDOM_SEED)
for centrality_name, centrality_function in centralities.items():
    centrality = centrality_function(G)
    # Compounds of a list that do not occur in the graph are ignored
    valsA = [centrality[x] for x in setA if x in centrality]
    nA = len(valsA)
    avA = np.average(valsA)
    valsB = [centrality[x] for x in setB if x in centrality]
    nB = len(valsB)
    avB = np.average(valsB)
    avRandA = _random_baseline(centrality, nodes, nA, rng)
    avRandB = _random_baseline(centrality, nodes, nB, rng)
    # Percentages express the list average relative to its random baseline
    print(f"{centrality_name:10}\tA: {avA:8.5f} ({100*avA/avRandA:.2f}%)\tB: {avB:8.5f} ({100*avB/avRandB:.2f}%)")

closeness 	A:  0.01212 (161.06%)	B:  0.01102 (147.70%)
katz      	A:  0.01822 (108.28%)	B:  0.01804 (107.05%)
betweenness	A:  0.00485 (235.80%)	B:  0.00439 (219.99%)
harmonic  	A: 58.38375 (167.68%)	B: 52.76201 (154.30%)


In [38]:
# For every centrality measure, run a one-sided two-sample Kolmogorov-Smirnov test of the
# centralities of all selected nodes against those of list A and of list B.
for centrality_name, centrality_function in centralities.items():
    centrality = centrality_function(G)
    population = [centrality[x] for x in nodes]
    fragments = []
    for label_name, label_set in (("A", setA), ("B", setB)):
        # Compounds of the list that are not in the graph are ignored
        subpopulation = [centrality[x] for x in label_set if x in centrality]
        ks_result = scipy.stats.ks_2samp(population, subpopulation, alternative="greater")
        fragments.append(f"{label_name}: p-value={ks_result.pvalue:15.8}, sign={ks_result.statistic_sign}\t\t")
    # One output line per centrality: name, then the A and B results
    print(f"{centrality_name:10}\t" + "".join(fragments))

closeness 	A: p-value=   0.0014769179, sign=1		B: p-value=     0.20202631, sign=1		
katz      	A: p-value=  6.6143943e-05, sign=1		B: p-value=    0.025797127, sign=1		
betweenness	A: p-value=   0.0030748883, sign=1		B: p-value=    0.045849783, sign=1		
harmonic  	A: p-value=   0.0016366921, sign=1		B: p-value=     0.18819796, sign=1		
