In [1]:
from hypergraphs import *
from kegg import *
import pandas as pd
import os
import logging

from Bio.KEGG.KGML.KGML_parser import read
import Bio.KEGG.KGML.KGML_pathway
from ast import literal_eval
from urllib.request import urlopen
from urllib.parse import quote_plus
from bs4 import BeautifulSoup
import requests
from Levenshtein import ratio
import scipy.stats
import pathlib
from qf.cc import cardon_crochemore_colored, cardon_crochemore
import random

In [2]:
# KEGG organism code -> human-readable organism name.
organismDict = {"hsa": "Homo sapiens (human)"}
# NOTE(review): absolute, user-specific path; none of the cells visible here reads it
# (the drawing cells hard-code "/Users/boldi/Desktop/" instead) — confirm before relying on it.
outputDirectory = "/Users/boldi/Desktop/pw/"
# Directory holding the pathway list, the cached name->gene pickle and the patient CSV files.
inputDirectory = "../../LaTeX/Data/KEGG-Pathways/Nov2023/"
# Root directory of the KGML files; they are looked up as dataDirectory/<organism>/<organism><pathwayID>.xml
dataDirectory = "../../LaTeX/Data/KEGG-Pathways"

In [3]:
# Table of the pathways to analyse. The file's first row is taken as a header and replaced
# by the three names below; every field is read as a string and empty fields stay "" (no NaN).
pws = pd.read_csv(
    os.path.join(inputDirectory, "KEGG-pathways.txt"),
    header=0,
    names=["Pathway ID", "Pathway type", "Pathway name"],
    dtype="string",
    keep_default_na=False,
)


In [4]:
# Superpathway name -> list of (pathway ID, pathway name) pairs.
# The only superpathway defined here gathers every pathway listed in pws.
superpathwayDict = {
    "Involved": list(zip(pws["Pathway ID"], pws["Pathway name"]))
}

In [5]:
# Selection for this run: a key of superpathwayDict and the KEGG organism code
# (the latter is also the name of the subdirectory holding the KGML files).
superpathway = "Involved"
organism = "hsa"

In [6]:
# Form of the superpathway name used below as a component of output directory names.
# NOTE(review): makesafe is not defined in this notebook; presumably it comes from one of the star imports — confirm.
safeSuperpathway = makesafe(superpathway)

In [7]:
def words2set(words):
    """
        Splits a whitespace-separated string into words and returns the distinct words as a set.
    """
    distinct = set()
    distinct.update(words.split())
    return distinct

In [8]:
from typing import NamedTuple

class Relation(NamedTuple):
    """
        Represents a relation. It is characterized by:
        - a type (e.g. "PPRel") 
        - a list of subtypes (e.g. [('activation', '-->'), ('indirect effect', '..>')])
        - a set of components (source)
        - a list of sets of components (target)
        - the graphics element of the source entry (sourceg)
        - the graphics element representing the target (targetg)
    """
    relType: str
    relSubtypes: list
    source: set
    target: list
    sourceg: Bio.KEGG.KGML.KGML_pathway.Graphics
    targetg: Bio.KEGG.KGML.KGML_pathway.Graphics

    @classmethod
    def fromKEGG(cls, relation, pathway):
        """
            Constructor: builds a Relation from a KEGG relation.

            - relation: the KEGG relation; its type, subtypes, entry1 and entry2 are read
            - pathway: the pathway the relation belongs to; its entries are used to resolve
              the components of entry2 when entry2 is named "undefined"

            When entry2 is named "undefined", the target is the list of the word sets of its
            components, and targetg is the first graphics element of the last component visited.
            Otherwise the target is the single word set of entry2, and targetg is its first
            graphics element.

            Raises ValueError if entry2 is named "undefined" but has no components.
        """
        entry1 = relation.entry1
        entry2 = relation.entry2
        if entry2.name == "undefined":
            componentEntries = [pathway.entries[component.id] for component in entry2.components]
            if not componentEntries:
                # The original code failed here with an unbound loop variable; fail with a clear message instead
                raise ValueError("relation target is an 'undefined' entry without components")
            target = [words2set(entry.name) for entry in componentEntries]
            targetg = componentEntries[-1].graphics[0]
        else:
            target = [words2set(entry2.name)]
            targetg = entry2.graphics[0]
        # Keyword arguments: passing the two graphics positionally had them stored in swapped fields
        return cls(
            relType=relation.type,
            relSubtypes=relation.subtypes,
            source=words2set(entry1.name),
            target=target,
            sourceg=entry1.graphics[0],
            targetg=targetg,
        )
        

In [9]:
def relationsFromKEGGpathway(pathway):
    """
        Converts each relation of a KEGG pathway into a Relation and returns the resulting list,
        in the order in which pathway.relations yields them.
    """
    converted = []
    for keggRelation in pathway.relations:
        converted.append(Relation.fromKEGG(keggRelation, pathway))
    return converted
    

In [10]:
def pathwayFromFile(dataDirectory, organism, pathwayID):
    """
        Reads the KGML file dataDirectory/organism/[organism+pathwayID].xml and returns
        whatever KGML_parser.read produces for it.
        The file is closed when parsing is over (it used to be left open).
    """
    filename = os.path.join(dataDirectory, organism, organism + pathwayID + ".xml")
    with open(filename, "r") as handle:
        # assumes KGML_parser.read consumes the handle before returning — TODO confirm
        return KGML_parser.read(handle)

In [11]:
def relationsFromFile(dataDirectory, organism, pathwayID):
    """
        Reads one pathway file and returns the list of its relations.
        The file is identified by the root dataDirectory, the organism subdirectory
        and the file name [organism+pathwayID.xml].
    """
    return relationsFromKEGGpathway(pathwayFromFile(dataDirectory, organism, pathwayID))

In [12]:
def relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway):
    """
        Reads all the pathway files of one superpathway.
        superpathwayDict maps superpathway names to lists of pairs (pathwayID, pathwayName);
        the result maps each pathwayID of the chosen superpathway to the list of its relations.
    """
    return {
        pathwayID: relationsFromFile(dataDirectory, organism, pathwayID)
        for pathwayID, _pathwayName in superpathwayDict[superpathway]
    }

In [13]:
def subtypes(rel):
    """
        Given a map whose values are lists of relations, collects the string form of every
        subtype list appearing in them and numbers these strings in sorted order.
        The result is a map from subtype string to number.
    """
    seen = {
        str(relation.relSubtypes)
        for relations in rel.values()
        for relation in relations
    }
    return {subtype: number for number, subtype in enumerate(sorted(seen))}

In [14]:
def set2can(s):
    """
        Returns the canonical string form of a set: the string representation
        of the sorted list of its members.
    """
    members = list(s)
    members.sort()
    return str(members)

def can2set(c):
    """
        Converse of set2can: parses the string form of a list and returns its items as a set.
    """
    members = literal_eval(c)
    return set(members)

In [15]:
def indices(rel):
    """
        Given a map whose values are lists of relations, visits the relations one by one and
        assigns a unique id to each element (a set of components appearing as source or in
        the target of some relation) and to each block (the set of element ids forming the
        target of some relation). Ids are assigned in order of first appearance, from 0.
        Returns element2id, id2element, block2id, id2block.
        Elements are string representations of sorted lists of strings.
        Blocks are string representations of sorted lists of ints.
    """
    element2id = {}
    block2id = {}

    def elementId(components):
        # Id of an element, assigning the next free id the first time it is seen
        canonical = set2can(components)
        if canonical not in element2id:
            element2id[canonical] = len(element2id)
        return element2id[canonical]

    for relations in rel.values():
        for relation in relations:
            elementId(relation.source)
            targetIds = {elementId(targ) for targ in relation.target}
            block = set2can(targetIds)
            if block not in block2id:
                block2id[block] = len(block2id)

    id2element = {number: element for element, number in element2id.items()}
    id2block = {number: block for block, number in block2id.items()}
    return element2id, id2element, block2id, id2block


In [16]:
def search_gene_KEGG(geneID):
    """
        Search for gene on KEGG. 

        Fetches https://www.kegg.jp/entry/<geneID> (without following redirects) and returns
        the list of names obtained by splitting on ", " the text of the first table cell
        whose class is "td11 defd".

        Raises requests.HTTPError on a 4xx/5xx answer, a requests timeout exception if the
        server does not answer within 30 seconds, and IndexError if the page has no such cell.
    """
    url = "https://www.kegg.jp/entry/" + geneID
    # Without a timeout a stalled connection would block the whole scraping loop forever
    response = requests.get(url, allow_redirects=False, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    cells = soup.find_all("td", {"class": "td11 defd"})
    if not cells:
        # Same exception type as the former bare cells[0], but saying which page is at fault
        raise IndexError("no 'td11 defd' cell found at " + url)
    return cells[0].getText().strip().split(", ")

In [17]:
# Read every pathway of the selected superpathway: pathway ID -> list of relations.
rel = relationsForSuperpathway(dataDirectory, organism, superpathwayDict, superpathway)
# Number the elements and blocks mentioned by the relations.
element2id, id2element, block2id, id2block = indices(rel)
# Number the distinct subtype lists; these numbers are used below as edge colors.
subtype2color = subtypes(rel)
# Flat list of all the individual components appearing in some element.
# NOTE(review): set.union(*[...]) raises TypeError when element2id is empty (no relation was read).
elements = list(set.union(*[can2set(k) for k in element2id.keys()]))

In [18]:
# name2gene is a map from names to gene IDs (hsa:xxxx)
# e.g. KCNE3, BRGDA6 etc are all mapped to hsa:10008 (see https://www.kegg.jp/entry/hsa:10008)
# This map is produced by scraping KEGG, but only when the cache file does not exist yet.
# After that, we just read a pkl file
# NOTE(review): pickle is not among the explicit imports of this notebook; presumably it is
# provided by one of the star imports — confirm.

name2keggFilename = os.path.join(inputDirectory, "name2kegg.pkl")

if os.path.exists(name2keggFilename):
    print("Reading gene names")
    with open(name2keggFilename, "rb") as handle:
        # pickle.load can execute arbitrary code: only load a cache file written by this notebook
        name2gene = pickle.load(handle)
else:
    print("Producing gene names")
    d = {}
    nelements = len(elements)
    for count,element in enumerate(elements):
        # progress report every 10 elements
        if count % 10 == 0:
            print(f"{count}/{nelements}")
        # only components whose identifier starts with "hsa" are looked up on KEGG
        if element.startswith("hsa"):
            names = search_gene_KEGG(element)
            for name in names:
                # a name returned for several genes ends up mapped to the last one scraped
                d[name] = element
    # the cache is written only after the whole scrape has completed
    with open(name2keggFilename, "wb") as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)
    name2gene = d

Reading gene names


In [19]:
# Read patient data
print("Reading patient data")
genes = pd.read_csv(os.path.join(inputDirectory, "genes.csv"))
patients = genes["name"].values.tolist()
classification = pd.read_csv(os.path.join(inputDirectory, "classification.csv"))

# Map every patient to the class found in the first row of classification.csv carrying its name.
# The table is traversed once, instead of being scanned in full for every patient.
# A patient absent from classification.csv now raises a KeyError naming the patient
# (it used to be an IndexError).
firstClassByName = {}
for patientName, patientClass in zip(classification["name"].tolist(), classification["classification"].tolist()):
    firstClassByName.setdefault(patientName, patientClass)
patient2class = {patient: firstClassByName[patient] for patient in patients}

Reading patient data


In [20]:
# Split the genes table into two frames according to the class of the patient in each row:
# one for healthy individuals and one for diseased individuals
genesHealthy = genes[genes["name"].map(lambda name: patient2class[name]).eq("healthy")]
genesDiseased = genes[genes["name"].map(lambda name: patient2class[name]).eq("diseased")]

In [21]:
# Building the graph of all relations contained in rel values
# Nodes are IDs, and node x corresponds to the set of genes (and/or compounds) id2element[x]
# Every edge carries as "color" the number of the subtype list of its relation
print("Building the relation graph")
G = nx.DiGraph() 
for relations in rel.values():
    for relation in relations:
        source = element2id[set2can(relation.source)]
        for targ in relation.target:
            target = element2id[set2can(targ)]
            G.add_edge(source, target, color=subtype2color[str(relation.relSubtypes)])
            

# Computing the minimum fibres: cc maps every node to its class
cc=cardon_crochemore_colored(G)

colors = list(matplotlib.colors.CSS4_COLORS.values())
usedColors = 0

cc2fibresize = {}
cc2fibre = {}
ccnontrivial = {}
nontrivialclass2col = {}

# Group the nodes by class in a single pass (node order within a class is that of G.nodes()),
# instead of scanning all the nodes once per class
class2nodes = {}
for x in G.nodes():
    class2nodes.setdefault(cc[x], []).append(x)

fibres = []
node2fibre = {}
for v in set(cc.values()):
    fibre = class2nodes.get(v, [])
    fibres += [fibre]
    for x in fibre:
        node2fibre[x] = fibre
    cc2fibre[v] = fibre
    cc2fibresize[v] = len(fibre)
    # NOTE(review): this test holds for every class, singletons included; if "nontrivial" is meant
    # as "more than one node" it should read > 1 — confirm (the white branch of the drawing cells
    # would then be exercised for the first time).
    if len(fibre) > 0:
        # NOTE(review): v is a class, yet it is used here as a key of the node->class map — confirm intent
        ccnontrivial[v] = cc[v]
        # Wrap around rather than fail with IndexError when classes outnumber the available colors;
        # past that point colors are reused and no longer identify a class uniquely
        nontrivialclass2col[v] = colors[usedColors % len(colors)]
        usedColors += 1

Building the relation graph


In [22]:
# Build node2columns, mapping each node of G to the list of genes corresponding to it
# for which we know a value (in the genes dataframe), sorted
#

print("Mapping genes to columns")
namedGenes = set(name2gene.values())     
# The first two columns of the genes dataframe are skipped
columns = set(genes.columns.values.tolist()[2:])

# Invert name2gene once, keeping only the names that are columns of the dataframe:
# gene ID -> names. This replaces a full scan of name2gene for every gene of every node.
gene2columns = {}
for name, gene in name2gene.items():
    if name in columns:
        gene2columns.setdefault(gene, []).append(name)

node2columns = {}
for node in G.nodes():
    nodeGenes = can2set(id2element[node])
    node2columns[node] = sorted(
        name
        for gene in nodeGenes if gene in gene2columns
        for name in gene2columns[gene]
    )

Mapping genes to columns


In [23]:
# Draw every pathway of the superpathway with its entries colored by class, and dump
# one line per colored entry (pathway, node, class, graphics name, canonical name) to all.tsv.
# NOTE(review): the destination is hard-coded and ignores outputDirectory — confirm which is intended.
fibreOutputDir = os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway)
# Created once, before the loop: all.tsv can then be written even if no pathway is listed
pathlib.Path(fibreOutputDir).mkdir(parents=True, exist_ok=True)

rows = []
for pair in superpathwayDict[superpathway]:
    pathwayID = pair[0]
    pathway = pathwayFromFile(dataDirectory, organism, pathwayID)
    canvas = KGMLCanvas(pathway)
    for k in pathway.entries:
        entry = pathway.entries[k]
        canonicalName = set2can(words2set(entry.name))
        # Only the entries that take part in some relation are recolored
        if canonicalName in element2id:
            node = element2id[canonicalName]
            klass = cc[node]
            graphics = entry.graphics[0]
            # White for the classes without an assigned color.
            # (This branch used to access the misspelled attribute pathway.enries.)
            fill = nontrivialclass2col.get(klass, "#FFFFFF")
            graphics.bgcolor = fill
            graphics.fgcolor = fill
            rows.append(pathwayID + "\t" + str(node) + "\t" + str(klass) + "\t" + graphics.name + "\t" + canonicalName + "\n")
    canvas.import_imagemap = True
    pdfName = organism + pathwayID + ".pdf"
    canvas.draw(os.path.join(fibreOutputDir, pdfName))
s = "".join(rows)
with open(os.path.join(fibreOutputDir, "all.tsv"), "wt") as file:
    file.write(s)

In [24]:
def columns2nodes(columns, node2columns, ignoreEmpty = True):
    """
        Returns a map from each column to the set of nodes whose column list contains it.
        Columns appearing in no node are left out, unless ignoreEmpty is false
        (in which case they are mapped to the empty set).
    """
    mapping = {}
    for column in columns:
        holders = {node for node, nodeColumns in node2columns.items() if column in nodeColumns}
        if holders or not ignoreEmpty:
            mapping[column] = holders
    return mapping

# Column -> nodes containing it; columns found in no node are dropped (ignoreEmpty defaults to True),
# so the keys are exactly the columns that can be related to the graph.
c2n = columns2nodes(columns, node2columns)
usableColumns = list(c2n.keys())

In [25]:
def areRelated(column1, column2, column2nodes, node2color, inDegThreshold = 0):
    """
        Tells whether two columns are related, i.e., whether some node of column1 and some
        node of column2 have the same color and, among the colors they share, the largest
        in-degree (in the global graph G) of the first node of the corresponding fibre
        (taken from the global cc2fibre) is at least inDegThreshold.

        - column2nodes: map from columns to the nodes where they appear
        - node2color: map from nodes to colors
    """
    firstNodes = column2nodes[column1]
    secondNodes = column2nodes[column2]
    firstColors = {node2color[node] for node in firstNodes}
    secondColors = {node2color[node] for node in secondNodes}
    shared = firstColors.intersection(secondColors)
    if not shared:
        return False
    return max(G.in_degree(cc2fibre[color][0]) for color in shared) >= inDegThreshold

In [26]:
# Spot check of the columns mapped to nodes 249 and 253 (both lists are empty in the recorded output).
# NOTE(review): the node ids are hard-coded; this raises KeyError if G has no such nodes.
(node2columns[249],node2columns[253])

([], [])

In [27]:
# For every edge whose subtype list mentions "activation", correlate (Kendall's tau) each
# column of the source node with each column of the target node, separately on healthy and
# diseased individuals, and record the difference between the two mean correlations.
# out: list of (report line, difference) sorted by decreasing absolute difference
# outd: "(source,target)" -> difference
activationColors = {color for subtype, color in subtype2color.items() if "activation" in subtype}

out = []
outd = {}
for s, t, d in G.edges(data=True):
    if d["color"] not in activationColors:
        continue
    tausH = []
    tausD = []
    for sg in node2columns[s]:
        for tg in node2columns[t]:
            for frame, taus in ((genesHealthy, tausH), (genesDiseased, tausD)):
                expr1 = frame.loc[:, sg].values.tolist()
                expr2 = frame.loc[:, tg].values.tolist()
                taus.append(scipy.stats.kendalltau(expr1, expr2).statistic)
    # Edges whose endpoints have no known column are skipped
    if not tausH:
        continue
    meanH = pd.Series(tausH).mean()
    meanD = pd.Series(tausD).mean()
    # With a single pair of columns the deviation is reported as 0
    if len(tausH) > 1:
        stdH = pd.Series(tausH).std()
        stdD = pd.Series(tausD).std()
    else:
        stdH = 0
        stdD = 0
    difference = meanH - meanD
    report = "{:4d}\t{:4d}\t{:5.2f}±{:5.2f}\t{:5.2f}±{:5.2f}".format(s, t, meanH, stdH, meanD, stdD)
    out.append((report, difference))
    outd["({},{})".format(s, t)] = difference
out.sort(key=lambda entry: abs(entry[1]), reverse=True)
#for o,v in out:
#    print(o,"\t",v)

In [31]:
from reportlab.lib.units import inch
import PIL.Image
from io import BytesIO
import reportlab.pdfgen.canvas
import reportlab.lib.colors

def significantTau(tau):
    """Tells whether tau exceeds, in absolute value, the significance threshold 0.2 (strictly)."""
    magnitude = abs(tau)
    return magnitude > 0.2

def convertTauValue(tau):
    """Converts tau into a line width: 3*exp(3*|tau|)."""
    exponent = abs(tau) * 3
    return 3 * math.exp(exponent)

def convertTauColor(tau):
    """Converts tau into a stroke color: red when negative, green otherwise."""
    return reportlab.lib.colors.red if tau < 0 else reportlab.lib.colors.green

#from __future__ import nested_scopes

def get_temp_imagefilename(url):
    """Return filename of temporary file containing downloaded image.

    Downloads the image at the passed URL, saves it as PNG in a new temporary
    file (suffix ".png") and returns the filename. The temporary file is NOT
    deleted automatically: disposing of it is up to the caller.
    """
    # Local import: tempfile is not among the explicit imports of this notebook
    import tempfile

    # Close the connection as soon as the payload has been read
    with urlopen(url) as response:
        img = response.read()
    # Only reserve the name here; the file is closed before PIL writes to it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as f:
        fname = f.name
    with PIL.Image.open(BytesIO(img)) as im:
        im.save(fname, "PNG")
    return fname


#def enhance_method(klass, method_name, replacement):
#    'replace a method with an enhanced version'
#    method = getattr(klass, method_name)
#    def enhanced(*args, **kwds): return replacement(*args, **kwds)
#    setattr(klass, method_name, enhanced)

def new_draw(self, filename, colrel):
    """Draw the pathway map to a PDF file, then overlay the given relation lines.

    Written to be attached to KGMLCanvas as a method.
    NOTE(review): apparently adapted from KGMLCanvas.draw — confirm against the installed Biopython.

    - self: the canvas; its pathway, margins, fontname, fontsize, import_imagemap and show_*
      attributes are read, and its drawing attribute is (re)assigned
    - filename: where the PDF is written
    - colrel: iterable of (sg, tg, width, color) tuples; for each of them a straight line with
      the given width and stroke color is drawn from (sg.x, sg.y) to (tg.x, tg.y)

    Reaction entries, orthologs, compounds and genes are deliberately not drawn.
    """
    imfilename = None
    downloaded = False
    if self.import_imagemap:
        # Use the pathway image directly if it is a local file, otherwise download it
        if os.path.isfile(self.pathway.image):
            imfilename = self.pathway.image
        else:
            imfilename = get_temp_imagefilename(self.pathway.image)
            downloaded = True
    try:
        # Instantiate the drawing, first
        # size x_max, y_max for now - we can add margins, later
        if self.import_imagemap:
            # We're drawing directly on the image, so we set the canvas to the
            # same size as the image
            with PIL.Image.open(imfilename) as im:
                cwidth, cheight = im.size
        else:
            # No image, so we set the canvas size to accommodate visible
            # elements
            cwidth, cheight = (self.pathway.bounds[1][0], self.pathway.bounds[1][1])
        # Instantiate canvas
        self.drawing = reportlab.pdfgen.canvas.Canvas(
            filename,
            bottomup=0,
            pagesize=(
                cwidth * (1 + 2 * self.margins[0]),
                cheight * (1 + 2 * self.margins[1]),
            ),
        )
        self.drawing.setFont(self.fontname, self.fontsize)
        # Transform the canvas to add the margins
        self.drawing.translate(
            self.margins[0] * self.pathway.bounds[1][0],
            self.margins[1] * self.pathway.bounds[1][1],
        )
        # Add the map image, if required
        if self.import_imagemap:
            self.drawing.saveState()
            # The canvas is top-down (bottomup=0): flip vertically so the image is not upside down
            self.drawing.scale(1, -1)
            self.drawing.translate(0, -cheight)
            self.drawing.drawImage(imfilename, 0, 0)
            self.drawing.restoreState()
        # Add the reactions, compounds and maps
        # Maps go on first, to be overlaid by more information.
        # By default, they're slightly transparent.
        if self.show_maps:
            # This function lives outside the class body, so self.__add_maps would not be
            # name-mangled and the lookup would fail: the mangled name is spelled out.
            self._KGMLCanvas__add_maps()
        if self.show_reaction_entries:
            pass#self.__add_reaction_entries()
        if self.show_orthologs:
            pass#self.__add_orthologs()
        if self.show_compounds:
            pass#self.__add_compounds()
        if self.show_genes:
            pass#self.__add_genes()
        # TODO: complete draw_relations code
        # if self.draw_relations:
        #    self.__add_relations()
        # Overlay the relation lines
        for sg,tg,width,color in colrel:
            self.drawing.setLineWidth(width)
            self.drawing.setStrokeColor(color)
            self.drawing.line(sg.x,sg.y,tg.x,tg.y)
        # Write the pathway map to PDF
        self.drawing.save()
    finally:
        if downloaded:
            # Best-effort removal of the downloaded image, so that temporary files do not
            # pile up (one per pathway); a failure to delete must not mask the outcome above
            try:
                os.remove(imfilename)
            except OSError:
                pass
        
#enhance_method(KGMLCanvas, 'draw',  new_draw)
# Attach new_draw to KGMLCanvas as an additional method
setattr(KGMLCanvas, 'new_draw', new_draw)

# For every pathway of the superpathway: color its entries by class and draw, on top of the map,
# a line for every relation whose healthy/diseased correlation difference (outd) is significant.
# NOTE(review): the destination is hard-coded and ignores outputDirectory — confirm which is intended.
s = ""
for pair in superpathwayDict[superpathway]:
    pathwayID = pair[0]
    pathway = pathwayFromFile(dataDirectory, organism, pathwayID)
    
    # Lines to overlay: (source graphics, target graphics, width, color)
    colrel = []
    for relation in rel[pathwayID]:
        source = element2id[set2can(relation.source)]
        for targ in relation.target:
            target = element2id[set2can(targ)]    
            key = "({},{})".format(source,target)
            if key in outd and significantTau(outd[key]):
                colrel += [(relation.sourceg, relation.targetg, convertTauValue(outd[key]), convertTauColor(outd[key]))]

    canvas = KGMLCanvas(pathway)
    # NOTE(review): nothing is ever written to this "-cola" directory (the PDFs go to "-col");
    # it looks like a leftover — kept so that the side effects of the cell are unchanged.
    pathlib.Path(os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway + "-cola")).mkdir(parents=True, exist_ok=True)

    for k in pathway.entries:
        name = pathway.entries[k].name
        canonicalName = set2can(words2set(name))
        # Only the entries that take part in some relation are recolored
        if canonicalName in element2id.keys():
            node = element2id[canonicalName]
            klass = cc[node]
            if klass in nontrivialclass2col:
                pathway.entries[k].graphics[0].bgcolor = nontrivialclass2col[klass]
                pathway.entries[k].graphics[0].fgcolor = nontrivialclass2col[klass]
            else:
                pathway.entries[k].graphics[0].bgcolor = "#FFFFFF"
                # was pathway.enries[...]: AttributeError as soon as this branch ran
                pathway.entries[k].graphics[0].fgcolor = "#FFFFFF"
            s += pathwayID + "\t" + str(node) + "\t" + str(klass) +"\t"+pathway.entries[k].graphics[0].name+"\t"+canonicalName + "\n"
    
            
    canvas.import_imagemap = True
    pdfName = organism + pathwayID + ".pdf"
    pathlib.Path(os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway + "-col")).mkdir(parents=True, exist_ok=True)
    canvas.new_draw(os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway+ "-col", pdfName), colrel)
#with open(os.path.join("/Users/boldi/Desktop/", organism, safeSuperpathway + "-col", "all.tsv"), "wt") as file:
#    file.write(s)