In [1]:
import xml.etree.ElementTree as ET
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# Parse the 2017 topics file into an ElementTree.
topics_path = '/Users/arthur_0804/Desktop/Thesis/RelevantDocs/topics2017.xml'
tree = ET.parse(topics_path)

In [3]:
# Grab the root element of the parsed XML tree.
root = tree.getroot()

In [4]:
# Collect the text of the first child element of every topic under the root.
# Iterating over the root itself (instead of a fixed range(0, 30)) keeps this
# cell correct when the topics file holds a different number of entries.
disease_list = [topic[0].text for topic in root]

In [5]:
# Show the extracted disease names, one per line.
for disease_name in disease_list:
    print(disease_name)

Liposarcoma
Colon cancer
Meningioma
Breast cancer
Melanoma
Melanoma
Lung cancer
Lung cancer
Gastrointestinal stromal tumor
Lung adenocarcinoma
Gastric cancer
Colon cancer
Cholangiocarcinoma
Cholangiocarcinoma
Cervical cancer
Pancreatic cancer
Prostate cancer
Pancreatic cancer
Colorectal cancer
Liposarcoma
Lung adenocarcinoma
Lung cancer
Breast cancer
Lung cancer
Lung adenocarcinoma
Breast cancer
Pancreatic adenocarcinoma
Pancreatic ductal adenocarcinoma
Ampullary carcinoma
Pancreatic adenocarcinoma


In [6]:
# Collect the text of the second child element of every topic under the root.
# Iterating over the root itself (instead of a fixed range(0, 30)) keeps this
# cell correct when the topics file holds a different number of entries.
gene_list = [topic[1].text for topic in root]

In [7]:
# Show the extracted gene descriptions, one per line.
for gene_entry in gene_list:
    print(gene_entry)

CDK4 Amplification
KRAS (G13D), BRAF (V600E)
NF2 (K322), AKT1(E17K)
FGFR1 Amplification, PTEN (Q171)
BRAF (V600E), CDKN2A Deletion
NRAS (Q61K)
EGFR (L858R)
EML4-ALK Fusion transcript
KIT Exon 9 (A502_Y503dup)
KRAS (G12C)
PIK3CA (E545K)
BRAF (V600E)
BRCA2
IDH1 (R132H)
STK11
CDKN2A
PTEN Inactivating
CDK6 Amplification
FGFR1 Amplification
MDM2 Amplification
ALK Fusion
ERBB2 Amplification
PTEN Loss
NTRK1
MET Amplification
NRAS Amplification
KRAS, TP53
ERBB3
KRAS
RB1, TP53, KRAS


In [8]:
# test case: split a comma-separated string and trim the whitespace of each piece
my_string = "RB1, TP53, KRAS"
result = list(map(str.strip, my_string.split(',')))
print(result)

['RB1', 'TP53', 'KRAS']


# 1. MESH expansion

1. Retrieve the MeSH ID of the disease from Wikidata.
2. Use the MeSH ID in the MeSH SPARQL endpoint via `?s meshv:identifier "id" .` <br>
    e.g. `?s meshv:identifier "D008080"`
3. Specify other predicates.

In [9]:
# function of getting mesh id from Wikidata
def getMeshIDFromLabel(disease):
    """Look up the MeSH ID of a disease through its English Wikidata label.

    The disease name is lower-cased and matched exactly against ``rdfs:label``
    with the ``en`` language tag. The first matching entity is then queried
    for its ``wdt:P486`` (MeSH ID) value.

    Args:
        disease: Disease name to look up.

    Returns:
        The MeSH ID of the first match, ``"1"`` when the entity exists but has
        no ``wdt:P486`` value, or ``"2"`` when no entity carries this label.
    """
    endpoint_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Escape backslashes and single quotes so a name such as
    # "hodgkin's lymphoma" stays a single valid SPARQL string literal instead
    # of terminating the quoted literal early and breaking the query.
    escaped = disease.lower().replace("\\", "\\\\").replace("'", "\\'")
    label_literal = "'" + escaped + "'@en"

    # Step 1: find the entity whose English label equals the disease name.
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?s
                WHERE { ?s rdfs:label """ + label_literal + """ }""")
    sparql.setReturnFormat(JSON)
    entities = sparql.query().convert()["results"]["bindings"]
    if not entities:
        # 2 indicates no result because this term does not exist in Wikidata
        # (aliases or dbpedia can be tried next)
        return "2"

    # Step 2: use the first entity as the subject and ask for its MeSH ID.
    subject = "<" + entities[0]["s"]["value"] + ">"
    sparql = SPARQLWrapper(endpoint_url)
    # wdt:P486 is the predicate for mesh id
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P486 ?o.}""")
    sparql.setReturnFormat(JSON)
    identifiers = sparql.query().convert()["results"]["bindings"]
    if not identifiers:
        # 1 indicates no result because there is no predicate of "meshid"
        return "1"
    return identifiers[0]["o"]["value"]

In [10]:
# Map every disease name to its MeSH lookup result (a MeSH ID or a "1"/"2" code).
disease_dict_mesh = {name: getMeshIDFromLabel(name) for name in disease_list}

In [11]:
# Report the lookup outcome per disease, spelling out the "1"/"2" failure codes.
for disease_name, mesh_code in disease_dict_mesh.items():
    if mesh_code == '1':
        template = "no result because there is no predicate of 'meshid' Disease: {}; MeSH ID: {}"
    elif mesh_code == '2':
        template = "no result because this term does not exist in Wikidata Disease: {}; MeSH ID: {}"
    else:
        template = "Disease: {}; MeSH ID: {}"
    print(template.format(disease_name, mesh_code))

Disease: Liposarcoma; MeSH ID: D008080
Disease: Colon cancer; MeSH ID: D003110
Disease: Meningioma; MeSH ID: D008579
Disease: Breast cancer; MeSH ID: D001943
Disease: Melanoma; MeSH ID: D008545
Disease: Lung cancer; MeSH ID: D002283
Disease: Gastrointestinal stromal tumor; MeSH ID: D046152
no result because this term does not exist in Wikidata Disease: Lung adenocarcinoma; MeSH ID: 2
no result because this term does not exist in Wikidata Disease: Gastric cancer; MeSH ID: 2
Disease: Cholangiocarcinoma; MeSH ID: D018281
Disease: Cervical cancer; MeSH ID: D002583
no result because this term does not exist in Wikidata Disease: Pancreatic cancer; MeSH ID: 2
Disease: Prostate cancer; MeSH ID: D011471
Disease: Colorectal cancer; MeSH ID: D015179
no result because there is no predicate of 'meshid' Disease: Pancreatic adenocarcinoma; MeSH ID: 1
Disease: Pancreatic ductal adenocarcinoma; MeSH ID: D021441
no result because this term does not exist in Wikidata Disease: Ampullary carcinoma; MeSH ID

Some diseases are listed in Wikidata under a different primary name, so the names used in our data set may only match as aliases.

In [12]:
def getMeshIDFromAlias(disease):
    """Look up the MeSH ID of a disease through an English Wikidata alias.

    The disease name is lower-cased and matched exactly against
    ``skos:altLabel`` with the ``en`` language tag. The first matching entity
    is then queried for its ``wdt:P486`` (MeSH ID) value.

    Args:
        disease: Disease name to look up.

    Returns:
        The MeSH ID of the first match, ``"1"`` when the entity exists but has
        no ``wdt:P486`` value, or ``"2"`` when no entity carries this alias.
    """
    endpoint_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Escape backslashes and single quotes so a name such as
    # "hodgkin's lymphoma" stays a single valid SPARQL string literal instead
    # of terminating the quoted literal early and breaking the query.
    escaped = disease.lower().replace("\\", "\\\\").replace("'", "\\'")
    alias_literal = "'" + escaped + "'@en"

    # Step 1: find the entity that lists the disease name as an alias.
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                select ?s
                where{ ?s <http://www.w3.org/2004/02/skos/core#altLabel> """ + alias_literal + """ .}""")
    sparql.setReturnFormat(JSON)
    entities = sparql.query().convert()["results"]["bindings"]
    if not entities:
        # still cannot find this entity
        return "2"

    # Step 2: use the first entity as the subject and ask for its MeSH ID.
    subject = "<" + entities[0]["s"]["value"] + ">"
    sparql = SPARQLWrapper(endpoint_url)
    # wdt:P486 is the predicate for mesh id
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P486 ?o.}""")
    sparql.setReturnFormat(JSON)
    identifiers = sparql.query().convert()["results"]["bindings"]
    if not identifiers:
        # 1 indicates no result because there is no predicate of "meshid"
        return "1"
    return identifiers[0]["o"]["value"]

In [13]:
# Retry through the alias lookup every disease whose label lookup returned "2".
unresolved_names = [name for name, code in disease_dict_mesh.items() if code == '2']
for name in unresolved_names:
    disease_dict_mesh[name] = getMeshIDFromAlias(name)

In [14]:
# Show each disease next to its MeSH ID (or remaining failure code).
for disease_name, mesh_code in disease_dict_mesh.items():
    print(disease_name, mesh_code)

Liposarcoma D008080
Colon cancer D003110
Meningioma D008579
Breast cancer D001943
Melanoma D008545
Lung cancer D002283
Gastrointestinal stromal tumor D046152
Lung adenocarcinoma C538231
Gastric cancer D013274
Cholangiocarcinoma D018281
Cervical cancer D002583
Pancreatic cancer 2
Prostate cancer D011471
Colorectal cancer D015179
Pancreatic adenocarcinoma 1
Pancreatic ductal adenocarcinoma D021441
Ampullary carcinoma 1


# MESH - Query parents and children

In [15]:
# query parents
def getMeshParent(id):
    """Query the MeSH SPARQL endpoint for the broader descriptors of ``id``.

    Args:
        id: MeSH identifier appended to the ``mesh:`` prefix in the query.

    Returns:
        Every ``?o`` value of ``meshv:broaderDescriptor``, each followed by a
        single space and concatenated, or the string ``"None"`` when the query
        returns no rows.
    """
    client = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
    client.setQuery("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
        PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
        PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
        PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
        PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
        PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
        PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
        SELECT DISTINCT ?s ?p ?o
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE {mesh:""" + id + """ meshv:broaderDescriptor ?o.}""")
    client.setReturnFormat(JSON)
    rows = client.query().convert()["results"]["bindings"]
    if not rows:
        return "None"
    # Each value keeps a trailing space, so the result ends with one as well.
    return "".join(row["o"]["value"] + " " for row in rows)

In [16]:
# query child
def getMeshChild(id):
    """Query the MeSH SPARQL endpoint for descriptors whose broader descriptor is ``id``.

    Args:
        id: MeSH identifier appended to the ``mesh:`` prefix in the query.

    Returns:
        Every ``?s`` value that has ``meshv:broaderDescriptor mesh:<id>``, each
        followed by a single space and concatenated, or the string ``"None"``
        when the query returns no rows.
    """
    client = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
    client.setQuery("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
        PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
        PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
        PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
        PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
        PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
        PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
        SELECT DISTINCT ?s ?p ?o
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE {
          ?s meshv:broaderDescriptor mesh:""" + id +  """.}""")
    client.setReturnFormat(JSON)
    rows = client.query().convert()["results"]["bindings"]
    if not rows:
        return "None"
    # Each value keeps a trailing space, so the result ends with one as well.
    return "".join(row["s"]["value"] + " " for row in rows)

In [17]:
# get label by mesh ID
def getDiseaseName(id):
    """Fetch the ``rdfs:label`` of a resource from the MeSH SPARQL endpoint.

    Args:
        id: Resource URI (without angle brackets), or the string ``"None"``.

    Returns:
        The ``?o`` value of the first result row, or the string ``"None"``
        when ``id`` is ``"None"`` or the query returns no rows.
    """
    # "None" is the no-result marker; it is passed through without a query.
    if id == "None":
        return "None"

    subject = "<" + id + ">"
    client = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
    client.setQuery("""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
            PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
            PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
            PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
            PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
            PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
            PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
            SELECT DISTINCT ?s ?p ?o
            FROM <http://id.nlm.nih.gov/mesh>
            WHERE {""" + subject + """ <http://www.w3.org/2000/01/rdf-schema#label>  ?o.}""")
    client.setReturnFormat(JSON)
    rows = client.query().convert()["results"]["bindings"]
    return rows[0]["o"]["value"] if rows else "None"

In [18]:
# change the position of terms in some diseases
# e.g. "Neoplasms, Adipose Tissue" to "Adipose Tissue Neoplasms"
def normalizeTerms(label):
    """Move the text before the first comma of a label to the end.

    For example "Neoplasms, Adipose Tissue" becomes
    "Adipose Tissue Neoplasms". Only the first comma is used as the split
    point, so a label with several commas keeps all of its words
    ("Neoplasms, Ductal, Lobular, and Medullary" becomes
    "Ductal, Lobular, and Medullary Neoplasms") instead of losing everything
    after the second segment.

    Args:
        label: Label text, usually containing a comma.

    Returns:
        The reordered label, or the stripped label unchanged when it contains
        no comma.
    """
    head, comma, tail = label.partition(",")
    if not comma:
        # Nothing to reorder; avoids an IndexError on comma-free labels.
        return label.strip()
    return tail.strip() + " " + head.strip()

Iterate through the dictionary, find the parent/child MeSH IDs for each disease, and then return their labels.

In [49]:
def getAllMeshExpansionTerms(disease_meshid):
    """Build a space-separated string of parent and child labels for a MeSH ID.

    Parents come from ``getMeshParent`` and children from ``getMeshChild``.
    Every returned token is resolved with ``getDiseaseName``, and labels that
    contain a comma are reordered with ``normalizeTerms``.

    Args:
        disease_meshid: MeSH identifier of the disease to expand.

    Returns:
        The resolved labels joined by single spaces, or an empty string when
        no token resolves to a label.
    """
    parent_tokens = getMeshParent(disease_meshid).strip()
    child_tokens = getMeshChild(disease_meshid).strip()
    candidates = [token.strip() for token in (parent_tokens + " " + child_tokens).split(' ')]

    labels = []
    for candidate in candidates:
        name = getDiseaseName(candidate)
        # "None" marks a token without a label (including the "None" placeholders).
        if name == "None":
            continue
        if "," in name:
            name = normalizeTerms(name)
        labels.append(name)

    return " ".join(labels).strip()

In [79]:
# write into xml
tree = ET.parse('/Users/arthur_0804/Desktop/Thesis/RelevantDocs/topics2017.xml')
root = tree.getroot()

# Several topics share the same disease, so each MeSH ID is expanded only once.
expansion_by_meshid = {}
for topic in root:
    disease_element = topic[0]
    original_disease = disease_element.text
    meshid = disease_dict_mesh[original_disease]
    # "1" and "2" are lookup-failure codes, not MeSH IDs: keep the original
    # disease text instead of querying the MeSH endpoint with them.
    if meshid in ("1", "2"):
        continue
    if meshid not in expansion_by_meshid:
        expansion_by_meshid[meshid] = getAllMeshExpansionTerms(meshid)
    expansion_terms = expansion_by_meshid[meshid]
    # An empty expansion leaves the original disease text untouched.
    if expansion_terms != "":
        disease_element.text = expansion_terms
tree.write("mesh_expanded.xml")

# 2. Disease Ontology expansion

In [67]:
# function of getting the Disease Ontology id (DOID) from Wikidata
def getDOIDFromLabel(disease):
    """Look up the Disease Ontology ID of a disease through its English Wikidata label.

    The disease name is lower-cased and matched exactly against ``rdfs:label``
    with the ``en`` language tag. The first matching entity is then queried
    for its ``wdt:P699`` (DO ID) value.

    Args:
        disease: Disease name to look up.

    Returns:
        The DO ID of the first match, ``"1"`` when the entity exists but has
        no ``wdt:P699`` value, or ``"2"`` when no entity carries this label.
    """
    endpoint_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Escape backslashes and single quotes so a name such as
    # "hodgkin's lymphoma" stays a single valid SPARQL string literal instead
    # of terminating the quoted literal early and breaking the query.
    escaped = disease.lower().replace("\\", "\\\\").replace("'", "\\'")
    label_literal = "'" + escaped + "'@en"

    # Step 1: find the entity whose English label equals the disease name.
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?s
                WHERE { ?s rdfs:label """ + label_literal + """ }""")
    sparql.setReturnFormat(JSON)
    entities = sparql.query().convert()["results"]["bindings"]
    if not entities:
        # 2 indicates no result because this term does not exist in Wikidata
        return "2"

    # Step 2: use the first entity as the subject and ask for its DO ID.
    subject = "<" + entities[0]["s"]["value"] + ">"
    sparql = SPARQLWrapper(endpoint_url)
    # wdt:P699 is the predicate for do id
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P699 ?o.}""")
    sparql.setReturnFormat(JSON)
    identifiers = sparql.query().convert()["results"]["bindings"]
    if not identifiers:
        # 1 indicates no result because there is no predicate of "doid"
        return "1"
    return identifiers[0]["o"]["value"]

In [68]:
# Map every disease name to its DO lookup result (a DO ID or a "1"/"2" code).
disease_dict_do = {name: getDOIDFromLabel(name) for name in disease_list}

In [69]:
def getDOIDFromAlias(disease):
    """Look up the Disease Ontology ID of a disease through an English Wikidata alias.

    The disease name is lower-cased and matched exactly against
    ``skos:altLabel`` with the ``en`` language tag. The first matching entity
    is then queried for its ``wdt:P699`` (DO ID) value.

    Args:
        disease: Disease name to look up.

    Returns:
        The DO ID of the first match, ``"1"`` when the entity exists but has
        no ``wdt:P699`` value, or ``"2"`` when no entity carries this alias.
    """
    endpoint_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
    # Escape backslashes and single quotes so a name such as
    # "hodgkin's lymphoma" stays a single valid SPARQL string literal instead
    # of terminating the quoted literal early and breaking the query.
    escaped = disease.lower().replace("\\", "\\\\").replace("'", "\\'")
    alias_literal = "'" + escaped + "'@en"

    # Step 1: find the entity that lists the disease name as an alias.
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                select ?s
                where{ ?s <http://www.w3.org/2004/02/skos/core#altLabel> """ + alias_literal + """ .}""")
    sparql.setReturnFormat(JSON)
    entities = sparql.query().convert()["results"]["bindings"]
    if not entities:
        # 2 indicates no result because this term does not exist in Wikidata
        return "2"

    # Step 2: use the first entity as the subject and ask for its DO ID.
    subject = "<" + entities[0]["s"]["value"] + ">"
    sparql = SPARQLWrapper(endpoint_url)
    # wdt:P699 is the predicate for do id
    sparql.setQuery("""PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P699 ?o.}""")
    sparql.setReturnFormat(JSON)
    identifiers = sparql.query().convert()["results"]["bindings"]
    if not identifiers:
        # 1 indicates no result because there is no predicate of "doid"
        return "1"
    return identifiers[0]["o"]["value"]

In [70]:
# Retry through the alias lookup every disease whose label lookup returned "2".
unresolved_names = [name for name, code in disease_dict_do.items() if code == '2']
for name in unresolved_names:
    disease_dict_do[name] = getDOIDFromAlias(name)

In [71]:
# Show each disease next to its DO ID (or remaining failure code).
for disease_name, do_code in disease_dict_do.items():
    print(disease_name, do_code)

Liposarcoma DOID:3382
Colon cancer DOID:219
Meningioma DOID:3565
Breast cancer DOID:1612
Melanoma DOID:1909
Lung cancer DOID:1324
Gastrointestinal stromal tumor DOID:9253
Lung adenocarcinoma DOID:3910
Gastric cancer DOID:10534
Cholangiocarcinoma DOID:4947
Cervical cancer DOID:4362
Pancreatic cancer 2
Prostate cancer DOID:10283
Colorectal cancer DOID:9256
Pancreatic adenocarcinoma DOID:4074
Pancreatic ductal adenocarcinoma 1
Ampullary carcinoma DOID:4932


# DO - Query parents and children

In [72]:
import requests

In [73]:
def getDOParent(doid, timeout=30):
    """Fetch the parents of a Disease Ontology term from the DO metadata API.

    Args:
        doid: Disease Ontology identifier appended to the metadata URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive server.

    Returns:
        The element at index 1 of every entry under the ``"parents"`` key of
        the JSON response, each followed by a comma and concatenated; an
        empty string when the response has no ``"parents"`` key.
    """
    url = 'http://www.disease-ontology.org/api/metadata/' + doid
    response = requests.get(url, timeout=timeout)
    metadata = response.json()
    if 'parents' not in metadata:
        return ""
    # Every value keeps a trailing comma; the caller trims the last one.
    return "".join(entry[1] + "," for entry in metadata["parents"])

In [74]:
def getDOChild(doid, timeout=30):
    """Fetch the children of a Disease Ontology term from the DO metadata API.

    Args:
        doid: Disease Ontology identifier appended to the metadata URL.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block indefinitely on an unresponsive server.

    Returns:
        The element at index 0 of every entry under the ``"children"`` key of
        the JSON response, each followed by a comma and concatenated; an
        empty string when the response has no ``"children"`` key.
    """
    url = 'http://www.disease-ontology.org/api/metadata/' + doid
    response = requests.get(url, timeout=timeout)
    metadata = response.json()
    if 'children' not in metadata:
        return ""
    # Every value keeps a trailing comma; the caller trims the last one.
    return "".join(entry[0] + "," for entry in metadata["children"])

In [80]:
def getAllDOExpansionTerms(doid):
    """Build a space-separated string of parent and child terms for a DO ID.

    Args:
        doid: Disease Ontology identifier passed to ``getDOParent`` and
            ``getDOChild``.

    Returns:
        The comma-separated pieces of both results, each stripped and joined
        by single spaces; an empty string when both results are empty.
    """
    combined = getDOParent(doid) + getDOChild(doid)
    # Both helpers end each value with a comma, so drop the last character
    # before splitting.
    pieces = [piece.strip() for piece in combined[:-1].split(',')]
    return " ".join(pieces).strip()

In [82]:
# write into xml
tree = ET.parse('/Users/arthur_0804/Desktop/Thesis/RelevantDocs/topics2017.xml')
root = tree.getroot()

# Several topics share the same disease, so each DO ID is expanded only once.
expansion_by_doid = {}
for topic in root:
    disease_element = topic[0]
    original_disease = disease_element.text
    doid = disease_dict_do[original_disease]
    # "1" and "2" are lookup-failure codes, not DO IDs: keep the original text.
    if doid in ("1", "2"):
        continue
    if doid not in expansion_by_doid:
        expansion_by_doid[doid] = getAllDOExpansionTerms(doid)
    expansion_terms = expansion_by_doid[doid]
    # A term without parents or children yields an empty expansion; keep the
    # original disease text rather than overwriting it with an empty string.
    if expansion_terms != "":
        disease_element.text = expansion_terms
tree.write("do_expanded.xml")