In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import xml.etree.ElementTree as ET

In [2]:
tree = ET.parse('/Users/arthur_0804/Desktop/Thesis/RelevantDocs/topics2017.xml')

In [3]:
root = tree.getroot()

In [4]:
disease_list = []
for i in range(0,30):
        disease_list.append(root[i][0].text)

In [5]:
# function of getting mesh id from Wikidata
def getMeshIDFromLabel(disease):
    sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    disease = disease.lower()
    disease = """'""" + disease + """'"""
    # first we need to get the label
    query = """PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?s ?p ?o
                WHERE { ?s rdfs:label""" + disease + """@en}"""
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    result = results["results"]["bindings"]
    if len(result) > 0:
        # retrieve the result from Wikidata
        label = result[0]["s"]["value"]
        # make it as a subject URI
        subject = """<""" + label + """>"""
        sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
        # then use the label as subject to retrieve
        # wdt:P486 is the predicate for mesh id
        query = """PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P486 ?o.}"""
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        result = results["results"]["bindings"]
        if len(result) > 0:
            # get mesh id
            meshid = result[0]["o"]["value"]
        else:
            # 1 indicates no result because there is no predicate of "meshid"
            meshid = "1"
    else:
        # 2 indicates no result because this term does not exist in Wikidata
        # we can try aliases or dbpedia
        meshid = "2"
    return meshid

In [6]:
disease_dict_mesh = {}
# put the result into the dictionary
for x in disease_list:
    meshid = getMeshIDFromLabel(x)
    disease_dict_mesh[x] = meshid

In [7]:
for (k,v) in  disease_dict_mesh.items(): 
    if(v == '1'):
        print("no result because there is no predicate of 'meshid' Disease: {}; MeSH ID: {}".format(k,v))
    elif (v == '2'):
        print("no result because this term does not exist in Wikidata Disease: {}; MeSH ID: {}".format(k,v))
    else:
        print("Disease: {}; MeSH ID: {}".format(k,v))

Disease: Liposarcoma; MeSH ID: D008080
Disease: Colon cancer; MeSH ID: D003110
Disease: Meningioma; MeSH ID: D008579
Disease: Breast cancer; MeSH ID: D001943
Disease: Melanoma; MeSH ID: D008545
Disease: Lung cancer; MeSH ID: D008175
Disease: Gastrointestinal stromal tumor; MeSH ID: D046152
no result because this term does not exist in Wikidata Disease: Lung adenocarcinoma; MeSH ID: 2
no result because this term does not exist in Wikidata Disease: Gastric cancer; MeSH ID: 2
Disease: Cholangiocarcinoma; MeSH ID: D018281
Disease: Cervical cancer; MeSH ID: D002583
Disease: Pancreatic cancer; MeSH ID: D010190
Disease: Prostate cancer; MeSH ID: D011471
Disease: Colorectal cancer; MeSH ID: D015179
no result because there is no predicate of 'meshid' Disease: Pancreatic adenocarcinoma; MeSH ID: 1
Disease: Pancreatic ductal adenocarcinoma; MeSH ID: D021441
no result because this term does not exist in Wikidata Disease: Ampullary carcinoma; MeSH ID: 2


There are some terms which may use other names, and names in our data set are aliases

In [8]:
def getMeshIDFromAlias(disease):
    sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
    disease = disease.lower()
    disease = """'""" + disease + """'"""
    query = """PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                select ?s
                where{ ?s <http://www.w3.org/2004/02/skos/core#altLabel>""" + disease +"""@en .}"""
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    result = results["results"]["bindings"]
    if len(result) > 0:
        altLabel = result[0]["s"]["value"]
        # make it as a subject URI
        subject = """<""" + altLabel + """>"""
        sparql = SPARQLWrapper("https://query.wikidata.org/bigdata/namespace/wdq/sparql")
        # then use the label as subject to retrieve
        # wdt:P486 is the predicate for mesh id
        query = """PREFIX wikibase: <http://wikiba.se/ontology#>
                PREFIX wd: <http://www.wikidata.org/entity/>
                PREFIX wdt: <http://www.wikidata.org/prop/direct/> 
                PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                SELECT ?o WHERE { """ + subject + """ wdt:P486 ?o.}"""
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        result = results["results"]["bindings"]
        
        if len(result) > 0:
            # get mesh id
            meshid = result[0]["o"]["value"]
        else:
            # 1 indicates no result because there is no predicate of "meshid"
            meshid = "1"    
    else:
        # still cannot find this entity
        meshid = "2"
        
    return meshid

In [9]:
for (k,v) in  disease_dict_mesh.items(): 
    if(v == '2'):
        meshid = getMeshIDFromAlias(k)
        disease_dict_mesh[k] = meshid

In [10]:
disease_dict_mesh['Pancreatic adenocarcinoma'] = 'D010190'
disease_dict_mesh['Lung adenocarcinoma'] = "D008175"
for (k,v) in disease_dict_mesh.items():
    print(k,v)

Liposarcoma D008080
Colon cancer D003110
Meningioma D008579
Breast cancer D001943
Melanoma D008545
Lung cancer D008175
Gastrointestinal stromal tumor D046152
Lung adenocarcinoma D008175
Gastric cancer D013274
Cholangiocarcinoma D018281
Cervical cancer D002583
Pancreatic cancer D010190
Prostate cancer D011471
Colorectal cancer D015179
Pancreatic adenocarcinoma D010190
Pancreatic ductal adenocarcinoma D021441
Ampullary carcinoma 1


In [11]:
def getDiseaseName(id):
    '''
    Get the MeSH label for a given MeSH id
    '''
        
    if id != "None":
        subject = """mesh:""" + id
        sparql = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
        sparql.setQuery("""
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>
            PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
            PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
            PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
            PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
            PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
            PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
            PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
            SELECT DISTINCT ?s ?p ?o
            FROM <http://id.nlm.nih.gov/mesh>
            WHERE {""" + subject + """ <http://www.w3.org/2000/01/rdf-schema#label>  ?o.}""")
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        result = results["results"]["bindings"]
        if len(result) > 0:
            return result[0]["o"]["value"]
        else:
            return "None"
    else:
        return "None"

In [12]:
getDiseaseName("D008080")

'Liposarcoma'

This function returns the MeSH tree node by MeSH ID

In [13]:
def getDiseaseTreeNode(id):
    '''
    Get the MeSH node id for a given MeSH id
    '''
        
    subject = """mesh:""" + id
    sparql = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
    sparql.setQuery("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
        PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
        PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
        PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
        PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
        PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
        PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
        SELECT DISTINCT ?s ?p ?o
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE {""" + subject + """ meshv:treeNumber  ?o.}""")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    result = results["results"]["bindings"]
    
    tree_node_list = []
    if len(result) > 0:
        for i in result:
            tree_node_list.append(i["o"]["value"])
    
    for i in range(0,len(tree_node_list)):
        tree_node_list[i] = tree_node_list[i][27:]
    
    return tree_node_list

In [14]:
getDiseaseTreeNode("D008080")

['C04.557.450.550.420', 'C04.557.450.795.465']

In [15]:
# we only use the disease part (starts with "C")
records = []
meshFile = '/Users/arthur_0804/Desktop/mtrees2019.bin'
with open(meshFile, mode='r') as file:
    for line in file:
        record = [x.strip() for x in line.split(";")]
        if record[1][0] == "C":
            records.append(record)

In [16]:
def getParentNode(nodeid):
    '''
    Get the parent node id for a given node
    '''
    return nodeid[:-4]

In [17]:
def getChildrenNode(nodeid):
    '''
    Get all the chilren node id for a given node
    '''
    children = []
    for record in records:
        tree_node = record[1]
        if (tree_node[:len(nodeid)] == nodeid) and (tree_node != nodeid) and (len(tree_node)-len(nodeid) == 4):
            children.append(tree_node)
    return children

In [18]:
def getAllTerms(nodeid):
    '''
    Get the parent/children/sibling/sibling's children node id for a given node
    '''
    
    results = []
    
    # get parent
    parent_node = getParentNode(nodeid)
    results.append(parent_node)
    
    # get children
    results += getChildrenNode(nodeid)
    
    # get siblings
    siblings = getChildrenNode(parent_node)
    results += siblings
    
    # get siblings' children
    for sibling in siblings:
        sibling_children = getChildrenNode(sibling)
        results += sibling_children
    
    return results

In [19]:
# test
getAllTerms("C04.557.450.550.420")

['C04.557.450.550',
 'C04.557.450.550.420.425',
 'C04.557.450.550.100',
 'C04.557.450.550.125',
 'C04.557.450.550.400',
 'C04.557.450.550.420',
 'C04.557.450.550.710',
 'C04.557.450.550.400.500',
 'C04.557.450.550.420.425']

In [20]:
def getMeshidByNode(nodeid):
    '''
    Get the MeSH id for a given node id
    '''
    
    node = """mesh:""" + nodeid
    sparql = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")
    sparql.setQuery("""
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
        PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
        PREFIX mesh2015: <http://id.nlm.nih.gov/mesh/2015/>
        PREFIX mesh2016: <http://id.nlm.nih.gov/mesh/2016/>
        PREFIX mesh2017: <http://id.nlm.nih.gov/mesh/2017/>
        PREFIX mesh2018: <http://id.nlm.nih.gov/mesh/2018/>
        PREFIX mesh2019: <http://id.nlm.nih.gov/mesh/2019/>
        SELECT DISTINCT ?s ?p ?o
        FROM <http://id.nlm.nih.gov/mesh>
        WHERE { ?s meshv:treeNumber """ +  node + """}""")
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    result = results["results"]["bindings"]
    
    return result[0]["s"]["value"][27:]

In [21]:
# test
getMeshidByNode("C04.557.450.795.465")

'D008080'

In [22]:
# test
for node in getAllTerms("C04.557.450.550.420"):
    print(getMeshidByNode(node))

D018205
D018208
D018206
D018207
D008067
D008080
D018209
D062689
D018208


In [23]:
for (k,v) in disease_dict_mesh.items():
    disease = k
    nodes = getDiseaseTreeNode(v)
    
    print("disease: {}".format(k))
    for node in nodes:
        if node.count(".") >= 3:
            print("\tnode: {}".format(node))
            othernodes = getAllTerms(node)
            for othernode in othernodes:
                print("\t\t{}".format(getDiseaseName(getMeshidByNode(othernode))))

disease: Liposarcoma
	node: C04.557.450.550.420
		Neoplasms, Adipose Tissue
		Liposarcoma, Myxoid
		Angiolipoma
		Angiomyolipoma
		Lipoma
		Liposarcoma
		Myelolipoma
		Lipoblastoma
		Liposarcoma, Myxoid
	node: C04.557.450.795.465
		Sarcoma
		Liposarcoma, Myxoid
		Adenosarcoma
		Carcinosarcoma
		Chondrosarcoma
		Desmoplastic Small Round Cell Tumor
		Endometrial Stromal Tumors
		Fibrosarcoma
		Hemangiosarcoma
		Histiocytoma, Malignant Fibrous
		Leiomyosarcoma
		Liposarcoma
		Lymphangiosarcoma
		Mixed Tumor, Mesodermal
		Myosarcoma
		Myxosarcoma
		Osteosarcoma
		Phyllodes Tumor
		Sarcoma, Alveolar Soft Part
		Sarcoma, Clear Cell
		Sarcoma, Experimental
		Sarcoma, Kaposi
		Sarcoma, Myeloid
		Sarcoma, Small Cell
		Sarcoma, Synovial
		Carcinoma 256, Walker
		Chondrosarcoma, Clear Cell
		Chondrosarcoma, Mesenchymal
		Sarcoma, Endometrial Stromal
		Dermatofibrosarcoma
		Neurofibrosarcoma
		Liposarcoma, Myxoid
		Rhabdomyosarcoma
		Osteosarcoma, Juxtacortical
		Sarcoma, Ewing
		Sarcoma 37
		Sarc

		Jejunal Neoplasms
	node: C06.405.249.767
		Gastrointestinal Neoplasms
		Esophageal Neoplasms
		Gastrointestinal Stromal Tumors
		Intestinal Neoplasms
		Stomach Neoplasms
		Zollinger-Ellison Syndrome
		Esophageal Squamous Cell Carcinoma
		Cecal Neoplasms
		Colorectal Neoplasms
		Duodenal Neoplasms
		Ileal Neoplasms
		Immunoproliferative Small Intestinal Disease
		Jejunal Neoplasms
	node: C06.405.748.789
		Stomach Diseases
		Achlorhydria
		Diverticulosis, Stomach
		Duodenogastric Reflux
		Gastric Antral Vascular Ectasia
		Gastric Dilatation
		Gastric Outlet Obstruction
		Gastritis
		Gastroparesis
		Peptic Ulcer
		Postgastrectomy Syndromes
		Stomach Neoplasms
		Stomach Rupture
		Stomach Volvulus
		Zollinger-Ellison Syndrome
		Bile Reflux
		Pyloric Stenosis
		Gastritis, Atrophic
		Gastritis, Hypertrophic
		Duodenal Ulcer
		Esophagitis, Peptic
		Peptic Ulcer Perforation
		Stomach Ulcer
		Zollinger-Ellison Syndrome
		Dumping Syndrome
	node: C04.588.274.476.767
		Gastrointestinal Neoplasms


		Intestinal Neoplasms
		Adenomatous Polyposis Coli
		Colonic Neoplasms
		Colorectal Neoplasms, Hereditary Nonpolyposis
		Rectal Neoplasms
		Cecal Neoplasms
		Colorectal Neoplasms
		Duodenal Neoplasms
		Ileal Neoplasms
		Jejunal Neoplasms
		Appendiceal Neoplasms
		Adenomatous Polyposis Coli
		Colonic Neoplasms
		Colorectal Neoplasms, Hereditary Nonpolyposis
		Rectal Neoplasms
disease: Pancreatic adenocarcinoma
	node: C04.588.274.761
		Digestive System Neoplasms
		Adenoma, Islet Cell
		Carcinoma, Islet Cell
		Carcinoma, Pancreatic Ductal
		Pancreatic Intraductal Neoplasms
		Biliary Tract Neoplasms
		Gastrointestinal Neoplasms
		Liver Neoplasms
		Pancreatic Neoplasms
		Peritoneal Neoplasms
		Bile Duct Neoplasms
		Gallbladder Neoplasms
		Esophageal Neoplasms
		Intestinal Neoplasms
		Stomach Neoplasms
		Adenoma, Liver Cell
		Carcinoma, Hepatocellular
		Liver Neoplasms, Experimental
		Adenoma, Islet Cell
		Carcinoma, Islet Cell
		Carcinoma, Pancreatic Ductal
		Pancreatic Intraductal Neoplas