## MESH

In [2]:
import hashlib
import json
import xml.etree.ElementTree as et
import gzip
import os
import pandas as pd
from unidecode import unidecode

ImportError: No module named 'unidecode'

In [15]:
desc_df = pd.read_csv(os.path.join(DATA_DIR, 'descriptor-terms.tsv'), index_col=0)
desc_df.head()

Unnamed: 0,DescriptorUI,ConceptUI,PreferredConceptYN,TermUI,TermName,ConceptPreferredTermYN,RecordPreferredTermYN
0,D000001,M0000001,Y,T000002,Calcimycin,Y,Y
1,D000001,M0353609,N,T000001,A-23187,Y,N
2,D000001,M0353609,N,T000001,A 23187,N,N
3,D000001,M0353609,N,T000003,Antibiotic A23187,N,N
4,D000001,M0353609,N,T000003,"A23187, Antibiotic",N,N


In [None]:
"""<HeadingMappedToList>
   <HeadingMappedTo>
    <DescriptorReferredTo>
     <DescriptorUI>*D020739</DescriptorUI>
     <DescriptorName>
      <String>Brain Diseases, Metabolic, Inborn</String>
     </DescriptorName>
    </DescriptorReferredTo>
   </HeadingMappedTo>
  </HeadingMappedToList>
"""

In [6]:
def populate_mesh():
    """
    Initially populate mesh term. Only works starting with empty table

    """
    # Read MeSH descriptor and supplementary terms
    desc_df = pd.read_table(os.path.join(DATA_DIR, 'descriptor-terms.tsv'))
    supp_df = pd.read_table(os.path.join(DATA_DIR, 'supplemental-terms.tsv'))
    desc_df.TermName = desc_df.TermName.str.lower()
    supp_df.TermName = supp_df.TermName.str.lower()
    # Get the preferred name for each term
    preferred_name = dict()
    for term, df in desc_df.groupby("DescriptorUI"):
        preferred_name[term] = list(df[(df.PreferredConceptYN == "Y") & (df.ConceptPreferredTermYN == "Y") & (
            df.RecordPreferredTermYN == "Y")].TermName)[0]
    for term, df in supp_df.groupby("SupplementalRecordUI"):
        preferred_name[term] = list(df[(df.PreferredConceptYN == "Y") & (df.ConceptPreferredTermYN == "Y") & (
            df.RecordPreferredTermYN == "Y")].TermName)[0]
    # Fix unicode nonsense
    upreferred_name = {k: unidecode(v.decode('utf-8')) for k, v in preferred_name.items()}


def parse_mesh_parents(xml_path):
    """
    Get the heirarchy of mesh terms.
    Not doing anything with this yet...

    :param xml_path: path to "desc2016.xml" file
    :return: None
    """
    terms = list()
    f = open(xml_path)
    context = iter(et.iterparse(f, events=("start", "end")))
    event, root = next(context)
    for event, elem in context:
        if event == "end" and elem.tag == "DescriptorRecord":
            term = dict()
            term['mesh_id'] = elem.findtext('DescriptorUI')
            term['mesh_name'] = elem.findtext('DescriptorName/String')
            term['tree_numbers'] = [x.text for x in elem.findall('TreeNumberList/TreeNumber')]
            terms.append(term)
        root.clear()

    # stolen from https://github.com/dhimmel/mesh/blob/gh-pages/descriptors.ipynb
    # Determine ontology parents
    tree_number_to_id = {tn: term['mesh_id'] for term in terms for tn in term['tree_numbers']}
    for term in terms:
        parents = set()
        for tree_number in term['tree_numbers']:
            try:
                parent_tn, self_tn = tree_number.rsplit('.', 1)
                parents.add(tree_number_to_id[parent_tn])
            except ValueError:
                pass
        term['parents'] = list(parents)

    with open(os.path.join(DATA_DIR, 'mesh.json'), 'w') as f:
        json.dump(terms, f, indent=2)


def parse_mesh_xml(xml_path):
    # based on: https://github.com/dhimmel/mesh/blob/gh-pages/descriptors.ipynb
    if "desc" in os.path.basename(xml_path):
        record_name = "DescriptorRecord"
        recordUI = "DescriptorUI"
        out_tsv = "descriptor-terms.tsv"
    elif "supp" in os.path.basename(xml_path):
        record_name = "SupplementalRecord"
        recordUI = "SupplementalRecordUI"
        out_tsv = "supplemental-terms.tsv"
    else:
        raise ValueError("Unknown mesh xml type")

    f = open(xml_path)
    # for parsing an xml iteratively (without using 6gb of ram)
    context = iter(et.iterparse(f, events=("start", "end")))
    event, root = next(context)
    term_dicts = list()
    for event, elem in context:
        if event == "end" and record_name == elem.tag:
            for concept in elem.findall("ConceptList/Concept"):
                for term in concept.findall('TermList/Term'):
                    term_dict = {
                        recordUI: elem.findtext(recordUI),
                        'ConceptUI': concept.findtext('ConceptUI'),
                        'TermUI': term.findtext('TermUI'),
                        'TermName': term.findtext('String')
                    }
                    term_dict.update(concept.attrib)
                    term_dict.update(term.attrib)
                    term_dicts.append(term_dict)
        root.clear()

    columns = [recordUI, 'ConceptUI', 'PreferredConceptYN', 'TermUI', 'TermName', 'ConceptPreferredTermYN',
               'RecordPreferredTermYN', ]
    term_df = pd.DataFrame(term_dicts)[columns]
    term_df.to_csv(os.path.join(DATA_DIR, out_tsv), encoding='utf-8')



In [20]:
"""
Only updated once a year.
ftp://nlmpubs.nlm.nih.gov/online/mesh/2016/desc2016.xml
ftp://nlmpubs.nlm.nih.gov/online/mesh/2016/supp2016.xml
"""
DATA_DIR = "/home/gstupp/projects/biothings/mydisease/mydisease/data"
mesh_desc = os.path.join(DATA_DIR, "desc2016.xml")
mesh_supp = os.path.join(DATA_DIR, "supp2016.xml")
#parse_mesh_xml(mesh_desc)
#parse_mesh_xml(mesh_supp)
parse_mesh_parents(mesh_desc)
parse_mesh_parents(mesh_supp)
#populate_mesh()


In [49]:
parse_mesh_parents(mesh_supp)

In [50]:
with open(os.path.join(DATA_DIR, "mesh.json")) as f:
    mesh_terms = json.load(f)
print(len(mesh_terms))

0


In [40]:
mesh_terms = [m for m in mesh_terms if any(x.startswith("C") for x in m['tree_numbers'])]
print(len(mesh_terms))

4686


In [41]:
for term in mesh_terms:
    term['_id'] = "MESH:" + term['mesh_id']
    del term['mesh_id']

In [42]:
term

{'_id': 'MESH:D066263',
 'mesh_name': 'Protein Aggregation, Pathological',
 'parents': ['D001669', 'D010335'],
 'tree_numbers': ['C23.550.770', 'G02.111.668']}

In [44]:
from pymongo import MongoClient
client = MongoClient()
db = client.mydisease.mesh
db.insert_many(mesh_terms)

<pymongo.results.InsertManyResult at 0x7f31c560aaf8>

In [45]:
db.count()

4686

In [48]:
db.find_one("MESH:C535306")