In [32]:
import hashlib
import json
import xml.etree.ElementTree as et
import gzip
import os
import pandas as pd
import io
import xml.etree.ElementTree as ET
import sys
import getopt
import locale
import time
from collections import defaultdict
from itertools import groupby, filterfalse
from collections import Counter

import networkx

In [33]:
"""
Only updated once a year.
https://www.nlm.nih.gov/mesh/download_mesh.html

Descriptor Records: ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/asciimesh/d2016.bin
Qualifier Records: ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/asciimesh/q2016.bin
Supplemental Records: ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/asciimesh/c2016.bin
"""
DATA_DIR = "/home/gstupp/projects/biothings/mydisease/mydisease/data"
mesh_desc_path = os.path.join(DATA_DIR, "d2016.bin")
mesh_supp_path = os.path.join(DATA_DIR, "c2016.bin")

## Parse a Descriptor Record
### all attributes listed here: https://www.nlm.nih.gov/mesh/dtype.html

Example record:

In [34]:
record = """*NEWRECORD
RECTYPE = D
MH = Brain Diseases, Metabolic, Inborn
DE = BRAIN DIS METAB INBORN
AQ = BL CF CI CL CO DH DI DT EC EH EM EN EP ET GE HI IM ME MI MO NU PA PC PP PS PX RA RH RI RT SU TH UR US VE VI
PRINT ENTRY = Central Nervous System Inborn Metabolic Diseases|T047|NON|BRD|NLM (2000)|991012|CNS INBORN METAB DIS|abcdefv
PRINT ENTRY = Familial Metabolic Brain Diseases|T047|NON|NRW|NLM (2000)|991012|FAMILIAL METAB BRAIN DIS|abcdefv
ENTRY = Brain Diseases, Metabolic, Familial|T047|NON|NRW|NLM (2000)|991012|BRAIN DIS METAB FAMILIAL|abcdefv
ENTRY = Brain Diseases, Metabolic, Inherited|T047|NON|NRW|NLM (2000)|991012|BRAIN DIS METAB INHERITED|abcdefv
MN = C10.228.140.163.100
MN = C16.320.565.189
MN = C18.452.132.100
MN = C18.452.648.189
FX = Intellectual Disability
MH_TH = NLM (2000)
ST = T047
AN = General, prefer specifics; DF: BRAIN DIS METAB INBORN
PI = Brain/metabolism (1968-1999)
PI = Hereditary Diseases (1968-1999)
MS = Brain disorders resulting from inborn metabolic errors, primarily from enzymatic defects which lead to substrate accumulation, product reduction, or increase in toxic metabolites through alternate pathways. The majority of these conditions are familial, however spontaneous mutation may also occur in utero.
PM = 2000
HN = 2000
MR = 20110705
DA = 19991103
DC = 1
DX = 20000101
UI = D020739"""

In [35]:
# note, only care about disease related attributes.
# for example, ignoring RN CAS REGISTRY/EC NUMBER/UNII CODE
attributes = {'MH': "term",
              'MN': "tree",
              'FX': "see_also",
              'ST': "semantic_type", # see: https://semanticnetwork.nlm.nih.gov/download/SemGroups.txt
              'MS': "note",
              'MR': "last_updated",
              'DC': "descriptor_class",
              'UI': "_id",
              'RECTYPE': "record_type",
              'synonyms':"synonyms"} # added by me from PRINT ENTRY & ENTRY

# TODO: parse PRINT ENTRY and ENTRY completely
# "'D-2-hydroxyglutaric aciduria|T047|EQV|OMIM (2013)|ORD (2010)|090615|abdeef'"

In [36]:
# read in semantic types
st_df = pd.read_csv("https://semanticnetwork.nlm.nih.gov/download/SemGroups.txt", delimiter="|",
           names = ["x0","x1","x2","x3"])
semantic_types = dict(zip(st_df['x2'],st_df['x3']))
disorder_df = st_df.query("x1 == 'Disorders'")[['x2','x3']]
disorders = set(disorder_df['x2'])
disorders

{'T019',
 'T020',
 'T033',
 'T037',
 'T046',
 'T047',
 'T048',
 'T049',
 'T050',
 'T184',
 'T190',
 'T191'}

In [37]:
# read in the mesh data
with open(mesh_desc_path) as f:
    mesh_desc = [x.strip() for x in f.readlines()]

In [38]:
# which attributes can have multiple values?
gb = filterfalse(lambda x: x[0], groupby(mesh_desc, lambda x:x=="*NEWRECORD"))
ds = []
for gb_record in gb:
    record = list(gb_record[1])
    d = dict(Counter([line.split("=",1)[0].strip() for line in record if "=" in line]))
    ds.append(d)
df = pd.DataFrame(ds).fillna(0)
list_attribs = set(df.columns[df.max()>1])
list_attribs

{'EC', 'ENTRY', 'FX', 'MH_TH', 'MN', 'PA', 'PI', 'PRINT ENTRY', 'RR', 'ST'}

In [45]:
# split into records
gb = filterfalse(lambda x: x[0], groupby(mesh_desc, lambda x:x=="*NEWRECORD"))

In [46]:
mesh_terms = []
for gb_record in gb:
    record = list(gb_record[1])
    d = defaultdict(list)
    for line in record:
        if "=" not in line:
            continue
        key = line.split("=",1)[0].strip()
        value = line.split("=",1)[1].strip()
        if key in list_attribs and key in attributes:
            d[attributes[key]].append(value)
        elif key in attributes and key not in {"PRINT ENTRY", "ENTRY"}:
            d[attributes[key]] = value
        elif key == "PRINT ENTRY":
            d['synonyms'].append(value.split("|",1)[0])
        elif key == "ENTRY":
            d['synonyms'].append(value.split("|",1)[0])
            
    if not (set(d['semantic_type']) & disorders):
        continue
    d['semantic_type_id'] = d['semantic_type']
    d['semantic_type'] = [semantic_types[c] for c in d['semantic_type']]
    d['_id'] = "mesh:" + d['_id']
    mesh_terms.append(dict(d))

In [49]:
[x for x in mesh_terms if x['_id'] == "mesh:D020739"][0]

{'_id': 'mesh:D020739',
 'descriptor_class': '1',
 'last_updated': '20110705',
 'note': 'Brain disorders resulting from inborn metabolic errors, primarily from enzymatic defects which lead to substrate accumulation, product reduction, or increase in toxic metabolites through alternate pathways. The majority of these conditions are familial, however spontaneous mutation may also occur in utero.',
 'record_type': 'D',
 'see_also': ['Intellectual Disability'],
 'semantic_type': ['Disease or Syndrome'],
 'semantic_type_id': ['T047'],
 'synonyms': ['Central Nervous System Inborn Metabolic Diseases',
  'Familial Metabolic Brain Diseases',
  'Inborn Errors of Metabolism, Brain',
  'Metabolic Diseases, Inborn, Brain',
  'Brain Diseases, Metabolic, Familial',
  'Brain Diseases, Metabolic, Inherited',
  'Brain Syndrome, Metabolic, Inborn',
  'CNS Metabolic Disorders, Inborn',
  'Central Nervous System Inborn Metabolic Disorders',
  'Encephalopathies, Metabolic, Inborn',
  'Familial Metabolic Dis

## Parse a Supplemental Record
### all attributes listed here: https://www.nlm.nih.gov/mesh/ctype.html

Example record:

In [50]:
record = """*NEWRECORD
RECTYPE = C
NM = 2-Hydroxyglutaricaciduria
RN = 0
SY = 2-Hga|T047|EQV|GHR (2014)|130418|abdef
SY = 2-Hydroxyglutaric Aciduria|T047|EQV|GHR (2014)|130418|abdef
SY = Combined D-2- and L-2-hydroxyglutaric aciduria|T047|EQV|ORD (2010)|090615|abdef
SY = D-2-hydroxyglutaric aciduria|T047|EQV|OMIM (2013)|ORD (2010)|090615|abdeef
SY = L-2-Hydroxyglutaric Acidemia|T047|EQV|OMIM (2013)|111115|abdef
SY = L-2-hydroxyglutaric aciduria|T047|EQV|OMIM (2013)|ORD (2010)|090615|abdeef
HM = Brain Diseases, Metabolic, Inborn
NM_TH = ORD (2010)
ST = T047
FR = 38
NO = Hereditary neurometabolic disorders characterized by DEVELOPMENTAL DELAY; EPILEPSY; HYPOTONIA, and dysmorphic features. Severe cases of D2HGA are homogeneous and are characterized by early infantile-onset epileptic encephalopathy and, CARDIOMYOPATHY. The mild phenotype has a more variable clinical presentation. In L2HGA, patients may also present with ATAXIA; MEGALENCEPHALY, and speech difficulties and the condition deteriorates over time. Mutations in the D2HGDH gene have been identified for D2HGA (OMIM: 600721) and the L2HGDH gene for L2HGA (OMIM: 236792).
DA = 20100625
MR = 20150808
UI = C535306
"""

In [51]:
attributes = {'HM': "mapped_to",
              'MR': "last_updated",
              'NM': "tree",
              'NO': "note",
              'RECTYPE': "record_type",
              'ST': "semantic_type",
              'SY': "synonym",
              'UI': "_id"}

In [52]:
# read in the mesh data
with open(mesh_supp_path) as f:
    mesh_supp = [x.strip() for x in f.readlines()]

In [53]:
# which attributes can have multiple values?
gb = filterfalse(lambda x: x[0], groupby(mesh_supp, lambda x:x=="*NEWRECORD"))
ds = []
for gb_record in gb:
    record = list(gb_record[1])
    d = dict(Counter([line.split("=",1)[0].strip() for line in record if "=" in line]))
    ds.append(d)
df = pd.DataFrame(ds).fillna(0)
list_attribs = set(df.columns[df.max()>1])
list_attribs

{'HM', 'II', 'NM_TH', 'PA', 'PI', 'RR', 'SO', 'ST', 'SY'}

In [54]:
# split into records
gb = filterfalse(lambda x: x[0], groupby(mesh_supp, lambda x:x=="*NEWRECORD"))

In [55]:
mesh_supp_terms = []
for gb_record in gb:
    record = list(gb_record[1])
    d = defaultdict(list)
    for line in record:
        if "=" not in line:
            continue
        key = line.split("=",1)[0].strip()
        value = line.split("=",1)[1].strip()
        if key in list_attribs and key in attributes:
            d[attributes[key]].append(value)
        elif key in attributes and key not in {"SY"}:
            d[attributes[key]] = value
        elif key == "SY":
            d['synonyms'].append(value.split("|",1)[0])
            
    if not (set(d['semantic_type']) & disorders):
        continue
    d['semantic_type_id'] = d['semantic_type']
    d['semantic_type'] = [semantic_types[c] for c in d['semantic_type']]
    d['_id'] = "mesh:" + d['_id']
    mesh_supp_terms.append(dict(d))

In [57]:
[x for x in mesh_supp_terms if x['_id'] == "mesh:C535306"][0]

{'_id': 'mesh:C535306',
 'last_updated': '20150808',
 'mapped_to': ['Brain Diseases, Metabolic, Inborn'],
 'note': 'Hereditary neurometabolic disorders characterized by DEVELOPMENTAL DELAY; EPILEPSY; HYPOTONIA, and dysmorphic features. Severe cases of D2HGA are homogeneous and are characterized by early infantile-onset epileptic encephalopathy and, CARDIOMYOPATHY. The mild phenotype has a more variable clinical presentation. In L2HGA, patients may also present with ATAXIA; MEGALENCEPHALY, and speech difficulties and the condition deteriorates over time. Mutations in the D2HGDH gene have been identified for D2HGA (OMIM: 600721) and the L2HGDH gene for L2HGA (OMIM: 236792).',
 'record_type': 'C',
 'semantic_type': ['Disease or Syndrome'],
 'semantic_type_id': ['T047'],
 'synonym': ['2-Hga|T047|EQV|GHR (2014)|130418|abdef',
  '2-Hydroxyglutaric Aciduria|T047|EQV|GHR (2014)|130418|abdef',
  'Combined D-2- and L-2-hydroxyglutaric aciduria|T047|EQV|ORD (2010)|090615|abdef',
  'D-2-hydroxyglu

In [58]:
print(len(mesh_terms))
print(len(mesh_supp_terms))
print(len(mesh_terms)+len(mesh_supp_terms))

4871
6539
11410


In [59]:
a=len([x for x in mesh_terms if 'Disease or Syndrome' in x['semantic_type']])
b=len([x for x in mesh_supp_terms if 'Disease or Syndrome' in x['semantic_type']])
print(a)
print(b)
print(a+b)

2997
6248
9245


In [60]:
from pymongo import MongoClient
client = MongoClient()
db = client.mydisease.mesh
db.drop()
db.insert_many(mesh_terms)
db.insert_many(mesh_supp_terms)

<pymongo.results.InsertManyResult at 0x7f14624dc948>

In [61]:
db.count()

11410

In [62]:
db.find_one("mesh:C535306")

{'_id': 'mesh:C535306',
 'last_updated': '20150808',
 'mapped_to': ['Brain Diseases, Metabolic, Inborn'],
 'note': 'Hereditary neurometabolic disorders characterized by DEVELOPMENTAL DELAY; EPILEPSY; HYPOTONIA, and dysmorphic features. Severe cases of D2HGA are homogeneous and are characterized by early infantile-onset epileptic encephalopathy and, CARDIOMYOPATHY. The mild phenotype has a more variable clinical presentation. In L2HGA, patients may also present with ATAXIA; MEGALENCEPHALY, and speech difficulties and the condition deteriorates over time. Mutations in the D2HGDH gene have been identified for D2HGA (OMIM: 600721) and the L2HGDH gene for L2HGA (OMIM: 236792).',
 'record_type': 'C',
 'semantic_type': ['Disease or Syndrome'],
 'semantic_type_id': ['T047'],
 'synonym': ['2-Hga|T047|EQV|GHR (2014)|130418|abdef',
  '2-Hydroxyglutaric Aciduria|T047|EQV|GHR (2014)|130418|abdef',
  'Combined D-2- and L-2-hydroxyglutaric aciduria|T047|EQV|ORD (2010)|090615|abdef',
  'D-2-hydroxyglu