"""
<kindDef reference="false">
<name>PHYSIOLOGIC_EFFECT_KIND</name>
<code>C6</code>
<id>7</id>
<namespace>NDF-RT2 Public</namespace>
</kindDef>
"""

"""
<conceptDef>
<name>FLUOXETINE</name>
<code>C14796</code>
<id>14797</id>
<namespace>NDF-RT2 Public</namespace>
<primitive/>
<kind>C8</kind>
<definingConcepts><concept>C288632</concept><concept>C635102</concept></definingConcepts>

<definingRoles>
<role><some/><name>C34</name><value>C1438</value></role>
<role><some/><name>C30</name><value>C546</value></role>
<role><some/><name>C34</name><value>C4584</value></role>
...
</definingRoles>

<properties>
<property><name>C98</name><value>Ingredient</value></property>
<property><name>C140</name><value>C0016365</value></property>
<property><name>C264</name><value>FLUOXETINE</value></property>
<property><name>C818</name><value>4493</value></property>
<property><name>C262721</name><value>50.416^2381^Active/Master</value>
    <qualifiers><qualifier><name>C28476375667298</name><value>50.416</value></qualifier>
    <qualifier><name>C28489595891377</name><value>2381</value></qualifier>
    <qualifier><name>Q147</name><value>Active/Master</value></qualifier>
    </qualifiers>
</property>
...
</properties>
</conceptDef>
"""

In [11]:
class NDFRT_item:
    """A single NDF-RT concept together with its decoded roles and properties.

    Attributes:
        name: The concept name.
        code: The concept code (e.g. ``'C14796'``).
        kind: The code of the kind the concept belongs to (e.g. ``'C8'``).
        roles: Mapping of role name -> list of target concept codes.
        props: Mapping of property name -> list of values. Both formatters
            read it with ``.get``/``[]``, so it must be a mapping.
        drug_used_for_treatment: List of drug records (as produced by
            ``format_drug``) attached to this concept; starts empty.
    """

    def __init__(self, name = "", code = "", kind = "", roles = None, props = None):
        """Store the concept fields.

        ``roles`` and ``props`` default to a *fresh* empty dict per instance.
        A literal ``[]``/``{}`` default would be created once and shared by
        every instance built without that argument.
        """
        self.name = name
        self.code = code
        self.kind = kind
        self.roles = roles if roles is not None else {}
        self.props = props if props is not None else {}
        self.drug_used_for_treatment = []

    def __repr__(self):
        """Return name, code, kind, roles and props, one per line."""
        return "\n".join([self.name, self.code, self.kind, str(self.roles), str(self.props)])

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

    def _first_prop(self, key):
        """Return the first value stored under ``key`` in ``props``, or ``''`` if there is none."""
        values = self.props.get(key)
        return values[0] if values else ''

    def format_drug(self):
        """Return a flat dict describing this concept as a drug.

        Keys are ``name``, ``level``, ``fda_unii``, ``nui``, ``rxnorm_cui`` and
        ``umls_cui``; a key is omitted when the matching property is missing
        or empty.
        """
        d = {'name': self._first_prop('Display_Name'),
             'level': self._first_prop('Level'),
             'fda_unii': self._first_prop('FDA_UNII'),
             'nui': self._first_prop('NUI'),
             'rxnorm_cui': self._first_prop('RxNorm_CUI'),
             'umls_cui': self._first_prop('UMLS_CUI'),
             }
        # Drop the entries for which no value was found.
        return {k: v for k, v in d.items() if v}

    def format_disease(self):
        """Return a nested dict describing this concept as a disease.

        The ``Display_Name`` and ``UMLS_CUI`` properties are required.

        Raises:
            KeyError: if ``Display_Name`` or ``UMLS_CUI`` is missing.
        """
        d = {'name': self.props['Display_Name'][0],
             '_id': 'umls_cui:' + self.props['UMLS_CUI'][0],
             'synonyms': self.props.get('Synonym', []),
             'xref': {
                 'mesh': self.props.get('MeSH_DUI', []),
                 'nui': self.props.get('NUI', []),
                 'rxnorm_cui': self.props.get('RxNorm_CUI', []),
                 'snomedct_us_2016_03_01': self.props.get('SNOMED_CID', []),
                 },
             # The list itself is referenced, not copied.
             'drugs_used_for_treatment': self.drug_used_for_treatment
             }
        return d

In [12]:
import xml.etree.ElementTree as et
from collections import defaultdict
# Location of the NDF-RT TDE XML export. This is an absolute, machine-specific
# path; change it here rather than inside the parsing code.
NDFRT_XML_PATH = '/home/gstupp/projects/biothings/mydisease/mydisease/data/NDFRT_Public_2016.09.06_TDE.xml'


def _group_values_by_name(elements, code_to_name):
    """Group the ``<value>`` text of each element under its decoded ``<name>``.

    ``elements`` are ``<role>`` or ``<property>`` elements whose ``<name>``
    child holds a code; ``code_to_name`` translates that code to a readable
    name. Returns a plain dict of name -> list of value texts.

    Raises:
        KeyError: if an element's name code is not in ``code_to_name``.
    """
    grouped = defaultdict(list)
    for element in elements:
        decoded_name = code_to_name[element.find("name").text]
        grouped[decoded_name].append(element.find("value").text)
    return dict(grouped)


items = dict()   # concept code -> NDFRT_item
kinds = dict()   # code -> name, for kinds, roles and properties alike
tree = et.parse(NDFRT_XML_PATH)
root = tree.getroot()

# kindDef, roleDef and propertyDef all carry a <name>/<code> pair, so a single
# lookup table decodes all three. Order is kept so that a later definition
# still overrides an earlier one sharing the same code.
for definition_tag in ('kindDef', 'roleDef', 'propertyDef'):
    for definition in root.findall(definition_tag):
        kinds[definition.find('code').text] = definition.find('name').text

for conceptdef in root.findall('conceptDef'):
    name = conceptdef.find('name').text
    code = conceptdef.find('code').text
    kind = conceptdef.find('kind').text
    # The path form returns [] when the container element is absent, instead
    # of failing on ``None.findall``.
    roles = _group_values_by_name(conceptdef.findall('definingRoles/role'), kinds)
    props = _group_values_by_name(conceptdef.findall('properties/property'), kinds)

    items[code] = NDFRT_item(name=name, code=code, kind=kind, roles=roles, props=props)

In [13]:
# Spot-check the drug formatter on FLUOXETINE (concept code C14796).
items['C14796'].format_drug()

{'fda_unii': '01K63SUP8D',
 'level': 'Ingredient',
 'name': 'FLUOXETINE',
 'nui': 'N0000147852',
 'rxnorm_cui': '4493',
 'umls_cui': 'C0016365'}

In [14]:
# Inspect the decoded property dict of concept C1438 (the 'Bulimia' entry).
items['C1438'].props

{'Display_Name': ['Bulimia'],
 'MeSH_CUI': ['M0003021'],
 'MeSH_DUI': ['D002032'],
 'MeSH_Definition': ['Eating an excess amount of food in a short period of time, as seen in the disorder of BULIMIA NERVOSA. It is caused by an abnormal craving for food, or insatiable hunger also known as "ox hunger".'],
 'MeSH_Name': ['Bulimia'],
 'NUI': ['N0000000630'],
 'RxNorm_CUI': ['1024255'],
 'SNOMED_CID': ['78004001'],
 'Synonym': ['Binge Eating'],
 'UMLS_CUI': ['C0006370']}

In [15]:
## We only want the may_treat links, so keep just the two kinds of concept they connect.
# 'C16' is the kind of the concepts used as diseases here (e.g. C1438, Bulimia);
# 'C8' is the kind of the drug concepts (e.g. C14796, FLUOXETINE).
# Both dicts stay keyed by concept code and hold the same NDFRT_item objects as `items`.
diseases = {k:v for k,v in items.items() if v.kind == 'C16'}
drugs = {k:v for k,v in items.items() if v.kind == 'C8'}

In [16]:
# Attach every drug to the diseases it may treat.
# The lists are emptied first so that re-running this cell rebuilds them
# instead of appending every drug a second time.
for disease in diseases.values():
    disease.drug_used_for_treatment = []

for drug in drugs.values():
    # Concept codes of the targets of this drug's 'may_treat {NDFRT}' role.
    drug_treats = drug.roles.get('may_treat {NDFRT}', [])
    # `dt` is deliberately left as a plain loop variable: it stays bound to the
    # last code processed after the loop ends.
    for dt in drug_treats:
        # Raises KeyError if a may_treat target is not among `diseases`.
        diseases[dt].drug_used_for_treatment.append(drug.format_drug())

In [17]:
# `dt` is the loop variable left over from the may_treat linking loop, so this
# shows the drug list of whichever disease code that loop processed last.
# NOTE(review): this depends on leftover kernel state; index by an explicit
# concept code if a specific disease is meant.
diseases[dt].drug_used_for_treatment

[{'fda_unii': '8GTS82S83M',
  'level': 'Ingredient',
  'name': 'DIPHENHYDRAMINE',
  'nui': 'N0000147816',
  'rxnorm_cui': '3498',
  'umls_cui': 'C0012522'},
 {'fda_unii': 'RO16TQF95Y',
  'level': 'Ingredient',
  'name': 'TRIFLUPROMAZINE',
  'nui': 'N0000148043',
  'rxnorm_cui': '10805',
  'umls_cui': 'C0040989'},
 {'fda_unii': '8ETK1WAF6R',
  'level': 'Ingredient',
  'name': 'THIETHYLPERAZINE',
  'nui': 'N0000148028',
  'rxnorm_cui': '10471',
  'umls_cui': 'C0039865'},
 {'fda_unii': 'JB937PER5C',
  'level': 'Ingredient',
  'name': 'DIMENHYDRINATE',
  'nui': 'N0000146389',
  'rxnorm_cui': '3444',
  'umls_cui': 'C0012381'},
 {'fda_unii': '7S5I7G3JQL',
  'level': 'Ingredient',
  'name': 'DEXAMETHASONE',
  'nui': 'N0000146273',
  'rxnorm_cui': '3264',
  'umls_cui': 'C0011777'},
 {'fda_unii': '7J8897W37S',
  'level': 'Ingredient',
  'name': 'DRONABINOL',
  'nui': 'N0000146833',
  'rxnorm_cui': '10402',
  'umls_cui': 'C0039663'},
 {'fda_unii': 'L4YEB44I46',
  'level': 'Ingredient',
  'name':

In [18]:
# Full disease record for C1438 (Bulimia), including the drugs linked to it
# through the may_treat role.
diseases['C1438'].format_disease()

{'_id': 'umls_cui:C0006370',
 'drugs_used_for_treatment': [{'fda_unii': '2U1W68TROF',
   'level': 'Ingredient',
   'name': 'MAPROTILINE',
   'nui': 'N0000147895',
   'rxnorm_cui': '6646',
   'umls_cui': 'C0024778'},
  {'fda_unii': '01K63SUP8D',
   'level': 'Ingredient',
   'name': 'FLUOXETINE',
   'nui': 'N0000147852',
   'rxnorm_cui': '4493',
   'umls_cui': 'C0016365'},
  {'fda_unii': '2DS058H2CF',
   'level': 'Ingredient',
   'name': 'FENFLURAMINE',
   'nui': 'N0000147845',
   'rxnorm_cui': '4328',
   'umls_cui': 'C0015827'}],
 'name': 'Bulimia',
 'synonyms': ['Binge Eating'],
 'xref': {'mesh': ['D002032'],
  'nui': ['N0000000630'],
  'rxnorm_cui': ['1024255'],
  'snomedct_us_2016_03_01': ['78004001']}}

In [27]:
# Re-key the diseases by their '_id' ('umls_cui:<CUI>') and replace each
# NDFRT_item with its formatted record. Each item is formatted once and the
# result reused for both key and value.
# NOTE(review): items sharing a UMLS CUI collapse onto one key, and the last
# one wins silently — confirm that is acceptable.
# NOTE(review): this rebinds `diseases` from NDFRT_item objects to plain dicts,
# so the cell cannot be re-run without re-running the cells that build it.
diseases = {record['_id']: record
            for record in (item.format_disease() for item in diseases.values())}

4802

In [26]:
from collections import Counter
# NOTE(review): Counter() is built with no argument here, so this expression
# evaluates to an empty list. The saved output lists per-'_id' counts, so the
# iterable that was counted (presumably the '_id' of every disease record —
# TODO confirm) appears to have been removed after the cell was run.
Counter().most_common()

[('umls_cui:C0009450', 2),
 ('umls_cui:C0035921', 1),
 ('umls_cui:C0012624', 1),
 ('umls_cui:C0026267', 1),
 ('umls_cui:C0042164', 1),
 ('umls_cui:C0266463', 1),
 ('umls_cui:C0022744', 1),
 ('umls_cui:C0038986', 1),
 ('umls_cui:C0004943', 1),
 ('umls_cui:C0023533', 1),
 ('umls_cui:C0026603', 1),
 ('umls_cui:C0031880', 1),
 ('umls_cui:C0016788', 1),
 ('umls_cui:C0007766', 1),
 ('umls_cui:C0033581', 1),
 ('umls_cui:C0031887', 1),
 ('umls_cui:C0018566', 1),
 ('umls_cui:C0751422', 1),
 ('umls_cui:C0011881', 1),
 ('umls_cui:C0032578', 1),
 ('umls_cui:C0015379', 1),
 ('umls_cui:C0206157', 1),
 ('umls_cui:C0020413', 1),
 ('umls_cui:C0265985', 1),
 ('umls_cui:C0024586', 1),
 ('umls_cui:C0021833', 1),
 ('umls_cui:C0004610', 1),
 ('umls_cui:C0018621', 1),
 ('umls_cui:C0013403', 1),
 ('umls_cui:C0019624', 1),
 ('umls_cui:C2350019', 1),
 ('umls_cui:C0238198', 1),
 ('umls_cui:C0037937', 1),
 ('umls_cui:C0010964', 1),
 ('umls_cui:C0085257', 1),
 ('umls_cui:C0028326', 1),
 ('umls_cui:C0032285', 1),
 