## Orphanet Rare Disease Ontology (ORDO)
http://www.orphadata.org/cgi-bin/inc/ordo_orphanet.inc.php

http://bioportal.bioontology.org/ontologies/ORDO

http://data.bioontology.org/ontologies/ORDO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv

In [3]:
from mydisease.utils.common import list2dict
import json
import numpy as np
from collections import defaultdict
import pandas as pd
pd.set_option("display.width", 200)

In [11]:
columns_rename = {"http://data.bioontology.org/metadata/obo/part_of": "part_of",
                  "http://data.bioontology.org/metadata/treeView": "tree_view",
                  "http://www.ebi.ac.uk/efo/alternative_term": "alternative_term",
                  "http://www.ebi.ac.uk/efo/definition": "definition",
                  "http://www.ebi.ac.uk/efo/definition_citation": "definition_citation",
                  "http://www.ebi.ac.uk/efo/reason_for_obsolescence": "reason_for_obsolescence",
                  "http://www.geneontology.org/formats/oboInOwl#hasDbXref": "xref",
                  "http://www.orpha.net/ORDO/Orphanet_#symbol": "symbol",
                  "Synonyms": "synonyms",
                  "Obsolete": "obsolete",
                  "Class ID": "id",
                  "Preferred Label": "preferred_label",
                  "Parents": "parents",
                  "Definitions": "definitions"
                  }
df = pd.read_csv("/home/gstupp/projects/biothings/mydisease/mydisease/data/ORDO.csv.gz")
del df["http://www.geneontology.org/formats/oboInOwl#hasDbXref"]
# throw away columns that are all null
df = df[df.columns[df.isnull().sum()!=len(df)]]
df = df.rename(index=str, columns=columns_rename)
df = df[~df.obsolete]
df = df[df.symbol.isnull()]
del df['obsolete']

for col in ['parents','part_of','tree_view','id']:
    df[col] = df[col].str.replace("http://www.orpha.net/ORDO/", "").str.replace("_",":").str.lower()
list_attribs = ['synonyms','parents','part_of','tree_view','alternative_term']
for col in list_attribs:
    df[col] = df[col].str.split("|").copy()
df.rename(columns={'id':'_id'}, inplace=True)
df.head()

Unnamed: 0,_id,preferred_label,synonyms,definitions,parents,part_of,tree_view,alternative_term,definition,definition_citation,reason_for_obsolescence,symbol
0,orphanet:1390,Night blindness-skeletal anomalies-dysmorphism...,[Hunter-Thompson-Reed syndrome],,[orphanet:377789],"[orphanet:102285, orphanet:330206]","[orphanet:102285, orphanet:330206]",[Hunter-Thompson-Reed syndrome],,,,
2,orphanet:264694,Interstitial lung disease specific to infancy,[ILD specific to infancy],,"[orphanet:377794, orphanet:264665]",,,[ILD specific to infancy],,,,
3,orphanet:69735,Hypotrichosis-lymphedema-telangiectasia-renal ...,[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,[orphanet:377788],"[orphanet:89832, orphanet:102373]","[orphanet:89832, orphanet:102373]",[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,orphanet,,
4,orphanet:454,Acquired ichthyosis,,,[orphanet:377788],[orphanet:79354],[orphanet:79354],,,,,
5,orphanet:3451,West syndrome,"[Infantile spasms, Intellectual disability-hyp...",,[orphanet:377792],"[orphanet:183763, orphanet:182079, orphanet:98...","[orphanet:183763, orphanet:182079, orphanet:98...","[Infantile spasms, Intellectual disability-hyp...",,,,


In [19]:
df_records = df.apply(lambda x:x.dropna().to_dict(), axis=1)
d = {record["_id"]: record for record in df_records}

In [20]:
d['orphanet:723']

{'_id': 'orphanet:723',
 'parents': ['orphanet:377788'],
 'part_of': ['orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['orphanet:163591']}

# Load in cross refs
## Rare diseases and cross-referencing
## Keeping xref as those with exact match (E) or BTNT (broad term -> narrow term)

http://www.orphadata.org/cgi-bin/inc/product1.inc.php

In [21]:
import xml.etree.ElementTree as et
tree = et.parse('/home/gstupp/projects/biothings/mydisease/mydisease/data/en_product1.xml')
root = tree.getroot()

In [22]:
id_replace = {"umls": "umls_cui",
 "icd-10": "icd10cm"}
for disease in root.find("DisorderList"):    
    name = disease.find("Name").text
    orpha = "orphanet:" + disease.find("OrphaNumber").text
    references = disease.findall("ExternalReferenceList/ExternalReference")
    mapping = defaultdict(list)
    xrefs = []
    for ref in references:
        source = ref.find("Source").text.lower()
        source = id_replace.get(source, source)
        reference = ref.find("Reference").text
        mapping_relation = ref.find("DisorderMappingRelation/Name").text.split(" ",1)[0]
        if source == "icd10cm":
            reference = reference.replace("-", "").replace("*","").replace("+","")
        xref = source + ":" + reference
        mapping[mapping_relation].append(xref)
        if mapping_relation in {'E', 'BTNT'}:
            xrefs.append(xref)
    xrefs = list2dict(xrefs)
    synonyms = [x.text for x in disease.findall("SynonymList/Synonym")]
    if orpha in d:
        d[orpha].update({'xref': xrefs, 'mapping': dict(mapping)})
    else:
        d[orpha] = {'preferred_label': name, 'synonyms': synonyms,
                   'xref': xrefs, 'mapping': dict(mapping), '_id': orpha}

In [24]:
d['orphanet:723']

{'_id': 'orphanet:723',
 'mapping': {'E': ['icd10cm:B59'], 'NTBT': ['icd10cm:J17.3']},
 'parents': ['orphanet:377788'],
 'part_of': ['orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['orphanet:163591'],
 'xref': {'icd10cm': ['B59']}}

In [25]:
d['orphanet:3451']

{'_id': 'orphanet:3451',
 'alternative_term': ['Infantile spasms',
  'Intellectual disability-hypsarrhythmia syndrome'],
 'mapping': {'BTNT': ['omim:300672',
   'omim:308350',
   'omim:613477',
   'omim:613722',
   'omim:615006',
   'omim:616139',
   'omim:616341',
   'omim:617065'],
  'E': ['meddra:10021750', 'umls_cui:C0037769'],
  'NTBT': ['icd10cm:G40.4']},
 'parents': ['orphanet:377792'],
 'part_of': ['orphanet:183763',
  'orphanet:182079',
  'orphanet:98258',
  'orphanet:102369'],
 'preferred_label': 'West syndrome',
 'synonyms': ['Infantile spasms',
  'Intellectual disability-hypsarrhythmia syndrome'],
 'tree_view': ['orphanet:183763',
  'orphanet:182079',
  'orphanet:98258',
  'orphanet:102369'],
 'xref': {'meddra': ['10021750'],
  'omim': ['300672',
   '308350',
   '613477',
   '613722',
   '615006',
   '616139',
   '616341',
   '617065'],
  'umls_cui': ['C0037769']}}

In [30]:
from pymongo import MongoClient
client = MongoClient()
db = client.mydisease.orphanet
db.drop()
db.insert_many(list(d.values()))

<pymongo.results.InsertManyResult at 0x7f54cf8e69d8>

In [31]:
db.find_one('orphanet:723')

{'_id': 'orphanet:723',
 'mapping': {'E': ['icd10cm:B59'], 'NTBT': ['icd10cm:J17.3']},
 'parents': ['orphanet:377788'],
 'part_of': ['orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['orphanet:163591'],
 'xref': {'icd10cm': ['B59']}}

In [28]:
from collections import Counter
from itertools import chain
Counter([x.split(":")[0] for x in chain(*[x.get('xref',[]) for x in d.values()])]).most_common(100)

[('omim', 4026),
 ('umls_cui', 2886),
 ('mesh', 1763),
 ('meddra', 1170),
 ('icd10cm', 747)]

In [29]:
# how many ICD10CM xrefs does each ORDO item have?
xrefs = list(db.find({'xref':{'$exists': True}},{'xref':1}))
Counter([[y.split(":")[0] for y in x['xref']].count("umls_cui") for x in xrefs])

Counter({0: 6513, 1: 2886})