## Orphanet Rare Disease Ontology (ORDO)
http://www.orphadata.org/cgi-bin/inc/ordo_orphanet.inc.php

http://bioportal.bioontology.org/ontologies/ORDO

http://data.bioontology.org/ontologies/ORDO/download?apikey=YOUR_API_KEY&download_format=csv

In [1]:
import json
import numpy as np
from collections import defaultdict
import pandas as pd
# Widen pandas' console display width to 200 characters.
pd.set_option("display.width", 200)

In [2]:
# Map the verbose BioPortal CSV headers (mostly URIs) to short snake_case names.
columns_rename = {
    "http://data.bioontology.org/metadata/obo/part_of": "part_of",
    "http://data.bioontology.org/metadata/treeView": "tree_view",
    "http://www.ebi.ac.uk/efo/alternative_term": "alternative_term",
    "http://www.ebi.ac.uk/efo/definition": "definition",
    "http://www.ebi.ac.uk/efo/definition_citation": "definition_citation",
    "http://www.ebi.ac.uk/efo/reason_for_obsolescence": "reason_for_obsolescence",
    "http://www.geneontology.org/formats/oboInOwl#hasDbXref": "xref",
    "http://www.orpha.net/ORDO/Orphanet_#symbol": "symbol",
    "Synonyms": "synonyms",
    "Obsolete": "obsolete",
    "Class ID": "id",
    "Preferred Label": "preferred_label",
    "Parents": "parents",
    "Definitions": "definitions",
}

# NOTE(review): hard-coded absolute path; consider a configurable data directory.
df = pd.read_csv("/home/gstupp/projects/biothings/mydisease/mydisease/data/ORDO.csv.gz")

# The CSV's own xref column is discarded; the 'xref' field is filled in later
# from the Orphanet product1 XML.
del df["http://www.geneontology.org/formats/oboInOwl#hasDbXref"]

# throw away columns that are all null
df = df.loc[:, df.notnull().any(axis=0)]
df = df.rename(index=str, columns=columns_rename)

# Keep only non-obsolete rows that have no symbol
# (presumably rows with a symbol are genes rather than disorders -- TODO confirm).
# The explicit .copy() makes the column assignments below operate on an
# independent frame rather than on a slice of the raw one.
df = df[~df.obsolete]
df = df[df.symbol.isnull()].copy()
del df['obsolete']

# Turn ORDO URIs into CURIE-style IDs, e.g.
# "http://www.orpha.net/ORDO/Orphanet_1390" -> "Orphanet:1390".
# regex=False forces literal matching: otherwise the dots in the URL are regex
# wildcards on pandas < 2.0, and the behaviour depends on the pandas version.
for col in ['parents', 'part_of', 'tree_view', 'id']:
    df[col] = (df[col]
               .str.replace("http://www.orpha.net/ORDO/", "", regex=False)
               .str.replace("_", ":", regex=False))

# These columns hold "|"-separated values; split them into Python lists.
list_attribs = ['synonyms', 'parents', 'part_of', 'tree_view', 'alternative_term']
for col in list_attribs:
    df[col] = df[col].str.split("|")

# '_id' is the key used for the records dict and the MongoDB documents.
df = df.rename(columns={'id': '_id'})
df.head()

Unnamed: 0,_id,preferred_label,synonyms,definitions,parents,part_of,tree_view,alternative_term,definition,definition_citation,reason_for_obsolescence,symbol
0,Orphanet:1390,Night blindness-skeletal anomalies-dysmorphism...,[Hunter-Thompson-Reed syndrome],,[Orphanet:377789],"[Orphanet:102285, Orphanet:330206]","[Orphanet:102285, Orphanet:330206]",[Hunter-Thompson-Reed syndrome],,,,
2,Orphanet:264694,Interstitial lung disease specific to infancy,[ILD specific to infancy],,"[Orphanet:377794, Orphanet:264665]",,,[ILD specific to infancy],,,,
3,Orphanet:69735,Hypotrichosis-lymphedema-telangiectasia-renal ...,[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,[Orphanet:377788],"[Orphanet:89832, Orphanet:102373]","[Orphanet:89832, Orphanet:102373]",[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,orphanet,,
4,Orphanet:454,Acquired ichthyosis,,,[Orphanet:377788],[Orphanet:79354],[Orphanet:79354],,,,,
5,Orphanet:3451,West syndrome,"[Infantile spasms, Intellectual disability-hyp...",,[Orphanet:377792],"[Orphanet:183763, Orphanet:182079, Orphanet:98...","[Orphanet:183763, Orphanet:182079, Orphanet:98...","[Infantile spasms, Intellectual disability-hyp...",,,,


In [3]:
# Convert every row into a sparse dict (null fields are left out), then index
# the resulting records by their Orphanet ID.
df_records = df.apply(lambda row: row.dropna().to_dict(), axis=1)
d = {}
for record in df_records:
    d[record["_id"]] = record

In [4]:
# Spot-check one record built from the ORDO CSV.
d['Orphanet:723']

{'_id': 'Orphanet:723',
 'parents': ['Orphanet:377788'],
 'part_of': ['Orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['Orphanet:163591']}

# Load in cross refs
## Rare diseases and cross-referencing
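Only mappings flagged as an exact match (E) or BTNT (the ORPHA code's broader term maps to a narrower term) are kept in `xref`; every mapping, whatever its relation, is kept in `mapping`.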

http://www.orphadata.org/cgi-bin/inc/product1.inc.php

In [5]:
# Parse the Orphanet product1 XML export and keep a handle on its root element.
from xml.etree import ElementTree as et
tree = et.parse('/home/gstupp/projects/biothings/mydisease/mydisease/data/en_product1.xml')
root = tree.getroot()

In [6]:
# Orphanet source names -> the prefixes used in our xref identifiers.
id_replace = {"MeSH": "MESH", "UMLS": "UMLS_CUI", "ICD-10": "ICD10CM"}
# Mapping relations whose targets are copied into the flat 'xref' list.
xref_relations = ('E', 'BTNT')
# Characters removed from ICD-10 references.
icd10_strip = str.maketrans("", "", "-*+")

for disease in root.find("DisorderList"):
    name = disease.find("Name").text
    orpha = "Orphanet:" + disease.find("OrphaNumber").text

    mapping = {}  # relation code -> every xref carrying that relation
    xrefs = []    # only the xrefs whose relation is in xref_relations
    for ref in disease.findall("ExternalReferenceList/ExternalReference"):
        raw_source = ref.find("Source").text
        source = id_replace.get(raw_source, raw_source)
        reference = ref.find("Reference").text
        # The relation name starts with a short code followed by a space;
        # only that leading code is kept.
        mapping_relation = ref.find("DisorderMappingRelation/Name").text.partition(" ")[0]
        if source == "ICD10CM":
            reference = reference.translate(icd10_strip)
        xref = ":".join((source, reference))
        mapping.setdefault(mapping_relation, []).append(xref)
        if mapping_relation in xref_relations:
            xrefs.append(xref)

    if orpha not in d:
        # Disorder missing from the ORDO-derived records: build a minimal
        # record from the XML alone.
        d[orpha] = {'preferred_label': name,
                    'synonyms': [syn.text for syn in disease.findall("SynonymList/Synonym")],
                    'xref': xrefs, 'mapping': mapping, '_id': orpha}
        continue
    # Existing record: attach the cross-references to it.
    d[orpha].update(xref=xrefs, mapping=mapping)

In [7]:
# Re-inspect the same record after the cross-reference pass added 'xref' and 'mapping'.
d['Orphanet:723']

{'_id': 'Orphanet:723',
 'mapping': {'E': ['ICD10CM:B59'], 'NTBT': ['ICD10CM:J17.3']},
 'parents': ['Orphanet:377788'],
 'part_of': ['Orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['Orphanet:163591'],
 'xref': ['ICD10CM:B59']}

In [8]:
# Inspect a second record (West syndrome in the table above).
d['Orphanet:3451']

{'_id': 'Orphanet:3451',
 'alternative_term': ['Infantile spasms',
  'Intellectual disability-hypsarrhythmia syndrome'],
 'mapping': {'BTNT': ['OMIM:300672',
   'OMIM:308350',
   'OMIM:613477',
   'OMIM:613722',
   'OMIM:615006',
   'OMIM:616139',
   'OMIM:616341',
   'OMIM:617065'],
  'E': ['MedDRA:10021750', 'UMLS_CUI:C0037769'],
  'NTBT': ['ICD10CM:G40.4']},
 'parents': ['Orphanet:377792'],
 'part_of': ['Orphanet:183763',
  'Orphanet:182079',
  'Orphanet:98258',
  'Orphanet:102369'],
 'preferred_label': 'West syndrome',
 'synonyms': ['Infantile spasms',
  'Intellectual disability-hypsarrhythmia syndrome'],
 'tree_view': ['Orphanet:183763',
  'Orphanet:182079',
  'Orphanet:98258',
  'Orphanet:102369'],
 'xref': ['MedDRA:10021750',
  'OMIM:300672',
  'OMIM:308350',
  'OMIM:613477',
  'OMIM:613722',
  'OMIM:615006',
  'OMIM:616139',
  'OMIM:616341',
  'UMLS_CUI:C0037769',
  'OMIM:617065']}

In [9]:
from pymongo import MongoClient

# Connect with pymongo's default connection settings.
client = MongoClient()
# NOTE: despite its name, `db` is the `orphanet` collection of the `mydisease` database.
db = client["mydisease"]["orphanet"]
# Drop whatever the collection currently holds, then bulk-insert every record.
db.drop()
db.insert_many([*d.values()])

<pymongo.results.InsertManyResult at 0x7fa9839a61b0>

In [10]:
# Read one document back from the collection by its _id.
db.find_one('Orphanet:723')

{'_id': 'Orphanet:723',
 'mapping': {'E': ['ICD10CM:B59'], 'NTBT': ['ICD10CM:J17.3']},
 'parents': ['Orphanet:377788'],
 'part_of': ['Orphanet:163591'],
 'preferred_label': 'Pneumocystosis',
 'tree_view': ['Orphanet:163591'],
 'xref': ['ICD10CM:B59']}

In [11]:
from collections import Counter
from itertools import chain

# Tally the kept cross-references by source prefix (the part before the first ':').
all_xrefs = chain.from_iterable(record.get('xref', []) for record in d.values())
Counter(xref.split(":")[0] for xref in all_xrefs).most_common(100)

[('OMIM', 6384),
 ('UMLS_CUI', 3159),
 ('MESH', 1843),
 ('ICD10CM', 1540),
 ('MedDRA', 1227)]

In [12]:
# How many UMLS_CUI xrefs does each ORDO item have?
# The result maps "number of UMLS_CUI xrefs on a document" -> "number of documents".
# Change `prefix` to run the same check for another source, e.g. "ICD10CM".
prefix = "UMLS_CUI"
# Fetch only the 'xref' field of every document that has one.
xrefs = list(db.find({'xref': {'$exists': True}}, {'xref': 1}))
Counter(sum(y.split(":")[0] == prefix for y in x['xref']) for x in xrefs)

Counter({0: 6513, 1: 2621, 2: 259, 3: 5, 5: 1})