In [161]:
# Orphanet Rare Disease Ontology (ORDO)
# http://www.orphadata.org/cgi-bin/inc/ordo_orphanet.inc.php
# http://bioportal.bioontology.org/ontologies/ORDO
# CSV export (requires a BioPortal API key -- supply your own, do not commit it):
# http://data.bioontology.org/ontologies/ORDO/download?apikey=<YOUR_BIOPORTAL_API_KEY>&download_format=csv

import json
import numpy as np
import pandas as pd
# Widen the text display so wide frames are not wrapped as early.
pd.set_option("display.width", 200)

# Maps the column headers of the ORDO CSV export to short snake_case names.
columns_rename = {
    # ontology property URIs
    "http://data.bioontology.org/metadata/obo/part_of": "part_of",
    "http://data.bioontology.org/metadata/treeView": "tree_view",
    "http://www.ebi.ac.uk/efo/alternative_term": "alternative_term",
    "http://www.ebi.ac.uk/efo/definition": "definition",
    "http://www.ebi.ac.uk/efo/definition_citation": "definition_citation",
    "http://www.ebi.ac.uk/efo/reason_for_obsolescence": "reason_for_obsolescence",
    "http://www.geneontology.org/formats/oboInOwl#hasDbXref": "xref",
    "http://www.orpha.net/ORDO/Orphanet_#symbol": "symbol",
    # plain-text column headers
    "Synonyms": "synonyms",
    "Obsolete": "obsolete",
    "Class ID": "id",
    "Preferred Label": "preferred_label",
    "Parents": "parents",
    "Definitions": "definitions",
}

In [163]:
from pymongo import MongoClient
# Create a client with pymongo's default connection settings, then bind `db`
# to the `orphanet` collection of the `mydisease` database.
# NOTE: despite its name, `db` is a collection handle, not a database.
client = MongoClient()
db = client["mydisease"]["orphanet"]

In [188]:
# Load the ORDO CSV export and normalise it into one row per class.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
df = pd.read_csv("/home/gstupp/projects/biothings/mydisease/mydisease/data/ORDO.csv.gz")

# Columns whose cells hold "|"-separated values; they are turned into lists below.
list_attribs = ['synonyms', 'parents', 'part_of', 'tree_view', 'alternative_term', 'xref']

# throw away columns that are all null
df = df.dropna(axis=1, how="all")

# Shorten the column names; index=str turns the row labels into strings.
df = df.rename(index=str, columns=columns_rename)

# Keep only rows that are not obsolete and have no symbol. The explicit .copy()
# makes the result an independent frame, so the column assignments below do
# not write into a slice of the unfiltered frame (chained-assignment warnings).
df = df[(~df.obsolete) & df.symbol.isnull()].copy()
df = df.drop(columns="obsolete")

# Normalise the MeSH prefix spelling; missing values stay missing.
# regex=False: every pattern here is a literal string, and the default of
# `regex` differs between pandas versions.
df["xref"] = df["xref"].str.replace("MeSH:", "MESH:", regex=False)

# Strip the ORDO URI prefix and turn "Orphanet_123" into "Orphanet:123".
for col in ['parents', 'part_of', 'tree_view', 'id']:
    df[col] = (
        df[col]
        .str.replace("http://www.orpha.net/ORDO/", "", regex=False)
        .str.replace("_", ":", regex=False)
    )

# Split the multi-valued columns into Python lists.
for col in list_attribs:
    df[col] = df[col].str.split("|")

# `_id` is the key used for the records that are stored further down.
df = df.rename(columns={'id': '_id'})
df.head()

Unnamed: 0,_id,preferred_label,synonyms,definitions,parents,part_of,tree_view,alternative_term,definition,definition_citation,reason_for_obsolescence,xref,symbol
0,Orphanet:1390,Night blindness-skeletal anomalies-dysmorphism...,[Hunter-Thompson-Reed syndrome],,[Orphanet:377789],"[Orphanet:102285, Orphanet:330206]","[Orphanet:102285, Orphanet:330206]",[Hunter-Thompson-Reed syndrome],,,,[ICD-10:Q87.8],
2,Orphanet:264694,Interstitial lung disease specific to infancy,[ILD specific to infancy],,"[Orphanet:377794, Orphanet:264665]",,,[ILD specific to infancy],,,,,
3,Orphanet:69735,Hypotrichosis-lymphedema-telangiectasia-renal ...,[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,[Orphanet:377788],"[Orphanet:89832, Orphanet:102373]","[Orphanet:89832, Orphanet:102373]",[Hypotrichosis-lymphedema-telangiectasia-membr...,Hypotrichosis - lymphedema - telangiectasia is...,orphanet,,"[OMIM:607823, OMIM:137940]",
4,Orphanet:454,Acquired ichthyosis,,,[Orphanet:377788],[Orphanet:79354],[Orphanet:79354],,,,,[ICD-10:L85.0],
5,Orphanet:3451,West syndrome,"[Infantile spasms, Intellectual disability-hyp...",,[Orphanet:377792],"[Orphanet:183763, Orphanet:182079, Orphanet:98...","[Orphanet:183763, Orphanet:182079, Orphanet:98...","[Infantile spasms, Intellectual disability-hyp...",,,,"[MedDRA:10021750, OMIM:616139, OMIM:613722, IC...",


In [189]:
# One dict per row, keeping only the fields that are not null in that row.
df_records = df.apply(lambda row: row.dropna().to_dict(), axis=1)
# Index the same record dicts by their `_id` for direct lookup.
d = dict((rec["_id"], rec) for rec in df_records)

In [190]:
# Spot-check the record at position 100. The index holds strings (see
# rename(index=str) above), so a bare integer key would rely on the deprecated
# positional fallback of Series.__getitem__; .iloc makes the positional
# lookup explicit and returns the same record.
df_records.iloc[100]

{'_id': 'Orphanet:329894',
 'parents': ['Orphanet:377788'],
 'part_of': ['Orphanet:329888'],
 'preferred_label': 'Juvenile overlap myositis',
 'tree_view': ['Orphanet:329888'],
 'xref': ['ICD-10:M33.0']}

In [191]:
# Spot-check: look up a single record in `d` by its `_id` key.
d['Orphanet:98306']

{'_id': 'Orphanet:98306',
 'alternative_term': ['FPLD'],
 'definition': 'Familial partial lipodystrophy (FPLD) is a group of rare genetic lipodystrophic syndromes characterized, in most cases, by fat loss from the limbs and buttocks, from childhood or early adulthood, and often associated with acanthosis nigricans, insulin resistance, diabetes, hypertriglyceridemia and liver steatosis.',
 'definition_citation': 'orphanet',
 'definitions': 'Familial partial lipodystrophy (FPLD) is a group of rare genetic lipodystrophic syndromes characterized, in most cases, by fat loss from the limbs and buttocks, from childhood or early adulthood, and often associated with acanthosis nigricans, insulin resistance, diabetes, hypertriglyceridemia and liver steatosis.',
 'parents': ['Orphanet:98305', 'Orphanet:377794'],
 'preferred_label': 'Familial partial lipodystrophy',
 'synonyms': ['FPLD'],
 'xref': ['UMLS:C0271694', 'ICD-10:E88.1', 'MESH:D052496']}

In [193]:
# Bulk-insert every record dict into the collection bound to `db`.
# NOTE(review): each record carries its own `_id`, so re-running this cell
# against an already-populated collection is expected to fail on duplicate keys.
db.insert_many(df_records.tolist())

<pymongo.results.InsertManyResult at 0x7f2230dcbf30>

In [194]:
# Read one document back by `_id` to confirm the insert round-trips.
db.find_one({"_id": 'Orphanet:98306'})

{'_id': 'Orphanet:98306',
 'alternative_term': ['FPLD'],
 'definition': 'Familial partial lipodystrophy (FPLD) is a group of rare genetic lipodystrophic syndromes characterized, in most cases, by fat loss from the limbs and buttocks, from childhood or early adulthood, and often associated with acanthosis nigricans, insulin resistance, diabetes, hypertriglyceridemia and liver steatosis.',
 'definition_citation': 'orphanet',
 'definitions': 'Familial partial lipodystrophy (FPLD) is a group of rare genetic lipodystrophic syndromes characterized, in most cases, by fat loss from the limbs and buttocks, from childhood or early adulthood, and often associated with acanthosis nigricans, insulin resistance, diabetes, hypertriglyceridemia and liver steatosis.',
 'parents': ['Orphanet:98305', 'Orphanet:377794'],
 'preferred_label': 'Familial partial lipodystrophy',
 'synonyms': ['FPLD'],
 'xref': ['UMLS:C0271694', 'ICD-10:E88.1', 'MESH:D052496']}

In [195]:
from collections import Counter
from itertools import chain
# Count xrefs per source prefix (the text before the first ":"),
# across all records; records without an `xref` field contribute nothing.
Counter(
    xref.split(":")[0]
    for xref in chain.from_iterable(rec.get('xref', []) for rec in d.values())
).most_common(100)

[('ICD-10', 7858),
 ('OMIM', 6677),
 ('UMLS', 3064),
 ('MESH', 1787),
 ('MedDRA', 1212)]