In [1]:
import os

import pandas as pd

In [2]:
# --- OMIM download configuration -------------------------------------------
# The path segment after /downloads/ looks like a per-user download key
# (TODO confirm with whoever registered for OMIM access). It is kept in ONE
# place and can be overridden with the OMIM_DOWNLOAD_KEY environment variable
# so the notebook can be shared without editing every URL. The literal
# fallback preserves the previous behaviour; drop it before publishing.
OMIM_DOWNLOAD_KEY = os.environ.get("OMIM_DOWNLOAD_KEY", "YusepqJtQDuqSPctv6tmVQ")
OMIM_DOWNLOAD_BASE = "http://data.omim.org/downloads/" + OMIM_DOWNLOAD_KEY

mimTitles_url = OMIM_DOWNLOAD_BASE + "/mimTitles.txt"
geneMap_url = OMIM_DOWNLOAD_BASE + "/genemap.txt"
geneMap2_url = OMIM_DOWNLOAD_BASE + "/genemap2.txt"
morbidMap_url = OMIM_DOWNLOAD_BASE + "/morbidmap.txt"

# Column headers for each tab-separated dump, in file order. They are passed
# to pandas as `names=` when the corresponding file is read. Written as
# explicit lists so the headers do not depend on invisible tab characters
# surviving copy/paste.
mimTitles_names = [
    "Prefix",
    "Mim Number",
    "Preferred Title; symbol",
    "Alternative Title(s); symbol(s)",
    "Included Title(s); symbols",
]
geneMap_names = [
    "Sort",
    "Month",
    "Day",
    "Year",
    "Cyto Location",
    "Gene Symbols",
    "Confidence",
    "Gene Name",
    "MIM Number",
    "Mapping Method",
    "Comments",
    "Phenotypes",
    "Mouse Gene Symbol",
]
geneMap2_names = [
    "Chromosome",
    "Genomic Position Start",
    "Genomic Position End",
    "Cyto Location",
    "Computed Cyto Location",
    "Mim Number",
    "Gene Symbols",
    "Gene Name",
    "Approved Symbol",
    "Entrez Gene ID",
    "Ensembl Gene ID",
    "Comments",
    "Phenotypes",
    "Mouse Gene Symbol/ID",
]
morbidMap_names = [
    "Phenotype",
    "Gene Symbols",
    "MIM Number",
    "Cyto Location",
]

#### Note: the contents of geneMap and morbidMap both appear to be included in geneMap2

### mimTitles

In [27]:
# Load the OMIM title table. The file is tab-separated and carries no usable
# header row, so column names are supplied explicitly; anything from a '#'
# to the end of a line is treated as a comment by pandas and ignored.
mimTitles = pd.read_csv(
    mimTitles_url,
    names=mimTitles_names,
    comment='#',
    sep='\t',
)
# MIM numbers are identifiers, not quantities: keep them as strings.
mimTitles = mimTitles.astype({'Mim Number': str})
mimTitles.head()

Unnamed: 0,Prefix,Mim Number,Preferred Title; symbol,Alternative Title(s); symbol(s),Included Title(s); symbols
0,,100050,"AARSKOG SYNDROME, AUTOSOMAL DOMINANT",,
1,Percent,100070,"AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1; AAA1","ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AO...",
2,Number Sign,100100,PRUNE BELLY SYNDROME; PBS,"ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TR...",
3,,100200,ABDUCENS PALSY,,
4,Number Sign,100300,ADAMS-OLIVER SYNDROME 1; AOS1,"AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKUL...","APLASIA CUTIS CONGENITA, CONGENITAL HEART DEFE..."


In [28]:
# Fix prefix: replace OMIM's prefix name with a readable entry type.
# Rows without a prefix are missing (NaN) in the loaded frame, so they are
# filled with the sentinel 0, which is itself a key of the lookup table.
prefix = {
    "Asterisk": "Gene description",
    "Plus": "Gene and phenotype, combined",
    "Number Sign": "Phenotype description, molecular basis known",
    "Percent": "Phenotype description or locus, molecular basis unknown",
    "Caret": "Obsolete",
    0: "Other, mainly phenotypes with suspected mendelian basis",
}
prefix_or_zero = mimTitles['Prefix'].fillna(0)
# dict.get yields None for any prefix that is not in the table.
mimTitles['type'] = prefix_or_zero.apply(lambda code: prefix.get(code))
# The raw column is no longer needed once `type` exists.
mimTitles.drop('Prefix', axis=1, inplace=True)

In [29]:
# Fix Preferred Title; symbol: break the combined column into `title` and
# `symbol`, cutting each value at its first ';' only.
def _title_and_symbol(text):
    """Split ``text`` at the first ';' and return the pieces as a Series."""
    return pd.Series(text.split(';', 1))


pts = mimTitles['Preferred Title; symbol'].apply(_title_and_symbol)
pts.columns = ['title', 'symbol']

# Attach the two new columns, then retire the combined one.
mimTitles = pd.concat([mimTitles, pts], axis=1)
mimTitles.drop('Preferred Title; symbol', axis=1, inplace=True)

# Remove the whitespace left around the ';' separator.
for column in ('title', 'symbol'):
    mimTitles[column] = mimTitles[column].str.strip()

In [30]:
# Fix OMIM ID column: the MIM number becomes the record key `_id`,
# namespaced with an "OMIM:" prefix.
mimTitles = mimTitles.rename(columns={'Mim Number': "_id"})
mimTitles['_id'] = ["OMIM:" + mim_number for mim_number in mimTitles['_id']]

In [31]:
# Preview the first five rows of the reshaped table.
mimTitles.head(n=5)

Unnamed: 0,_id,Alternative Title(s); symbol(s),Included Title(s); symbols,type,title,symbol
0,OMIM:100050,,,"Other, mainly phenotypes with suspected mendel...","AARSKOG SYNDROME, AUTOSOMAL DOMINANT",
1,OMIM:100070,"ANEURYSM, ABDOMINAL AORTIC; AAA;; ABDOMINAL AO...",,"Phenotype description or locus, molecular basi...","AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1",AAA1
2,OMIM:100100,"ABDOMINAL MUSCLES, ABSENCE OF, WITH URINARY TR...",,"Phenotype description, molecular basis known",PRUNE BELLY SYNDROME,PBS
3,OMIM:100200,,,"Other, mainly phenotypes with suspected mendel...",ABDUCENS PALSY,
4,OMIM:100300,"AOS;; ABSENCE DEFECT OF LIMBS, SCALP, AND SKUL...","APLASIA CUTIS CONGENITA, CONGENITAL HEART DEFE...","Phenotype description, molecular basis known",ADAMS-OLIVER SYNDROME 1,AOS1


In [32]:
def _without_nan(row):
    """Return a copy of ``row`` without the entries whose value is NaN.

    NaN is the only value that is not equal to itself, so ``value == value``
    keeps everything else.
    """
    return {field: value for field, value in row.items() if value == value}


# One plain dict per table row (column name -> cell value), missing cells removed.
mimTitles_records = [_without_nan(row) for row in mimTitles.to_dict('records')]
mimTitles_records[0]

{'_id': 'OMIM:100050',
 'title': 'AARSKOG SYNDROME, AUTOSOMAL DOMINANT',
 'type': 'Other, mainly phenotypes with suspected mendelian basis'}

In [33]:
# Fix Alternative Title(s); symbol(s) and Included Title(s); symbols
def parse_title_list(raw):
    """Parse an OMIM multi-title string into a list of title/symbol dicts.

    Entries in ``raw`` are separated by ';;'. Within an entry, the text
    before the first ';' is the title and everything after it is the symbol.

    Parameters
    ----------
    raw : str
        Cell value such as ``"TITLE A; SYM;; TITLE B"``.

    Returns
    -------
    list of dict
        One dict per entry, always with a ``'title'`` key and with a
        ``'symbol'`` key only when a non-empty symbol is present. Fragments
        that are blank (for example after a trailing ';;') are skipped.
    """
    entries = []
    for fragment in raw.split(";;"):
        # partition() splits once, at the first ';' only.
        title, _, symbol = fragment.partition(";")
        title, symbol = title.strip(), symbol.strip()
        if not title and not symbol:
            continue
        entry = {'title': title}
        if symbol:
            entry['symbol'] = symbol
        entries.append(entry)
    return entries


# (raw column name, key of the structured list that replaces it)
TITLE_FIELDS = (
    ('Alternative Title(s); symbol(s)', 'alternative'),
    ('Included Title(s); symbols', 'included'),
)

for record in mimTitles_records:
    for source_key, target_key in TITLE_FIELDS:
        # The raw key is absent when the cell was empty; it is also absent
        # once converted, so re-running this cell is a no-op.
        if source_key in record:
            record[target_key] = parse_title_list(record.pop(source_key))


In [35]:
# Spot-check a record that has both alternative and included titles.
[rec for rec in mimTitles_records
 if all(field in rec for field in ('alternative', 'included'))][8]

{'_id': 'OMIM:102980',
 'alternative': [{'symbol': 'PACAP',
   'title': 'PITUITARY ADENYLATE CYCLASE-ACTIVATING POLYPEPTIDE'}],
 'included': [{'title': 'PACAP38, INCLUDED'},
  {'title': 'PACAP27, INCLUDED'},
  {'symbol': 'PRP, INCLUDED', 'title': 'PACAP-RELATED PEPTIDE, INCLUDED'}],
 'symbol': 'ADCYAP1',
 'title': 'ADENYLATE CYCLASE-ACTIVATING POLYPEPTIDE 1',
 'type': 'Gene description'}

In [36]:
from pymongo import MongoClient

# Connect with the driver's default settings and rebuild the `omim`
# collection of the `mydisease` database from the parsed records.
client = MongoClient()
db = client["mydisease"]["omim"]  # NB: despite its name this is a collection handle
db.drop()  # start from an empty collection so a re-run does not collide on _id
db.insert_many(mimTitles_records)

<pymongo.results.InsertManyResult at 0x7f85395f0d80>

### geneMap2

In [None]:
# Load the geneMap2 table: tab-separated, explicit column names, and text
# from a '#' to the end of a line is treated as a comment by pandas.
geneMap2 = pd.read_csv(
    geneMap2_url,
    names=geneMap2_names,
    comment='#',
    sep='\t',
)
geneMap2.head()

In [None]:
# Look at one raw Phenotypes value (row label 21) to see the notation used.
geneMap2.loc[21, 'Phenotypes']

What do brackets [ ], braces { }, a question mark (?), and the numbers (1)(2)(3)(4) mean in the Disorder column of the Gene Map?

Brackets, "[ ]", indicate "nondiseases," mainly genetic variations that lead to apparently abnormal laboratory test values (e.g., dysalbuminemic euthyroidal hyperthyroxinemia).

Braces, "{ }", indicate mutations that contribute to susceptibility to multifactorial disorders (e.g., diabetes, asthma) or to susceptibility to infection (e.g., malaria).

A question mark, "?", before the disease name indicates an unconfirmed or possibly spurious mapping.

The number in parentheses after the name of each disorder indicates the following: (1) the disorder was positioned by mapping of the wildtype gene; (2) the disease phenotype itself was mapped; (3) the molecular basis of the disorder is known; (4) the disorder is a chromosome deletion or duplication syndrome. (On the OMIM website, moving the cursor over the number displays this information.)