In [1]:
from pymongo import MongoClient
# Connect using pymongo's defaults (no host/port arguments are passed).
client = MongoClient()
# NOTE: despite its name, `db` is used as a *collection* handle below
# (update_one / find / find_one / count are called on it): the `ctdbase`
# collection inside the `mydisease` database.
db = client.mydisease.ctdbase

In [2]:
import gzip
import json
import os
from collections import defaultdict
from tqdm import tqdm
import pandas as pd

from mydisease import DATA_DIR
from mydisease.dataload.ctdbase.download_raw_data import relationships
# Widen pandas' text display so wide frames are not wrapped at a narrow width.
pd.set_option('display.width', 1000)

In [3]:
def parse_diseaseid(did: "str | int"):
    """
    Normalise a CTD 'DiseaseID' value so that it carries its identifier prefix.

    The 'DiseaseID' column sometimes starts with the identifier prefix and
    sometimes doesn't. Known prefixes are {'MESH:', 'OMIM:'}.
    An unprefixed ID starting with 'C' or 'D' is MESH; an all-digit ID is OMIM.

    :param did: raw identifier. Normally a string; an integer is accepted too,
        because pandas can yield integers when a column (or chunk) contains
        only numeric IDs.
    :return: the identifier with its 'MESH:' / 'OMIM:' prefix. A value that
        cannot be classified (unknown pattern, or a non-string such as NaN for
        a missing cell) is returned unchanged after printing a warning.
    """
    import numbers

    # Integer IDs (python or numpy ints) are treated like their digit string.
    if isinstance(did, numbers.Integral) and not isinstance(did, bool):
        did = str(did)
    if not isinstance(did, str):
        # Nothing sensible to prefix; warn instead of crashing on .startswith
        print("warning: " + str(did))
        return did
    if did.startswith(("OMIM:", "MESH:")):
        return did
    if did.startswith(("C", "D")):
        return 'MESH:' + did
    if did.isdigit():
        return "OMIM:" + did
    print("warning: " + did)
    return did

In [4]:
def parse_csv_to_df(f):
    """
    Read one CTD comma-separated report into a DataFrame.

    The report begins with '#'-prefixed comment lines. The column names are
    taken from the comment line that directly follows the line starting with
    "# Fields:".

    :param f: iterator over text lines that pandas.read_csv can also read from
        (e.g. a file opened in text mode), positioned at the start of the report
    :return: DataFrame with the report's columns. Pipe-delimited columns
        ('DirectEvidence', 'OmimIDs', 'PubMedIDs', 'InferenceGeneSymbols',
        where present) hold lists of strings, with missing values left as NaN.
        'DiseaseID' is passed through parse_diseaseid.
    :raises StopIteration: if the input ends before a "# Fields:" line
        (and the header line after it) is found
    """
    line = next(f)
    while not line.startswith("# Fields:"):
        line = next(f)
    # parse the column headers from the comments
    fields = next(f)[1:].strip().split(",")
    # header=None + names: every remaining non-comment line is data. Letting
    # pandas infer the header would consume the first record as column names,
    # silently dropping it once the real names are assigned.
    df = pd.read_csv(f, delimiter=",", comment="#", header=None, names=fields)

    # split pipe-delimited fields
    fields_split = {'DirectEvidence', 'OmimIDs', 'PubMedIDs', 'InferenceGeneSymbols'} & set(fields)
    for field in fields_split:
        # Rebuild the whole column instead of chained assignment
        # (df[field][idx] = ...), which triggers SettingWithCopyWarning and is
        # not guaranteed to write through. na_action="ignore" keeps NaN as NaN.
        df[field] = df[field].map(lambda value: str(value).split("|"), na_action="ignore")

    df['DiseaseID'] = df['DiseaseID'].map(parse_diseaseid)
    return df

In [5]:
def get_columns_to_keep(relationship):
    """
    Return the column names that are stored in mongodb for a relationship.

    :param relationship: one of 'GO_BP', 'GO_CC', 'GO_MF', 'pathways',
        'chemicals' or 'genes'
    :return: a new list of column names for that relationship
    :raises ValueError: if the relationship is not one of the names above
        (previously this surfaced as an UnboundLocalError)
    """
    if relationship in {'GO_BP', 'GO_CC', 'GO_MF'}:
        return ['GOID', 'InferenceGeneSymbols']
    if relationship == "pathways":
        return ['PathwayID', 'InferenceGeneSymbol']
    if relationship == "chemicals":
        return ['CasRN', 'ChemicalID', 'DirectEvidence', 'InferenceGeneSymbol',
                'InferenceScore', 'OmimIDs', 'PubMedIDs']
    if relationship == "genes":
        return ['GeneID', 'DirectEvidence', 'InferenceScore', 'InferenceChemicalName',
                'OmimIDs', 'PubMedIDs']
    raise ValueError("unknown relationship: {!r}".format(relationship))

In [6]:
def parse_df(db, df, relationship):
    """
    Write the rows of df to mongodb (db), grouped per disease.

    For every distinct DiseaseID the kept columns of its rows are turned into
    a list of records (null values dropped) and stored under the
    `relationship` key of the document whose _id is that DiseaseID. The
    document is created when it does not exist yet.
    """
    wanted_columns = get_columns_to_keep(relationship)
    n_diseases = len(set(df.DiseaseID))
    for disease_id, disease_rows in tqdm(df.groupby("DiseaseID"), total=n_diseases):
        records = []
        for row in disease_rows[wanted_columns].to_dict(orient="records"):
            # NaN is not equal to itself, so `value == value` filters out nulls
            records.append({key: value for key, value in row.items() if value == value})
        db.update_one({'_id': disease_id}, {'$set': {relationship: records}}, upsert=True)

In [7]:
def process_genes(db, f):
    """
    Load the genes file into mongodb (db) in chunks.

    The genes file is enormous, so it is handled like the other relationship
    files but read `chunksize` rows at a time. For each chunk, the records of
    every disease are appended ($push / $each) to the 'genes' list of that
    disease's document, which is created if it does not exist yet.

    note: this will fail
    WriteError: Resulting document after update is larger than 16777216

    :param db: object providing pymongo's ``update_one`` (used as a collection)
    :param f: file-like object / buffer that pandas.read_csv can read from
    """
    # Bind the key locally; the original read a global named `relationship`
    # that only existed when called from inside the driver loop.
    relationship = 'genes'
    chunksize = 100000
    # Presumably the number of rows in the genes file when this was written;
    # it is only used to size the progress bar. TODO confirm.
    n_rows = 49867785
    names = ['GeneSymbol', 'GeneID', 'DiseaseName', 'DiseaseID', 'DirectEvidence',
             'InferenceChemicalName', 'InferenceScore', 'OmimIDs', 'PubMedIDs']
    fields_split = ['DirectEvidence', 'OmimIDs', 'PubMedIDs']
    columns_keep = get_columns_to_keep(relationship)

    chunks = pd.read_csv(f, delimiter=",", comment="#", header=None, chunksize=chunksize,
                         low_memory=False, names=names)
    # ceil division so the progress bar total is a whole number of chunks
    for df in tqdm(chunks, total=-(-n_rows // chunksize)):
        for field in fields_split:
            # Rebuild the column instead of chained assignment
            # (df[field][idx] = ...), which is not guaranteed to write through.
            # na_action="ignore" leaves NaN untouched.
            df[field] = df[field].map(lambda value: str(value).split("|"), na_action="ignore")
        df['DiseaseID'] = df['DiseaseID'].map(parse_diseaseid)
        for diseaseID, subdf in df.groupby("DiseaseID"):
            sub = subdf[columns_keep].to_dict(orient="records")
            # get rid of nulls (NaN != NaN)
            sub = [{k: v for k, v in s.items() if v == v} for s in sub]

            db.update_one({'_id': diseaseID}, {'$push': {relationship: {'$each': sub}}}, upsert=True)

In [8]:
# Load every relationship file into mongodb, one relationship at a time.
for relationship, file_path in relationships.items():
    print(relationship)
    with gzip.open(os.path.join(DATA_DIR, file_path), 'rt', encoding='utf-8') as f:
        if relationship != "genes":
            df = parse_csv_to_df(f)
            parse_df(db, df, relationship)
        # "genes" is intentionally not loaded: process_genes(db, f) fails with
        # "WriteError: Resulting document after update is larger than 16777216"

chemicals


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100%|██████████| 5735/5735 [01:49<00:00, 52.57it/s] 


GO_CC


100%|██████████| 4297/4297 [00:11<00:00, 364.89it/s]


GO_BP


100%|██████████| 4298/4298 [00:14<00:00, 294.16it/s]


GO_MF


100%|██████████| 4281/4281 [00:09<00:00, 428.78it/s]


pathways


100%|██████████| 3556/3556 [00:08<00:00, 411.15it/s]
  0%|          | 0/498.67785 [00:00<?, ?it/s]

genes


  1%|          | 5/498.67785 [01:09<1:55:01, 13.98s/it]


WriteError: Resulting document after update is larger than 16777216

In [9]:
# Number of documents currently in the collection.
# NOTE(review): Collection.count() is deprecated in newer PyMongo releases;
# count_documents({}) is the replacement — confirm against the installed version.
db.count()

5774

In [10]:
# Fetch one document that has a 'genes' field, to inspect what (if anything)
# was written for genes.
doc = db.find_one({'genes':{'$exists':True}})

In [12]:
from collections import Counter
# Tally the documents by the prefix of their _id (the text before the first ":").
Counter(record['_id'].split(":")[0] for record in db.find({}, {'_id': 1}))

Counter({'MESH': 5390, 'OMIM': 384})