In [1]:
%%capture 
from tqdm import notebook 
notebook.tqdm().pandas()

import requests
import json
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from alchemy.text_processing import remove_notes
from fuzzywuzzy import fuzz,process
from ftfy import fix_text
import ast


user_directory = 'X:'

In [2]:
import re

def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

print('All 3-grams in "McDonalds":')
ngrams('McDonalds')

All 3-grams in "McDonalds":


['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']

In [3]:
response = requests.get("http://purl.obolibrary.org/obo/doid.obo")
soup = BeautifulSoup(response.content)

txt = response.text
split_txt = txt.split('[Term]')

term_df = pd.DataFrame()
other_labels = []
error_log = []

In [11]:
term_content = split_txt[3]
term_content = term_content.split('\n')
term_content

['',
 'id: DOID:0014667',
 'name: disease of metabolism',
 'def: "A disease that involving errors in metabolic processes of building or degradation of molecules." [url:http\\://www.ncbi.nlm.nih.gov/books/NBK22259/]',
 'subset: DO_AGR_slim',
 'subset: DO_CFDE_slim',
 'subset: DO_GXD_slim',
 'subset: NCIthesaurus',
 'synonym: "metabolic disease" EXACT []',
 'xref: ICD10CM:E88.9',
 'xref: ICD9CM:277.9',
 'xref: MESH:D008659',
 'xref: NCI:C3235',
 'xref: SNOMEDCT_US_2021_09_01:75934005',
 'xref: UMLS_CUI:C0025517',
 'is_a: DOID:4 ! disease',
 '',
 '']

In [13]:

for term_content in notebook.tqdm(split_txt[1:]):
    term_content = term_content.split('\n')
    while "" in term_content:
        term_content.remove('')

    term_id = np.nan
    term_name = np.nan
    term_relation = np.nan
    term_reference = []
    term_subset = []
    term_synonym = []
    term_def = np.nan
    term_alt_id = []
    term_obsolete = 'false'

    for item in term_content:
        if 'id:' == item[:3]:
            if "results_in_formation_of" in item:
                error_log += [term_content]
            term_id = item.replace('id:','').strip()
        elif 'name:' == item[:5]:
            term_name = item.replace('name:','').strip()
        elif 'alt_id:' == item[:7]:
            term_alt_id += [item.replace('alt_id:', '').strip()]


        elif 'is_a:' == item[:5]:
            term_relation = item.replace('is_a:','').strip()
            split_term_relation = term_relation.split('!')
            split_term_relation = [string.strip() for string in split_term_relation]
            term_relation = (split_term_relation[0], split_term_relation[1])
        elif 'xref:' == item[:5]:
            term_reference += [item.replace('xref:', '').strip()]
        elif 'synonym:' == item[:8]:
            term_synonym += [item.replace('synonym:', '').strip()]
        elif 'subset:' == item[:7]:
            term_subset += [item.replace('subset:', '').strip()] 
        elif 'def:' == item[:4]:
            term_def = item.replace('def:','').strip()

        elif 'is_obsolete:' == item[:12]:
            term_obsolete = item.replace('is_obsolete:','').strip()

        else:
            other_labels += [item.split(':')[0]]
            other_labels = list(set(other_labels))

    df = pd.DataFrame([[term_id,term_name,term_relation, term_reference, term_subset,term_synonym,term_def, term_alt_id, term_obsolete]], columns = ['doid','name', 'is_a', 'external_reference', 'subset', 'synonym', 'def', 'alterate_id', 'obsolete'])
    term_df = pd.concat([term_df, df])

  0%|          | 0/13406 [00:00<?, ?it/s]

In [14]:
term_df

Unnamed: 0,doid,name,is_a,external_reference,subset,synonym,def,alterate_id,obsolete
0,DOID:0001816,angiosarcoma,"(DOID:175, vascular cancer)","[ICDO:9120/3, MESH:D006394, NCI:C3088, NCI:C92...","[DO_cancer_slim, NCIthesaurus]","[""hemangiosarcoma"" EXACT []]","""A vascular cancer that derives_from the cells...","[DOID:267, DOID:4508]",false
0,DOID:0002116,pterygium,"(DOID:10124, corneal disease)",[UMLS_CUI:C0033999],[],"[""surfer's eye"" EXACT []]","""A corneal disease that is characterized by a ...",[],false
0,DOID:0014667,disease of metabolism,"(DOID:4, disease)","[ICD10CM:E88.9, ICD9CM:277.9, MESH:D008659, NC...","[DO_AGR_slim, DO_CFDE_slim, DO_GXD_slim, NCIth...","[""metabolic disease"" EXACT []]","""A disease that involving errors in metabolic ...",[],false
0,DOID:0040001,shrimp allergy,"(DOID:0060524, crustacean allergy)",[],[DO_IEDB_slim],[],"""A crustacean allergy that has_allergic_trigge...",[],false
0,DOID:0040002,aspirin allergy,"(DOID:0060500, drug allergy)","[SNOMEDCT_US_2021_09_01:293586001, UMLS_CUI:C0...",[DO_IEDB_slim],"[""acetylsalicylic acid allergy"" EXACT [], ""ASA...","""A drug allergy that has_allergic_trigger acet...",[],false
...,...,...,...,...,...,...,...,...,...
0,DOID:9989,obsolete metastasis to the orbit,,[],[],"[""metastatic tumor to the orbit"" EXACT [], ""se...",,[],true
0,DOID:999,hypereosinophilic syndrome,"(DOID:9500, leukocyte disease)","[GARD:2804, ICD10CM:D72.1, ICD9CM:288.3, MESH:...",[],"[""eosinophilia"" EXACT [], ""Eosinophilic leukoc...","""A leukocyte disease that is characterized by ...",[],false
0,DOID:9993,hypoglycemia,"(DOID:4194, glucose metabolism disease)","[ICD10CM:E16.2, ICD9CM:251.2, MESH:D007003, NC...",[NCIthesaurus],"[""Hypoglycaemia"" EXACT []]","""A glucose metabolism disease that is characte...",[],false
0,DOID:9995,obsolete endocrine and metabolic disturbances ...,,[],[],[],,[],true


In [19]:
term_df = term_df.iloc[:-1]
#term_df = term_df[term_df.obsolete != 'true']

In [20]:
definitions = []
for string in term_df['def']:
    try:
        def_tuple = remove_notes(string, start_sep = '{', end_sep = '}', with_notes=True)
        definition = def_tuple[0]
        definition_comment = def_tuple[1]
        def_tuple = remove_notes(definition, start_sep = '[', end_sep = ']', with_notes=True)
        definition = def_tuple[0]
        definition_links = def_tuple[1]

        definitions += [(definition[1:-1], definition_comment, definition_links)]
    except:
        definitions += [(np.nan, np.nan, np.nan)]

In [21]:
definition_df = pd.DataFrame(definitions, columns=['definition', 'definition_comments', 'definition_link'])

In [22]:
term_df = term_df.reset_index(drop=True)

In [24]:
synonyms_list = []
for synonym_list in notebook.tqdm(term_df.synonym):
    synonyms = []
    for string in synonym_list:
        def_tuple = remove_notes(string, start_sep = '[', end_sep = ']', with_notes=True)
        nstring = def_tuple[0].strip()
        synonym_detail = def_tuple[1]
        split_string = nstring.split('" ')
        synonym_name = split_string[0].replace('"','')
        synonym_type = split_string[1].strip()

        synonyms += [(synonym_name, synonym_type, synonym_detail)]
    synonyms_list += [synonyms]
term_df.synonym = synonyms_list

  0%|          | 0/16208 [00:00<?, ?it/s]

In [25]:
parent_df = pd.DataFrame()
for item in notebook.tqdm(term_df.is_a):
    if isinstance(item, float):
        df = pd.DataFrame([[np.nan, np.nan]], columns = ['parent_doid', 'parent_name'])
    else:
        df = pd.DataFrame([item], columns = ['parent_doid', 'parent_name'])
    parent_df = pd.concat([parent_df, df])
    
parent_df = parent_df.reset_index(drop=True)
term_df = pd.concat([term_df, parent_df], axis=1)

  0%|          | 0/16208 [00:00<?, ?it/s]

In [26]:
term_df

Unnamed: 0,doid,name,is_a,external_reference,subset,synonym,def,alterate_id,obsolete,parent_doid,parent_name
0,DOID:0001816,angiosarcoma,"(DOID:175, vascular cancer)","[ICDO:9120/3, MESH:D006394, NCI:C3088, NCI:C92...","[DO_cancer_slim, NCIthesaurus]","[(hemangiosarcoma, EXACT, )]","""A vascular cancer that derives_from the cells...","[DOID:267, DOID:4508]",false,DOID:175,vascular cancer
1,DOID:0002116,pterygium,"(DOID:10124, corneal disease)",[UMLS_CUI:C0033999],[],"[(surfer's eye, EXACT, )]","""A corneal disease that is characterized by a ...",[],false,DOID:10124,corneal disease
2,DOID:0014667,disease of metabolism,"(DOID:4, disease)","[ICD10CM:E88.9, ICD9CM:277.9, MESH:D008659, NC...","[DO_AGR_slim, DO_CFDE_slim, DO_GXD_slim, NCIth...","[(metabolic disease, EXACT, )]","""A disease that involving errors in metabolic ...",[],false,DOID:4,disease
3,DOID:0040001,shrimp allergy,"(DOID:0060524, crustacean allergy)",[],[DO_IEDB_slim],[],"""A crustacean allergy that has_allergic_trigge...",[],false,DOID:0060524,crustacean allergy
4,DOID:0040002,aspirin allergy,"(DOID:0060500, drug allergy)","[SNOMEDCT_US_2021_09_01:293586001, UMLS_CUI:C0...",[DO_IEDB_slim],"[(acetylsalicylic acid allergy, EXACT, ), (ASA...","""A drug allergy that has_allergic_trigger acet...",[],false,DOID:0060500,drug allergy
...,...,...,...,...,...,...,...,...,...,...,...
16203,DOID:9988,tertiary neurosyphilis,"(DOID:936, brain disease)","[ICD10CM:A52.3, ICD9CM:094, MESH:D009494, NCI:...","[gram-negative_bacterial_infectious_disease, N...","[(late neurosyphilis, EXACT, )]","""A tertiary syphilis that results in infection...",[],false,DOID:936,brain disease
16204,DOID:9989,obsolete metastasis to the orbit,,[],[],"[(metastatic tumor to the orbit, EXACT, ), (se...",,[],true,,
16205,DOID:999,hypereosinophilic syndrome,"(DOID:9500, leukocyte disease)","[GARD:2804, ICD10CM:D72.1, ICD9CM:288.3, MESH:...",[],"[(eosinophilia, EXACT, ), (Eosinophilic leukoc...","""A leukocyte disease that is characterized by ...",[],false,DOID:9500,leukocyte disease
16206,DOID:9993,hypoglycemia,"(DOID:4194, glucose metabolism disease)","[ICD10CM:E16.2, ICD9CM:251.2, MESH:D007003, NC...",[NCIthesaurus],"[(Hypoglycaemia, EXACT, )]","""A glucose metabolism disease that is characte...",[],false,DOID:4194,glucose metabolism disease


In [195]:
term_df.to_pickle(user_directory + '/knowledge_base/disease_ontology.pkl')

Unnamed: 0,doid,name,is_a,external_reference,subset,synonym,def,alterate_id,obsolete,parent_doid,parent_name
0,DOID:0001816,angiosarcoma,"(DOID:175, vascular cancer)","[ICDO:9120/3, MESH:D006394, NCI:C3088, NCI:C92...","[DO_cancer_slim, NCIthesaurus]","[(hemangiosarcoma, EXACT, )]","""A vascular cancer that derives_from the cells...","[DOID:267, DOID:4508]",false,DOID:175,vascular cancer
1,DOID:0002116,pterygium,"(DOID:10124, corneal disease)",[UMLS_CUI:C0033999],[],"[(surfer's eye, EXACT, )]","""A corneal disease that is characterized by a ...",[],false,DOID:10124,corneal disease
2,DOID:0014667,disease of metabolism,"(DOID:4, disease)","[ICD10CM:E88.9, ICD9CM:277.9, MESH:D008659, NC...","[DO_AGR_slim, DO_GXD_slim, NCIthesaurus]","[(metabolic disease, EXACT, )]","""A disease that involving errors in metabolic ...",[],false,DOID:4,disease
3,DOID:0040001,shrimp allergy,"(DOID:0060524, crustacean allergy)",[],[DO_IEDB_slim],[],"""A crustacean allergy that has_allergic_trigge...",[],false,DOID:0060524,crustacean allergy
4,DOID:0040002,aspirin allergy,"(DOID:0060500, drug allergy)","[SNOMEDCT_US_2020_09_01:293586001, UMLS_CUI:C0...",[DO_IEDB_slim],"[(acetylsalicylic acid allergy, EXACT, ), (ASA...","""A drug allergy that has_allergic_trigger acet...",[],false,DOID:0060500,drug allergy
...,...,...,...,...,...,...,...,...,...,...,...
10785,DOID:9986,orbit lymphoma,"(DOID:4143, orbital cancer)","[GARD:9719, MESH:C537131, NCI:C6244, SNOMEDCT_...",[NCIthesaurus],"[(Lymphoma of the orbit, EXACT, )]","""An orbital cancer that has _material_basis_in...",[],false,DOID:4143,orbital cancer
10786,DOID:9987,orbit sarcoma,"(DOID:4143, orbital cancer)","[NCI:C6095, SNOMEDCT_US_2020_09_01:699354006, ...",[NCIthesaurus],"[(orbital sarcoma, EXACT, )]","""An orbital cancer that has_material_basis_in ...",[],false,DOID:4143,orbital cancer
10787,DOID:9988,tertiary neurosyphilis,"(DOID:936, brain disease)","[ICD10CM:A52.3, ICD9CM:094, MESH:D009494, NCI:...","[gram-negative_bacterial_infectious_disease, N...","[(late neurosyphilis, EXACT, )]","""A tertiary syphilis that results in infection...",[],false,DOID:936,brain disease
10788,DOID:999,hypereosinophilic syndrome,"(DOID:9500, leukocyte disease)","[GARD:2804, ICD10CM:D72.1, ICD9CM:288.3, MESH:...",[],"[(eosinophilia, EXACT, ), (Eosinophilic leukoc...","""A leukocyte disease that is characterized by ...",[],false,DOID:9500,leukocyte disease


In [27]:
sources = []
for row in notebook.tqdm(term_df.iterrows(), total=term_df.shape[0]):
    reference_list = row[1].external_reference
    reference_dict = {}
    for reference in reference_list:
        reference_dict = {**reference_dict, **{reference.split(':')[0]:reference.split(':')[1]}}
    sources += [reference_dict]
term_df['xref'] = sources 
    


  0%|          | 0/16208 [00:00<?, ?it/s]

In [51]:
non_obsolete = term_df[term_df.obsolete != True]
mappings = non_obsolete[['doid', 'xref']]

doid_mapping_df = pd.DataFrame()
for idx, row in notebook.tqdm(mappings.iterrows(), total=mappings.shape[0]):
    doid = row['doid'].split(':')[1]
    xref = row['xref']
    if len(list(xref.keys())) == 0:
        pass
    else:
        df = pd.json_normalize(xref)
        df['DOID'] = doid
        doid_mapping_df = pd.concat([doid_mapping_df, df])

  0%|          | 0/16208 [00:00<?, ?it/s]

In [53]:
doid_mapping_df.to_csv(user_directory + '/mapping/doid_map.csv', index=False)

In [351]:
umls_list = []
for ref_dic in term_df.xref:
    try:
        umls_list += [ref_dic['UMLS_CUI']]
    except:
        pass

In [354]:
umls_synonym_df = pd.DataFrame()

for identifier in umls_list:

    url = 'https://ncim.nci.nih.gov/ncimbrowser/pages/concept_details.jsf?dictionary=NCI%20Metathesaurus&code=' + identifier + '&type=synonym'
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html5lib')
    tables = soup.find_all('table')
    for table in tables:
        try:
            if table['role'] == 'presentation':
                pass
            else:
                print(table)
        except:
            synonym_table = pd.read_html(str(table))[0]
    synonym_table['umls_code'] = identifier
    umls_synonym_df = pd.concat([umls_synonym_df, synonym_table])

# OrphaNet

In [338]:
filename = 'C:/Users/micha/Downloads/Orphanet_Nomenclature_Pack_EN/ORPHAnomenclature_en.xml'
with open(filename, 'r') as f:
    xml = f.read()
    
soup = BeautifulSoup(xml)
html = soup.html.body.jdbor
disorder_list = html.disorderlist.find_all('disorder')

orphanet_disorder_df = pd.DataFrame()
for disorder in disorder_list:
    for item in disorder:
        if item.name == None:
            pass
        else:
            if item.name == 'orphacode':
                orphacode = item.text.strip()
            elif item.name == 'name':
                orphaname = item.text.strip()
                orphaname_lang = item['lang']
            elif item.name == 'synonymlist':
                synonymlist = item.find_all('synonym')
                synonymlist = [(synonym.text.strip(), synonym['lang']) for synonym in synonymlist]
            elif item.name == 'disordertype':
                disorder_type_id = item['id']
                children = item.findChildren()
                disordertype = []
                for child in children:
                     if child.name == 'name':
                            disordertype = child.text.strip()
            else:
                pass

    df = pd.DataFrame([[orphacode, orphaname, orphaname_lang, synonymlist, disordertype, disorder_type_id]], columns = ['orphacode', 'orphaname', 'orphaname_lang', 'synonymlist', 'disordertype', 'disorder_type_id'])
    orphanet_disorder_df = pd.concat([orphanet_disorder_df, df])

In [198]:
alias_df = term_df[['doid', 'name']]
alias_df.columns = ['doid', 'alias']

for row in term_df.iterrows():
    doid = row[1].doid
    synonym_list = row[1].synonym
    for item in synonym_list:
        if item[1] == 'EXACT':
            df = pd.DataFrame([[doid, item[0]]], columns =['doid', 'alias'])
            alias_df = pd.concat([alias_df, df])

In [201]:
bmt_indications = pd.read_csv(user_directory + '/biomedtracker/cleaned/bmt_indications.csv')

In [206]:
doids = []
for indication in bmt_indications.indication:
    df = alias_df[alias_df.alias.str.lower() == remove_notes(indication.lower())]
    doid_list = df.doid.drop_duplicates().tolist()
    if len(doid_list) == 0:
        doid = np.nan
    elif len(doid_list) > 1:
        doid = np.nan
        print(indication)
    else:
        doid = doid_list[0]

    doids += [doid]

Scleroderma


In [208]:
bmt_indications['doid'] = doids

In [216]:
no_match = bmt_indications[bmt_indications.doid.isnull()]


In [218]:
import ftfy
from fuzzywuzzy import process

In [219]:
no_match.indication = [ftfy.fix_text(indication) for indication in no_match.indication]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [241]:
is_imaging = []
no_imaging = []
for indication in no_match.indication:
    is_imaging += ['- Imaging' in indication]
    no_imaging += ['- Imaging' not in indication]

In [242]:
imaging = no_match[is_imaging]
no_match = no_match[no_imaging]

In [244]:
no_match['modified_indication'] = [remove_notes(indication) for indication in no_match.indication]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_match['modified_indication'] = [remove_notes(indication) for indication in no_match.indication]


In [246]:
doid_alias_list = alias_df.alias.tolist()

In [254]:
indication = no_match.modified_indication.tolist()[3]
process.extractOne(indication, doid_alias_list)

('methylmalonic aciduria and homocystinuria type cblC', 86)

# MONDO

In [None]:
file = 'C:/Users/mmanifesto/Documents/mondo.json'
with open (file, 'r', encoding='utf-8') as f:
    mondo_json = json.load(f)
    
mondo_nodes = mondo_json['graphs'][0]['nodes']

In [None]:
disease_tuples = []
for node in mondo_nodes:
    try:
        mondoID = node['id']
        mondoType = node['type']
        mondoLabel = node['lbl']
        #mondoDefinition = node['meta']['definition']['val']

        disease_tuples += [(mondoID, mondoLabel, 'hasExactSynonym', mondoID)]

        try:
            synonym_list = node['meta']['synonyms']
            for item in synonym_list:
                synonymType = item['pred']
                synonym = item['val']
                synonymXrefList = item['xrefs']

                disease_tuples += [(mondoID, synonym, synonymType, synonymXrefList)]
        except:
            pass
    except:
        pass
mondo_disease_synonym_df = pd.DataFrame(disease_tuples, columns=['mondoId', 'diseaseName', 'synonymType', 'synonymXrefList'])
mondo_disease_synonym_df.mondoId = mondo_disease_synonym_df.mondoId.str.replace('http://purl.obolibrary.org/obo/', '')
mondo_disease_synonym_df.diseaseName = mondo_disease_synonym_df.diseaseName.str.lower()

In [None]:
# Create a DF of the official Mondo Ontology Names
mondo_tuples = []
for node in mondo_nodes:
    try:
        mondoID = node['id']
    except:
        mondoID = np.nan
    try:
        mondoType = node['type']
    except:
        mondoType = np.nan
    try:
        mondoLabel = node['lbl']
    except:
        mondoLabel = np.nan
    try:
        mondoDefinition = node['meta']['definition']['val']
    except:
        mondoDefinition = np.nan
        
    mondo_tuples += [(mondoID, mondoType, mondoLabel, mondoDefinition)]
    
mondo_df = pd.DataFrame(mondo_tuples, columns=['mondoId', 'mondoType', 'mondoLabel', 'mondoDefinition'])
mondo_df.mondoId = mondo_df.mondoId.str.replace('http://purl.obolibrary.org/obo/', '')
mondo_df.mondoLabel = mondo_df.mondoLabel.str.lower()

In [None]:
# Select Biomedtracker Indications from their webpage
file = 'C:/Users/mmanifesto/Documents/BiomedtrackerDisease.htm'
with open (file, 'r') as f:
    html = f.read()
soup = BeautifulSoup(html, 'html5lib')

coverage = soup.find('div', {'data-target':'#navCoverage'})
children = coverage.findChildren(recursive=False)

section_tuples = []
section = ''
for item in children:
    if section == '':
        if item.name == 'h4':
            section = item.text.strip()
            sectionid = item['id']
    else:
        if item.name == 'h4':
            section = item.text.strip()
            sectionid = item['id']
        else:
            a_tags = item.find_all('a')
            for tag in a_tags:
                section_tuples += [(section, sectionid, tag.text.strip(), tag['href'])]

In [None]:
bmt_disease_df = pd.DataFrame(section_tuples, columns = ['area', 'areaid', 'disease', 'link'])
bmt_disease_df['modified_name'] = [fix_text(remove_notes(disease).lower()) for disease in bmt_disease_df.disease]

In [None]:
label = []
for disease in bmt_disease_df.modified_name:
    if 'imag' in disease:
        label += ['imaging']
        print(disease)
    
    elif 'prev' in disease:
        label += ['prevention']
        print(disease)
        
    elif 'treat' in disease:
        label += ['treatment']
        print(disease)
    else:
        label += [np.nan]

In [None]:
bmt_disease_df['label'] = label
no_label = bmt_disease_df[bmt_disease_df.label.isnull()]

mondo2 = mondo_disease_synonym_df.drop(columns = ['synonymXrefList'])
mondo2 = mondo2.drop_duplicates().reset_index(drop=True)
mondo2['ontology'] = [mondoId.split('_')[0] for mondoId in mondo2.mondoId]

In [None]:
disease_match = []
multiples = []
for disease in no_label.modified_name:
    match_df = mondo2[mondo2.diseaseName == disease]
    if match_df.shape[0] == 0:
        mondoid = np.nan
        synonymType = np.nan
    elif match_df.shape[0] == 1:
        mondoid = match_df.iloc[0].mondoId
        synonymType = match_df.iloc[0].synonymType
    else:
        df2 = match_df[match_df.synonymType == 'hasExactSynonym']
        if df2.shape[0] == 1:
            mondoid = df2.iloc[0].mondoId
            synonymType = df2.iloc[0].synonymType
        else:
            df3 = df2[df2.ontology == 'MONDO']
            if df3.shape[0] == 1:
                mondoid = df3.iloc[0].mondoId
                synonymType = df3.iloc[0].synonymType
            else:
                df3 = pd.merge(df3, mondo_df, on='mondoId', how='left')
                ratios = []
                partials = []
                for idx, row in df3.iterrows():
                    diseaseName = row.diseaseName
                    mondoLabel = row.mondoLabel
                    ratios += [fuzz.ratio(mondoLabel.lower(), diseaseName.lower())]
                    partials += [fuzz.partial_ratio(mondoLabel.lower(), diseaseName.lower())]
                df3['ratio'] = ratios
                df3['partial'] = partials
                try:
                    df3 = df3[df3.ratio == max(df3.ratio)]
                    df3 = df3[df3.ratio  >= 80]
                    if df3.shape[0] == 1:
                        mondoid = df3.iloc[0].mondoId
                        synonymType = df3.iloc[0].synonymType 
                    else:
                        monoid = np.nan
                        synonymType = np.nan
                except:
                    pass
                
    disease_match += [(disease, mondoid, synonymType)]

In [None]:
disease_match_df = pd.DataFrame(disease_match, columns = ['modified_name', 'mondoId', 'synonymType'])
print(disease_match_df[disease_match_df.mondoId.notnull()].shape[0] / disease_match_df.shape[0])

matched_df = disease_match_df[disease_match_df.mondoId.notnull()]
unmatched_df = disease_match_df[disease_match_df.mondoId.isnull()]

# Add the mondo fields to the df where there was one match found
matched_df = pd.merge(matched_df, mondo_df, on = 'mondoId', how='left')

matched_df.to_csv('C:/Users/mmanifesto/Documents/mondo_bmt_matched.csv', index=False)
unmatched_df.to_csv('C:/Users/mmanifesto/Documents/mondo_bmt_unmatched.csv', index=False)

In [None]:
for mondoId in matched_df.mondoId:
    mondo_disease_synonym_df = mondo_disease_synonym_df[mondo_disease_synonym_df != mondoId]
    
diseaseName_list = mondo_disease_synonym_df.diseaseName.tolist()
name_list = unmatched_df.modified_name.tolist()

In [None]:
match_list = []
for disease in notebook.tqdm(name_list):
    match_list += [process.extractOne(disease, diseaseName_list)]

fuzzy_match_df = pd.DataFrame(match_list, columns = ['diseaseName', 'ratio'])
fuzzy_match_df['modified_name'] = name_list

fuzzy_match_df = fuzzy_match_df.sort_values('ratio', ascending=False)
fuzzy_match_df = fuzzy_match_df[fuzzy_match_df.ratio >= 95]

fuzzy_match_df.to_csv('C:/Users/mmanifesto/Documents/fuzzy_matches.csv', index=False)