# Fuzzy comparison
The goal of this tool is to work out how to merge multiple data sources together. Mimic 3 and mimic 4 share similar filenames, e.g. `patients.csv` and `patients.csv.gz`. Inside each file the column names may also differ slightly, e.g. `icd9_code` in mimic 3 versus `icd_code` in mimic 4.
The current solution I am trying out is to use fuzzy string comparison and, for each name, keep the candidate with the best score.

In [34]:
import pandas as pd
import json
from fuzzywuzzy import fuzz,process

In [35]:
# Parse mimic_3.json into memory. JSON text is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform locale
# (which is not UTF-8 on every system).
with open("mimic_3.json", "r", encoding="utf-8") as f:
    mimic_3 = json.load(f)

# Dump the parsed structure for a quick visual check.
print(mimic_3)



In [36]:
# Parse the three mimic 4 JSON files (ed, hosp, icu). JSON text is UTF-8 by
# specification, so the encoding is given explicitly instead of relying on
# the platform locale.
with open("mimic_4_ed.json", "r", encoding="utf-8") as f:
    mimic_4_ed = json.load(f)

with open("mimic_4_hosp.json", "r", encoding="utf-8") as f:
    mimic_4_hosp = json.load(f)

with open("mimic_4_icu.json", "r", encoding="utf-8") as f:
    mimic_4_icu = json.load(f)


# Fold the three parts into a single lookup. The order (ed, icu, hosp) is
# significant: it fixes the key order of the merged dict, and on a duplicate
# key the later part overwrites the earlier one.
mimic_4 = {}
for part in (mimic_4_ed, mimic_4_icu, mimic_4_hosp):
    mimic_4.update(part)


In [37]:
# Number of entries in the merged mimic 4 lookup.
print(len(mimic_4))

35


In [38]:
# Number of entries in the mimic 3 lookup.
print(len(mimic_3))

25


In [39]:
# Sanity check of the scorer: the last word differs in two characters
# ("bif" vs "pig").
fuzz.ratio("this is a bif", "this is a pig")

85

In [40]:
# Sanity check of the scorer: the last word differs in a single character
# ("bif" vs "big").
fuzz.ratio("this is a bif", "this is a big")

92

In [41]:
# Show the merged mimic 4 keys in insertion order.
list(mimic_4)

['diagnosis.csv',
 'medrecon.csv',
 'vitalsign.csv',
 'triage.csv',
 'edstays.csv',
 'pyxis.csv',
 'datetimeevents.csv',
 'ingredientevents.csv',
 'inputevents.csv',
 'procedureevents.csv',
 'd_items.csv',
 'chartevents.csv',
 'icustays.csv',
 'outputevents.csv',
 'poe.csv',
 'd_hcpcs.csv',
 'poe_detail.csv',
 'patients.csv',
 'diagnoses_icd.csv',
 'emar_detail.csv',
 'prescriptions.csv',
 'drgcodes.csv',
 'd_icd_diagnoses.csv',
 'd_labitems.csv',
 'transfers.csv',
 'admissions.csv',
 'labevents.csv',
 'pharmacy.csv',
 'procedures_icd.csv',
 'hcpcsevents.csv',
 'services.csv',
 'd_icd_procedures.csv',
 'omr.csv',
 'emar.csv',
 'microbiologyevents.csv']

In [42]:
# Show the mimic 3 keys in insertion order.
list(mimic_3)

['PROCEDUREEVENTS_MV',
 'CALLOUT',
 'D_CPT',
 'D_ITEMS',
 'CAREGIVERS',
 'MICROBIOLOGYEVENTS',
 'LABEVENTS',
 'INPUTEVENTS_CV',
 'ADMISSIONS',
 'D_LABITEMS',
 'DATETIMEEVENTS',
 'PRESCRIPTIONS',
 'PROCEDURES_ICD',
 'CHARTEVENTS',
 'TRANSFERS',
 'DIAGNOSES_ICD',
 'SERVICES',
 'DRGCODES',
 'OUTPUTEVENTS',
 'PATIENTS',
 'D_ICD_DIAGNOSES',
 'ICUSTAYS',
 'INPUTEVENTS_MV',
 'D_ICD_PROCEDURES',
 'CPTEVENTS']

In [43]:
list1 = list(mimic_3)
list2 = list(mimic_4)

# Brute-force pairing: every mimic 3 key is scored against every mimic 4 key
# (case-insensitively) and the highest-scoring candidate is reported.
print("mimic 3 -> mimic 4 (confidence ratio)")
print("--------------------------------------")
for item1 in list1:
    needle = item1.lower()
    scored = [(fuzz.ratio(needle, item2.lower()), item2) for item2 in list2]
    # max() keeps the first of several equal scores, i.e. the earliest
    # candidate wins a tie.
    best_ratio, best_match = max(
        scored, key=lambda pair: pair[0], default=(0, None)
    )
    if best_ratio == 0:
        # A score of zero is not treated as a match at all.
        best_match = None
    print(f"{item1} -> {best_match} ({best_ratio}%)")

mimic 3 -> mimic 4 (confidence ratio)
--------------------------------------
PROCEDUREEVENTS_MV -> procedureevents.csv (86%)
CALLOUT -> microbiologyevents.csv (28%)
D_CPT -> d_hcpcs.csv (50%)
D_ITEMS -> d_items.csv (78%)
CAREGIVERS -> chartevents.csv (56%)
MICROBIOLOGYEVENTS -> microbiologyevents.csv (90%)
LABEVENTS -> labevents.csv (82%)
INPUTEVENTS_CV -> inputevents.csv (90%)
ADMISSIONS -> admissions.csv (83%)
D_LABITEMS -> d_labitems.csv (83%)
DATETIMEEVENTS -> datetimeevents.csv (88%)
PRESCRIPTIONS -> prescriptions.csv (87%)
PROCEDURES_ICD -> procedures_icd.csv (88%)
CHARTEVENTS -> chartevents.csv (85%)
TRANSFERS -> transfers.csv (82%)
DIAGNOSES_ICD -> diagnoses_icd.csv (87%)
SERVICES -> services.csv (80%)
DRGCODES -> drgcodes.csv (80%)
OUTPUTEVENTS -> outputevents.csv (86%)
PATIENTS -> patients.csv (80%)
D_ICD_DIAGNOSES -> d_icd_diagnoses.csv (88%)
ICUSTAYS -> icustays.csv (80%)
INPUTEVENTS_MV -> inputevents.csv (83%)
D_ICD_PROCEDURES -> d_icd_procedures.csv (89%)
CPTEVENTS -> inp

In [46]:
# Only the column names are needed, so read just the header row (nrows=0)
# instead of parsing the whole CSV into memory.
list1 = pd.read_csv(mimic_3["PROCEDUREEVENTS_MV"]["filepath"], nrows=0).columns.to_list()
list2 = pd.read_csv(mimic_4["procedureevents.csv"]["filepath"], nrows=0).columns.to_list()

In [47]:
# Quadratic fuzzy matching: each name in list1 is compared with every name in
# list2 (case-insensitively) and the best-scoring candidate is printed.
print("mimic 3 PROCEDUREEVENTS_MV -> mimic 4 procedureevents.csv (confidence ratio)")
print("--------------------------------------")
# Lower-case the candidates once instead of once per comparison.
candidates = [(name, name.lower()) for name in list2]
for item1 in list1:
    source = item1.lower()
    best_match, best_ratio = None, 0
    for item2, lowered in candidates:
        ratio = fuzz.ratio(source, lowered)
        if ratio <= best_ratio:
            # Not strictly better: keep the earlier candidate.
            continue
        best_match, best_ratio = item2, ratio
    print(f"{item1} -> {best_match} ({best_ratio}%)")

mimic 3 PROCEDUREEVENTS_MV -> mimic 4 procedureevents.csv (confidence ratio)
--------------------------------------
Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
icustay_id -> stay_id (82%)
starttime -> starttime (100%)
endtime -> endtime (100%)
itemid -> itemid (100%)
value -> value (100%)
valueuom -> valueuom (100%)
location -> location (100%)
locationcategory -> locationcategory (100%)
storetime -> storetime (100%)
cgid -> caregiver_id (50%)
orderid -> orderid (100%)
linkorderid -> linkorderid (100%)
ordercategoryname -> ordercategoryname (100%)
secondaryordercategoryname -> ordercategoryname (79%)
ordercategorydescription -> ordercategorydescription (100%)
isopenbag -> isopenbag (100%)
continueinnextdept -> continueinnextdept (100%)
cancelreason -> locationcategory (43%)
statusdescription -> statusdescription (100%)
comments_editedby -> patientweight (40%)
comments_canceledby -> continueinnextdept (43%)
comments_da