# Fuzzy comparison
The goal of this tool is to collectively solve how to merge multiple data source together. In mimic 3 and mimic 4, it shares similar filenames `patients.csv` and `patients.csv.gz`. Also inside each file, it might have columns `icd_code` in mimic3 and `icd9_code` in mimic4. 
The current solution i am trying out is to use fuzzy comparison to get the best score.

In [2]:
import pandas as pd
import json
from fuzzywuzzy import fuzz,process
import ipywidgets as widgets



open mimic_3

In [3]:
with open("data/mimic_3.json", "r") as f:
    mimic_3 = json.load(f)

print(mimic_3)



open mimic_4 and merge them

In [4]:

with open("data/mimic_4_ed.json", "r") as f:
    mimic_4_ed = json.load(f)

with open("data/mimic_4_hosp.json", "r") as f:
    mimic_4_hosp = json.load(f)

with open("data/mimic_4_icu.json", "r") as f:
    mimic_4_icu = json.load(f)


mimic_4 = dict()
mimic_4.update(mimic_4_ed)
mimic_4.update(mimic_4_icu)
mimic_4.update(mimic_4_hosp)


In [5]:

import json

# Write dictionary to file as JSON
with open('./data/mimic_4.json', 'w') as f:
    json.dump(mimic_4, f)




## Display mimic3

In [6]:
mimic_3_options = list(mimic_3.keys())

dropdown_mimic_3 = widgets.Dropdown(options = mimic_3_options, 
                                    value = mimic_3_options[0],
                                    description = "table name: ",
                                    disabled = False)



def display_value(key):
    value = mimic_3[key]
    print(key)
    for minikey,datatype in value.items():
        print(f"{minikey:<30}{datatype}")


widgets.interact(display_value,key = dropdown_mimic_3)    
    

interactive(children=(Dropdown(description='table name: ', options=('PROCEDUREEVENTS_MV', 'CALLOUT', 'D_CPT', …

<function __main__.display_value(key)>

# Table-level cross reference
for every table in mimic 3, find the counterpart in mimic_4 with fuzzy match

In [7]:
def get_matched_table(json_1,json_2, num_best_match = 3):
    """
    input two json files about the metadata information, returns a dataframe
    with five columns

    :param _type_ json_1: _description_
    :param _type_ json_2: _description_
    :param int num_best_match: _description_, defaults to 3
    :return _type_: _description_
    """
    
    # obtain the keys of json
    tables_1 = list(json_1.keys())
    tables_2 = list(json_2.keys())
    
    output = [[0 for j in range(5)] for i in range(len(tables_1))]
    
    # lower case all of the data
    tables_2_lower = [i.lower() for i in tables_2]

    for i,table_1 in enumerate(tables_1):
        matches = process.extract(table_1.lower(),tables_2_lower,limit = num_best_match)
        print(f"Best three matches for {table_1}:")
        for match in matches:
            print(f"{match[0]:<10} Confidence: {match[1]}%")
        print()
        # append to output
        output[i][0] = table_1
        output[i][1] = json_1[table_1]["filepath"]
        output[i][2] = matches[0][0]
        output[i][3] = json_2[matches[0][0]]["filepath"]
        output[i][4] = matches[0][1]
    
    df = pd.DataFrame(output,columns=["table_mimic_3",
                                      "filepath_mimic3",
                                      "table_mimic_4",
                                      "filepath_mimic4",
                                      "confidence"])
    return df

df_table_matches = get_matched_table(mimic_3,
                  mimic_4)

Best three matches for PROCEDUREEVENTS_MV:
procedureevents.csv Confidence: 86%
procedures_icd.csv Confidence: 67%
ingredientevents.csv Confidence: 58%

Best three matches for CALLOUT:
outputevents.csv Confidence: 49%
chartevents.csv Confidence: 39%
datetimeevents.csv Confidence: 37%

Best three matches for D_CPT:
d_hcpcs.csv Confidence: 72%
d_items.csv Confidence: 54%
prescriptions.csv Confidence: 54%

Best three matches for D_ITEMS:
d_items.csv Confidence: 90%
d_labitems.csv Confidence: 67%
datetimeevents.csv Confidence: 51%

Best three matches for CAREGIVERS:
chartevents.csv Confidence: 56%
labevents.csv Confidence: 49%
procedureevents.csv Confidence: 45%

Best three matches for MICROBIOLOGYEVENTS:
microbiologyevents.csv Confidence: 95%
labevents.csv Confidence: 49%
chartevents.csv Confidence: 48%

Best three matches for LABEVENTS:
labevents.csv Confidence: 95%
chartevents.csv Confidence: 70%
datetimeevents.csv Confidence: 60%

Best three matches for INPUTEVENTS_CV:
inputevents.csv C

In [8]:
df_table_matches

Unnamed: 0,table_mimic_3,filepath_mimic3,table_mimic_4,filepath_mimic4,confidence
0,PROCEDUREEVENTS_MV,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,procedureevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...,86
1,CALLOUT,../basic_filtered_data/mimic-iii-demo/CALLOUT.csv,outputevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/o...,49
2,D_CPT,../basic_filtered_data/mimic-iii-demo/D_CPT.csv,d_hcpcs.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,72
3,D_ITEMS,../basic_filtered_data/mimic-iii-demo/D_ITEMS.csv,d_items.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/d...,90
4,CAREGIVERS,../basic_filtered_data/mimic-iii-demo/CAREGIVE...,chartevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/c...,56
5,MICROBIOLOGYEVENTS,../basic_filtered_data/mimic-iii-demo/MICROBIO...,microbiologyevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
6,LABEVENTS,../basic_filtered_data/mimic-iii-demo/LABEVENT...,labevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
7,INPUTEVENTS_CV,../basic_filtered_data/mimic-iii-demo/INPUTEVE...,inputevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/i...,90
8,ADMISSIONS,../basic_filtered_data/mimic-iii-demo/ADMISSIO...,admissions.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
9,D_LABITEMS,../basic_filtered_data/mimic-iii-demo/D_LABITE...,d_labitems.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95


In [9]:
list1 = pd.read_csv(mimic_3["PROCEDUREEVENTS_MV"]["filepath"]).columns.to_list()
list2 = pd.read_csv(mimic_4["procedureevents.csv"]["filepath"]).columns.to_list()

# Column-level cross reference
After we get the match for each table, we need to do the same for each column in each table.

In [10]:
def get_matched_columns(df, num_best_match = 3):
    
    for i,row in df.iterrows():
        df_mimic_3 = pd.read_csv(row[1])
        df_mimic_4 = pd.read_csv(row[3])
    
        list1 = df_mimic_3.columns.to_list()
        list2 = df_mimic_4.columns.to_list()
            
        for item1 in list1:
            best_match = None
            best_ratio = 0
            for item2 in list2:
                ratio = fuzz.ratio(item1.lower(), item2.lower())
                if ratio > best_ratio:
                    best_ratio = ratio
                    best_match = item2
            print(f"{item1} -> {best_match} ({best_ratio}%)")
    
    
    
    # lower case all of the data
    # tables_2_lower = [i.lower() for i in tables_2]

    # for i,table_1 in enumerate(tables_1):
    #     matches = process.extract(table_1.lower(),tables_2_lower,limit = num_best_match)
    #     print(f"Best three matches for {table_1}:")
    #     for match in matches:
    #         print(f"{match[0]:<10} Confidence: {match[1]}%")
    #     print()
        # append to output

get_matched_columns(df_table_matches)



Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
icustay_id -> stay_id (82%)
starttime -> starttime (100%)
endtime -> endtime (100%)
itemid -> itemid (100%)
value -> value (100%)
valueuom -> valueuom (100%)
location -> location (100%)
locationcategory -> locationcategory (100%)
storetime -> storetime (100%)
cgid -> caregiver_id (50%)
orderid -> orderid (100%)
linkorderid -> linkorderid (100%)
ordercategoryname -> ordercategoryname (100%)
secondaryordercategoryname -> ordercategoryname (79%)
ordercategorydescription -> ordercategorydescription (100%)
isopenbag -> isopenbag (100%)
continueinnextdept -> continueinnextdept (100%)
cancelreason -> locationcategory (43%)
statusdescription -> statusdescription (100%)
comments_editedby -> patientweight (40%)
comments_canceledby -> continueinnextdept (43%)
comments_date -> locationcategory (41%)
Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)

  df_mimic_3 = pd.read_csv(row[1])


Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
icustay_id -> stay_id (82%)
charttime -> starttime (78%)
itemid -> itemid (100%)
amount -> amount (100%)
amountuom -> amountuom (100%)
rate -> rate (100%)
rateuom -> rateuom (100%)
storetime -> storetime (100%)
cgid -> caregiver_id (50%)
orderid -> orderid (100%)
linkorderid -> linkorderid (100%)
stopped -> storetime (50%)
newbottle -> starttime (33%)
originalamount -> originalamount (100%)
originalamountuom -> originalamount (90%)
originalroute -> originalrate (88%)
originalrate -> originalrate (100%)
originalrateuom -> originalrate (89%)
originalsite -> originalrate (83%)
Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
admittime -> admittime (100%)
dischtime -> dischtime (100%)
deathtime -> deathtime (100%)
admission_type -> admission_type (100%)
admission_location -> admission_location (100%)
discharge_loc

  df_mimic_3 = pd.read_csv(row[1])


Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
icustay_id -> stay_id (82%)
itemid -> itemid (100%)
charttime -> charttime (100%)
storetime -> storetime (100%)
cgid -> caregiver_id (50%)
value -> value (100%)
valuenum -> valuenum (100%)
valueuom -> valueuom (100%)
error -> valueuom (31%)
resultstatus -> stay_id (32%)
stopped -> storetime (50%)
Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> transfer_id (47%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
icustay_id -> subject_id (50%)
dbsource -> outtime (40%)
eventtype -> eventtype (100%)
prev_careunit -> careunit (76%)
curr_careunit -> careunit (76%)
prev_wardid -> transfer_id (45%)
curr_wardid -> transfer_id (45%)
intime -> intime (100%)
outtime -> outtime (100%)
los -> outtime (20%)
Unnamed: 0 -> Unnamed: 0 (100%)
row_id -> hadm_id (46%)
subject_id -> subject_id (100%)
hadm_id -> hadm_id (100%)
seq_num -> seq_num (100%)
icd9_code -> icd_code (94%)
Unnamed: 0 -> 

In [11]:
df_table_matches

Unnamed: 0,table_mimic_3,filepath_mimic3,table_mimic_4,filepath_mimic4,confidence
0,PROCEDUREEVENTS_MV,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,procedureevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...,86
1,CALLOUT,../basic_filtered_data/mimic-iii-demo/CALLOUT.csv,outputevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/o...,49
2,D_CPT,../basic_filtered_data/mimic-iii-demo/D_CPT.csv,d_hcpcs.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,72
3,D_ITEMS,../basic_filtered_data/mimic-iii-demo/D_ITEMS.csv,d_items.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/d...,90
4,CAREGIVERS,../basic_filtered_data/mimic-iii-demo/CAREGIVE...,chartevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/c...,56
5,MICROBIOLOGYEVENTS,../basic_filtered_data/mimic-iii-demo/MICROBIO...,microbiologyevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
6,LABEVENTS,../basic_filtered_data/mimic-iii-demo/LABEVENT...,labevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
7,INPUTEVENTS_CV,../basic_filtered_data/mimic-iii-demo/INPUTEVE...,inputevents.csv,../basic_filtered_data/mimic-iv-demo/2.2/icu/i...,90
8,ADMISSIONS,../basic_filtered_data/mimic-iii-demo/ADMISSIO...,admissions.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95
9,D_LABITEMS,../basic_filtered_data/mimic-iii-demo/D_LABITE...,d_labitems.csv,../basic_filtered_data/mimic-iv-demo/2.2/hosp/...,95


Now we need to find out a way to perform the column match.
1. read in a table in mimic 3 and in mimic 4
2. calculate the score
3. output into the following format

|{filename_3}|{filename_4}|column_mimic3|column_mimic4|datatype|score|required|
|----|---|---|---|---|---|---|
|-|-|-|-|-|-|-|

In [17]:
def find_best_matches(list_a, list_b):
    """
    input two list, output fuzzy comparison between 

    :param _type_ list_a: _description_
    :param _type_ list_b: _description_
    :return _type_: _description_
    """
    # Initialize empty lists for the output
    a_matches = []
    b_matches = []
    scores = []

    # Iterate through each item in list A
    for a in list_a:
        # Use fuzzywuzzy to find the best match for the current item in list B, comparing lowercase strings
        matches = process.extract(a.lower(), [b.lower() for b in list_b], limit=1)
        best_match = matches[0][0]
        score = matches[0][1]

        # Add the current item in list A and its best match and score to the output lists
        a_matches.append(a)
        b_matches.append(best_match)
        scores.append(score)

    # Create a dataframe from the output lists
    # df = pd.DataFrame({'List A': a_matches, 'List B Match': b_matches, 'Match Score': scores})
    return a_matches, b_matches, scores
    

In [19]:
mimic_3

{'PROCEDUREEVENTS_MV': {'Unnamed: 0': 'int',
  'row_id': 'int',
  'subject_id': 'int',
  'hadm_id': 'int',
  'icustay_id': 'int',
  'starttime': 'datetime',
  'endtime': 'datetime',
  'itemid': 'datetime',
  'value': 'int',
  'valueuom': 'string',
  'location': 'string',
  'locationcategory': 'string',
  'storetime': 'datetime',
  'cgid': 'int',
  'orderid': 'int',
  'linkorderid': 'int',
  'ordercategoryname': 'string',
  'secondaryordercategoryname': None,
  'ordercategorydescription': 'string',
  'isopenbag': 'int',
  'continueinnextdept': 'int',
  'cancelreason': 'int',
  'statusdescription': 'string',
  'comments_editedby': 'string',
  'comments_canceledby': 'string',
  'comments_date': 'datetime',
  'filepath': '../basic_filtered_data/mimic-iii-demo/PROCEDUREEVENTS_MV.csv'},
 'CALLOUT': {'Unnamed: 0': 'int',
  'row_id': 'int',
  'subject_id': 'int',
  'hadm_id': 'int',
  'submit_wardid': 'int',
  'submit_careunit': 'string',
  'curr_wardid': 'int',
  'curr_careunit': 'string',
  

In [26]:
def get_columns(tables_mimic3,tables_mimic4,meta_mimic3,meta_mimic4,threshold = 80):

    table_match = dict(zip(tables_mimic3,tables_mimic4))
    
    df_result = pd.DataFrame()
    
    # unpack the table name in mimic 3 and 4 for comparison
    for table_mimic3,table_mimic4 in table_match.items():
        # print(list(meta_mimic3[table_mimic3].keys()))
        columns_mimic3,columns_mimic4,scores = find_best_matches(list(meta_mimic3[table_mimic3].keys()),
                                  list(meta_mimic4[table_mimic4].keys()))
        
        # find the datatype from mimic3
        datatype_m3 = [meta_mimic3[table_mimic3][column] for column in columns_mimic3]
        
        df = pd.DataFrame(
            {
                "mimic3" : [table_mimic3 for i in range(len(columns_mimic3))],
                "mimic4" : [table_mimic4 for i in range(len(columns_mimic4))],
                "column_mimic3" : columns_mimic3,
                "column_mimic4_candidate_1" : columns_mimic4,
                "candidate_1_scores" : scores,
                "dtype_mimic3": datatype_m3,
                "mimic3_filepath": [meta_mimic3[table_mimic3]["filepath"] for i in range(len(columns_mimic3))],
                "mimic4_filepath": [meta_mimic4[table_mimic4]["filepath"] for i in range(len(columns_mimic4))],   
            }
        )
        
        
        
        df_result = pd.concat([df_result,df],axis = 0)
    return df_result

df = get_columns(df_table_matches["table_mimic_3"].to_list(),
            df_table_matches["table_mimic_4"].to_list(),
            mimic_3,
            mimic_4)
df.head(30)

Unnamed: 0,mimic3,mimic4,column_mimic3,column_mimic4_candidate_1,candidate_1_scores,dtype_mimic3,mimic3_filepath,mimic4_filepath
0,PROCEDUREEVENTS_MV,procedureevents.csv,Unnamed: 0,unnamed: 0,100,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
1,PROCEDUREEVENTS_MV,procedureevents.csv,row_id,caregiver_id,60,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
2,PROCEDUREEVENTS_MV,procedureevents.csv,subject_id,subject_id,100,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
3,PROCEDUREEVENTS_MV,procedureevents.csv,hadm_id,hadm_id,100,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
4,PROCEDUREEVENTS_MV,procedureevents.csv,icustay_id,stay_id,82,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
5,PROCEDUREEVENTS_MV,procedureevents.csv,starttime,starttime,100,datetime,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
6,PROCEDUREEVENTS_MV,procedureevents.csv,endtime,endtime,100,datetime,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
7,PROCEDUREEVENTS_MV,procedureevents.csv,itemid,itemid,100,datetime,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
8,PROCEDUREEVENTS_MV,procedureevents.csv,value,value,100,int,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...
9,PROCEDUREEVENTS_MV,procedureevents.csv,valueuom,valueuom,100,string,../basic_filtered_data/mimic-iii-demo/PROCEDUR...,../basic_filtered_data/mimic-iv-demo/2.2/icu/p...


In [27]:
df.to_csv("./output/table_matches_require_manual.csv")

In [28]:
check = pd.read_csv("../basic_filtered_data/mimic-iv-demo/2.2/hosp/transfers.csv.gz")

In [29]:
check

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
0,0,10009049,22995465.0,30030230,discharge,,2174-05-31 14:21:47,
1,1,10025612,23403708.0,32533329,discharge,,2125-10-03 12:25:27,
2,2,10020786,23488445.0,37922399,discharge,,2189-06-13 17:25:44,
3,3,10014078,25809882.0,34694622,discharge,,2166-08-26 14:49:42,
4,4,10039831,26924951.0,37155928,discharge,,2116-01-02 14:35:02,
...,...,...,...,...,...,...,...,...
1185,1185,10021118,24490144.0,31983963,transfer,PACU,2161-11-19 07:16:15,2161-11-19 10:04:04
1186,1186,10021118,24490144.0,38367109,transfer,Cardiac Surgery,2161-11-20 21:45:42,2161-11-23 16:06:31
1187,1187,10021118,24490144.0,39362807,transfer,Medicine/Cardiology,2161-11-16 19:53:20,2161-11-19 07:16:15
1188,1188,10021118,24490144.0,38425947,admit,Medicine/Cardiology,2161-11-15 20:10:55,2161-11-16 19:53:20
