# **Medication Correction**

## Step 1 - Medication Name Standardization and Preprocessing

by Chen Chen (c.chen2@wustl.edu); David Brown (browndavid@wustl.edu)
20250721

# Import Libraries

In [None]:
# General
import datetime
import os
from timeit import default_timer as timer

# Data visualization
import matplotlib.pyplot as plt

# Data handling
import numpy as np
import pandas as pd

# Drug recognition
from drug_named_entity_recognition import find_drugs

# Useful
import ast
import re
import string

In [None]:
# Overall execution duration
analysis_start = timer()

# Input data

In [None]:
# REDCap medication-name fields: med_name_1 ... med_name_20
names = ['med_name_' + str(slot) for slot in range(1, 21)]

In [None]:
%%time
# Every input file is read from (and the result is later written to) the working directory
fp_cwd = os.getcwd() + "/"

# REDCap export, Jason/Kebade's drug dictionary, the AHFS tables and the manual matches
input_data = pd.read_csv(
    fp_cwd + "MotherDatabaseFULL10-WIPMedicationsFullDA_DATA_2025-02-03_1317.csv",
    low_memory=False,
)
input_meds_dic = pd.read_csv(fp_cwd + "1107drugDictionary.csv", low_memory=False)
input_ahfs = pd.read_csv(fp_cwd + "AHFS.csv", low_memory=False)
input_ahfs_un = pd.read_csv(fp_cwd + "AHFS_drug.csv", low_memory=False)
manual_df = pd.read_csv(fp_cwd + "manual_matched.csv", low_memory=False)

# OTC drugs (ARCHES); the 'value_x' column is renamed to 'Name' so both lists share a column
arches_otc = pd.read_csv(
    fp_cwd + "ARCHES_OTC_drugs.csv", low_memory=False
).rename(columns={'value_x': 'Name'})

# Vitamins and supplements extracted manually (DRIVES)
drives_vit_sup = pd.read_csv(
    fp_cwd + "vitamins_supplements_manual_extract.csv", low_memory=False
)

# Stack both lists, normalise the names, and de-duplicate again because
# lower-casing / trimming can make previously distinct rows identical
otc_vit_sup = pd.concat([arches_otc, drives_vit_sup], axis=0).drop_duplicates()
otc_vit_sup['Name'] = otc_vit_sup['Name'].str.lower().str.strip()
otc_vit_sup = otc_vit_sup.drop_duplicates()

for loaded_frame in (drives_vit_sup, arches_otc, otc_vit_sup):
    print(loaded_frame.shape)

## Enforce REDCap dtypes

In [None]:
# Coerce dtypes: four bookkeeping columns get an explicit dtype,
# every other REDCap column is stored as pandas 'string'
explicit_dtypes = {
    'map_id': 'int64',
    'nmedications': 'Int64',
    'medications_complete': 'Int64',
    'otdate': 'string',
}

for col_name in list(input_data.columns):
    if col_name in explicit_dtypes:
        continue
    input_data[col_name] = input_data[col_name].astype('string')

for col_name, col_dtype in explicit_dtypes.items():
    input_data[col_name] = input_data[col_name].astype(col_dtype)

In [None]:
# Make a copy of medications input
df_med = input_data.copy()

In [None]:
# Make a real copy of input_meds_dic.
# pd.DataFrame(some_frame) does not copy its input by default, so cleaning the
# "copy" would also rewrite input_meds_dic; .copy() keeps the raw input intact.
meds_dic = input_meds_dic.copy()

# Clean expert dictionary:
# when a mapped label ends with 'd' followed by digits, remove that suffix and
# trim the remainder. Labels without the suffix (and missing labels) are left
# untouched. The boolean mask replaces the positional .loc[i] loop, so the
# cleaning no longer depends on a 0..n-1 index or crashes on NaN labels.
_mapped_labels = meds_dic['mapped_drug_label']
_has_id_suffix = _mapped_labels.str.contains(r'd\d+$', regex=True, na=False)
meds_dic.loc[_has_id_suffix, 'mapped_drug_label'] = (
    _mapped_labels[_has_id_suffix]
    .str.replace(r'd\d+$', '', regex=True)
    .str.strip()
)

In [None]:
# Force the manual dataframe to use otdate:
# drop the old 'RedCapMedName' column and left-join the manual matches onto
# df_med_lab by participant, REDCap event and medication slot number.
# NOTE(review): `df_med_lab` is not defined anywhere in the visible notebook, so
# this cell raises NameError as written. It presumably is a long-format
# medication table carrying 'map_id', 'redcap_event_name', 'Number' and
# 'otdate' — TODO confirm and restore the cell that builds it.
manual_df = manual_df.drop(['RedCapMedName'], axis=1).merge(
    df_med_lab,
    how='left',
    on=['map_id', 'redcap_event_name', 'Number']
)

# Sanity check: count missing values per column after the join
print(manual_df.isna().sum())

In [None]:
# Numbers of medications in dataframe
print('Both', df_med.shape)

In [None]:
df_med[['MedName']].nunique()

In [None]:
print(len(meds_dic['raw_drug_label'])) # 2486
print(len(list(set(meds_dic['raw_drug_label'].values.tolist())))) # 2474

In [None]:
# Expert lookup: raw drug label -> mapped drug label, built from the
# de-duplicated dictionary rows. If a raw label still occurs more than once,
# the mapping from its last row wins (plain dict semantics).
_unique_dic_rows = meds_dic.drop_duplicates()
mydict = dict(zip(_unique_dic_rows.raw_drug_label, _unique_dic_rows.mapped_drug_label))

# Functions

## clean_format

In [None]:
# Lower-case and trim the medication names, then remove punctuation marks
def clean_format(df):
    """Add normalised medication-name columns to ``df`` and return it.

    ``med_lower`` is ``MedName`` lower-cased with surrounding whitespace
    removed. ``Reversed_Med`` is ``med_lower`` with every character listed in
    ``string.punctuation`` deleted. The frame is modified in place and the
    same object is returned.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = df['MedName'].str.lower().str.strip()
    df['med_lower'] = lowered
    df['Reversed_Med'] = lowered.str.translate(drop_punctuation)
    return df

## Absolute match: db_my_matching_drug

In [None]:
# matching drugs through general (non-fuzzy, case-insensitive) matching
def _collect_match_field(matches, key=None):
    """Return one value per match, or ``[np.nan]`` when nothing matched.

    ``matches`` is the list returned by ``find_drugs()``; the first element of
    each tuple is the match dictionary. With ``key=None`` the dictionaries
    themselves are returned; otherwise ``dictionary[key]`` is returned, with
    ``np.nan`` standing in when the key is absent.
    """
    if len(matches) < 1:
        return [np.nan]
    if key is None:
        return [match[0] for match in matches]
    return [match[0][key] if key in match[0] else np.nan for match in matches]


def db_my_matching_drug(some_df, col):
    """
    Run find_drugs() on every value of ``col`` and attach the matches.

    20241031 db
    NOTE: find_drugs() returns a list of tuples, each tuple contains a dictionary, a token index, and another token index
    [ (dict, int, int), (dict, int, int), (dict, int, int), ... ]
    https://github.com/fastdatascience/drug_named_entity_recognition/blob/main/drug_named_entity_recognition_example_walkthrough.ipynb
    https://github.com/fastdatascience/drug_named_entity_recognition/blob/c227765307a2e721ec5c7fa830aff37a1ebddb11/src/drug_named_entity_recognition/drugs_finder.py#L227
    NOTE: as of 20241031, line 227 != line 239; potentially missing + 1

    find_drugs() always returns a list

    non-matchs are an empty list == list of length 0

    Parameters
    ----------
    some_df : pandas.DataFrame
        Input frame; it is copied, never modified.
    col : str
        Column holding the text to match. String values are split on single
        spaces into tokens; any non-string value is matched as an empty
        token list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``some_df`` with four list-valued columns, one list per row:
        ``Results`` (match dictionaries), ``DrugName`` (``'name'``),
        ``DrugbankId`` (``'drugbank_id'``) and ``Similarity``
        (``'match_similarity'``). A row without matches gets ``[np.nan]`` in
        each of them.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``some_df``.
    """
    # Copy for safety
    df = some_df.copy()

    # One find_drugs() call per row, in row order
    list_results = [
        find_drugs(value.split(" ") if isinstance(value, str) else [], is_ignore_case=True)
        for value in df[col]
    ]

    # Full match dictionaries
    df['Results'] = [_collect_match_field(matches) for matches in list_results]

    # All drug names for all matches, if they exist
    df['DrugName'] = [_collect_match_field(matches, 'name') for matches in list_results]

    # All DrugBank IDs for all matches, if they exist
    df['DrugbankId'] = [_collect_match_field(matches, 'drugbank_id') for matches in list_results]

    # All match similarities for all matches, if they exist
    df['Similarity'] = [_collect_match_field(matches, 'match_similarity') for matches in list_results]

    return df

## Fuzzy match: my_fuzzy_match

In [None]:
def my_fuzzy_match(df, some_col):
    """Run fuzzy find_drugs() matching on every value of ``some_col``.

    String values are split on single spaces into tokens; any non-string
    value is matched as an empty token list. A row for which find_drugs()
    returns nothing, or raises, receives one placeholder match whose
    ``name``, ``match_similarity`` and ``drugbank_id`` are all ``np.nan``.
    Matching is best-effort: exceptions do not abort the run, but the number
    of failed rows and the first error are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    some_col : str
        Column holding the text to match.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``fzy_Results`` (raw result list per row) and the
        list-valued columns ``DrugbankId``, ``DrugName`` and ``Similarity``.
    """
    df = df.copy()  # Avoid modifying the original DataFrame
    list_results, error_log_fzy = [], []

    def _no_match():
        # Same keys as a real match dictionary, so the extraction below reads
        # placeholders and real matches alike. A fresh object per row avoids
        # sharing one mutable dict between rows.
        return [({'name': np.nan, 'match_similarity': np.nan, 'drugbank_id': np.nan}, np.nan, np.nan)]

    # Process each row
    for i, row in df.iterrows():
        tokens = row[some_col].split(" ") if isinstance(row[some_col], str) else []

        try:
            # Get results from find_drugs or use the placeholder if empty
            result = find_drugs(tokens, is_ignore_case=True, is_fuzzy_match=True) or _no_match()
        except Exception as e:  # deliberate best-effort: one bad row must not stop the run
            error_log_fzy.append(f"Row {i}: {e}")
            result = _no_match()
        list_results.append(result)

    # Surface the failures instead of dropping them silently
    if error_log_fzy:
        print(f"my_fuzzy_match: {len(error_log_fzy)} row(s) raised in find_drugs(); first: {error_log_fzy[0]}")

    # Extract fields from results
    df['fzy_Results'] = list_results
    df['DrugbankId'] = df['fzy_Results'].apply(lambda x: [item[0].get('drugbank_id', np.nan) for item in x])
    df['DrugName'] = df['fzy_Results'].apply(lambda x: [item[0].get('name', np.nan) for item in x])
    df['Similarity'] = df['fzy_Results'].apply(lambda x: [item[0].get('match_similarity', np.nan) for item in x])

    return df

## maskForValuesInColumnList

In [None]:
def maskForValuesInColumnList( some_df, some_col, some_val ):
    """
    https://stackoverflow.com/questions/41518920/how-to-query-if-a-list-type-column-contains-something

    Returns a dataframe where a value is found within a column of lists

    Parameters
    ----------
    some_df : pandas.DataFrame
        Frame to filter; it is not modified.
    some_col : str
        Column whose cells are lists (or other containers).
    some_val : object
        Value to look for. When it is NaN, a cell matches if any of its
        elements is a float NaN. A plain ``in`` test only finds NaN when the
        element is the very same object as ``some_val``, so NaNs created
        elsewhere (``float('nan')``, values parsed from files) would be missed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``some_df`` whose cell in ``some_col`` contains ``some_val``.
    """
    def _is_nan(value):
        # NaN is the only float that differs from itself
        return isinstance(value, (float, np.floating)) and value != value

    if _is_nan(some_val):
        def _cell_contains(cell):
            return any(_is_nan(item) for item in cell)
    else:
        def _cell_contains(cell):
            return some_val in cell

    # astype(bool) keeps the mask boolean even for an empty frame, where
    # apply() returns an empty object Series
    this_mask = some_df[some_col].apply(_cell_contains).astype(bool)
    return( some_df[this_mask] )

## lower_threshold

In [None]:
def lower_threshold(some_df, some_col, threshold_value):
    """Return the rows whose list in ``some_col`` holds a value <= ``threshold_value``.

    A row is kept as soon as one element of its list is at or below the
    threshold. NaN elements never satisfy the comparison.
    """
    def _has_low_value(values):
        for value in values:
            if value <= threshold_value:
                return True
        return False

    return some_df[some_df[some_col].apply(_has_low_value)]

## clean_parenthesis

In [None]:
def clean_parenthesis(df, col):
    """Strip every parenthesised chunk from the strings in ``df[col]``.

    Each "(...)" group is removed together with the whitespace directly in
    front of it. Non-string cells are left as they are. ``df`` is modified in
    place and returned.
    """
    parenthesised = re.compile(r'\s*\(.*?\)')

    def _strip_groups(text):
        if not isinstance(text, str):
            return text
        return parenthesised.sub('', text)

    df[col] = df[col].apply(_strip_groups)
    return df

## find_unique

In [None]:
def find_unique (df, col):
    """Return a copy of ``df`` with an extra column ``col + "set"``.

    The new column holds, for every row, the distinct elements of the list in
    ``col`` (as a list, in set order). The input frame is left untouched.
    """
    def _distinct(values):
        return list(set(values))

    result = df.copy()
    result[col + "set"] = result[col].apply(_distinct)
    return result

## convert_to_list

In [None]:
def convert_to_list(df, col):
    """Normalise ``df[col]`` so list-like cells are de-duplicated lists.

    Three passes, applied in this order:
      1. string cells are parsed with ``ast.literal_eval``;
      2. list cells are reduced to their distinct elements;
      3. ``None`` and float-NaN cells become ``[np.nan]``.
    ``df`` is modified in place and returned.
    """
    def _parse_text(cell):
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    def _dedupe(cell):
        return list(set(cell)) if isinstance(cell, list) else cell

    def _fill_missing(cell):
        is_missing = cell is None or (isinstance(cell, float) and pd.isnull(cell))
        return [np.nan] if is_missing else cell

    for one_pass in (_parse_text, _dedupe, _fill_missing):
        df[col] = df[col].apply(one_pass)

    return df

## clean_list

In [None]:
def clean_list(df, col):
    """Collapse each list in ``df[col]`` to a single value.

    String cells are first parsed with ``ast.literal_eval``. A list cell is
    then replaced by its first element; an empty list becomes ``np.nan``
    instead of raising IndexError. Taking the first element, rather than an
    arbitrary member of ``set(x)``, gives the same answer on every run.
    Cells that are not lists after parsing are kept unchanged.

    ``df`` is modified in place and returned.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    def _first_item(cell):
        # Convert string to actual list
        if isinstance(cell, str):
            cell = ast.literal_eval(cell)
        if isinstance(cell, list):
            return cell[0] if cell else np.nan
        return cell

    df[col] = df[col].apply(_first_item)

    return df

# Apply functions for Preprocessing and Matching

In [None]:
# clean the format in redcap medication names
df_med = clean_format(df_med)

In [None]:
# Remove vitamins, supplements and OTC products whose lower-cased name is on the curated list
df_filtered = df_med[~df_med['med_lower'].isin(otc_vit_sup['Name'])]

# Second pass: keyword screen on the punctuation-free name, to catch entries the list missed.
# 'Reversed_Med' has had string.punctuation removed, so a keyword containing a
# hyphen could never match; the hyphen in 'CoQ-10' and 'B-12' is optional here.
# NOTE(review): the keywords are unanchored substrings, so short ones such as
# 'vit', 'iron', 'oil', 'salt', 'sodium' or 'potassium' also drop any medication
# whose name merely contains them — confirm this breadth is intended.
otc_keyword_pattern = (
    r'vitamin|supplement|areds|calcium|fiber|fish oil|iron|magnesium|'
    r'drops|drop|potassium|complex|d3|insulin|inhaler|sodium|vit|salt|sennas|gummies|acides|diagnostic|device|'
    r'probiotics|probiotic|turmeric|tumeric|juice|centrum|herb|bioitin|omega|garlic|cinnamon|ginger|cranberry|nails|'
    r'melatonin|tums|cream|CoQ-?10|meatonin|biotin|Milk|Lutein|Peppermint oil|Red Yeast Rice|Levemir Flex Pen|'
    r'Fiasp Flex Touch Injection|ENERGIZE|Oil|B-?12|softener|Colace|Docusate|Q10|Enzyme|Glucosamine|wax'
)
df_med = df_filtered[~df_filtered['Reversed_Med'].str.contains(
    otc_keyword_pattern,
    case=False, na=False
)].drop_duplicates()

In [None]:
df_med[['med_lower']].nunique()

In [None]:
df_med.shape

In [None]:
# Use absolute match on experts' results 
matched_manual = db_my_matching_drug(manual_df,"Correct_Name")

## 1. Absolute Match
- Correct: 1 : 1 match
- Hold out: 1 : multiple matches for experts to check
- Find: not match

In [None]:
%%time
# Apply 'my_matching_drug' function
abs_df = db_my_matching_drug(df_med,"Reversed_Med").reset_index(drop = True)

In [None]:
abs_df.shape

### Absolute - Held out: 1 to multiple ids

In [None]:
# Number of matched drug names per row (length of the 'DrugName' list)
abs_df['numNames'] = abs_df['DrugName'].map(len)

# Rows where the matcher returned more than one name
abs_mtp = abs_df[ abs_df['numNames'] > 1 ]

# Held out = rows whose matches map to more than one distinct DrugBank ID;
# rows whose several names all share one ID are not held out
abs_mtp_set = find_unique(abs_mtp, 'DrugbankId')
abs_h = abs_mtp_set[abs_mtp_set['DrugbankIdset'].apply(lambda x: len(x) > 1)]

print(abs_h.shape)

# Check counts: distinct med_lower values per (map_id, otdate, Number), summed over all groups
abs_h_counts = abs_h.groupby(['map_id', 'otdate', 'Number'])['med_lower'].nunique()
print(abs_h_counts.sum())

In [None]:
abs_mtp[['med_lower']].count()

In [None]:
# Merge absolute held out with matched result
merged_abs_h = pd.merge(
    abs_h, #.drop(columns=['DrugName', 'DrugbankId']),
    matched_manual[['map_id', 'otdate', 'MedName', "Correct_Name"]],
    how='left',
    on=['map_id', 'otdate','MedName'],
    indicator=True
)
# matched_abs_h with human matches
matched_abs_h = merged_abs_h[merged_abs_h['_merge'] == 'both']

# Filter for rows in abs_h that haven't been matched in matched_manual
unmatched_abs_h = merged_abs_h[merged_abs_h['_merge'] == 'left_only']
print(matched_abs_h.shape)

# Drop the merge indicator column if not needed
unmatched_abs_h = unmatched_abs_h.drop(columns=['_merge'])
print(unmatched_abs_h.shape)

# Check counts of unique redcapname
matched_abs_h_counts = matched_abs_h.groupby(['map_id', 'otdate', 'Number'])['MedName'].nunique()
print(matched_abs_h_counts.sum())

### [Export unmatched_abs_h]

### Absolute - Find: no match

In [None]:
# Missing DrugBankIDs
# Subset to maskForValuesInColumnList( DATAFRAMEHERE, 'DrugName', np.nan )
abs_no_match = maskForValuesInColumnList( abs_df, 'DrugbankId', np.nan )
abs_no_match = abs_no_match.drop(abs_h.index, errors='ignore')

abs_no_match.shape

### Absolute [Correct] = abs_df - hold out - no match
- [union](https://pandas.pydata.org/docs/reference/api/pandas.Index.union.html)

In [None]:
# Combine the indexes from abs_h and abs_no_match
combined_index = abs_h.index.union(abs_no_match.index)

# Filter out the rows with combined_index
good_abs = abs_df[~abs_df.index.isin(combined_index)]

print(good_abs.shape) 

In [None]:
# Sanity check
common_index = abs_h.index.intersection(abs_no_match.index)
print(common_index)

In [None]:
total_rows = good_abs.shape[0] + len(combined_index)
print("Total from parts:", total_rows)
print("Original abs_df rows:", abs_df.shape[0])
print("Check match:", total_rows == abs_df.shape[0])

In [None]:
abs_df.shape[0] == good_abs.shape[0] + len(combined_index)

## 2. Fuzzy Match

- Correct: 1 to 1 match, similarity > 0.9
- Hold out: 1 to multiple matches
- Find: 
    - similarity <= 0.9
    - not match

In [None]:
%%time
# Apply 'my_fuzzy_match' function on the failed drugbankids
fz_df = my_fuzzy_match(abs_no_match, "Reversed_Med")  
fz_df.shape

### Fuzzy - Held out: 1 to multiple matches

In [None]:
# Calculate the number of 'DrugName'
fz_df['numNames'] = fz_df['DrugName'].map(len)

# Capture drugs with matched multiple names
fz_mtp = fz_df[ fz_df['numNames'] > 1 ]

# Capture drugs without duplicate results
fz_mtp_set = find_unique(fz_mtp, 'DrugbankId')
fz_h = fz_mtp_set[fz_mtp_set['DrugbankIdset'].apply(lambda x: len(x) > 1)]

fz_h.shape

In [None]:
# Merge the fuzzy held-out rows with the manually matched result.
# Only 'MedName' is a join key, so 'map_id' and 'Number' (present on both
# sides) come back suffixed: *_x from fz_h, *_y from matched_manual.
merged_fz_h = pd.merge(
    fz_h, #.drop(columns=['DrugName', 'DrugbankId']),
    #matched_manual[['map_id', 'Number', 'RedCapMedName']],
    matched_manual[['map_id', 'Number', 'MedName']],
    how='left',
    #on=['RedCapMedName'],
    on=['MedName'],
    indicator=True
)
# matched_fz_h: held-out rows that have a human match
matched_fz_h = merged_fz_h[merged_fz_h['_merge'] == 'both']
print(matched_fz_h.shape)

# unmatched_fz_h: held-out rows without a human match
unmatched_fz_h = merged_fz_h[merged_fz_h['_merge'] == 'left_only']

# Drop the merge indicator column, which is no longer needed
unmatched_fz_h = unmatched_fz_h.drop(columns=['_merge'])
print(unmatched_fz_h.shape)

### [Export unmatched_fz_h]

### Fuzzy - Find:

- no match
    - similarity <= 0.9

In [None]:
# Find no matches if their DrugbankId is nan
fz_no_match = maskForValuesInColumnList( fz_df, 'DrugbankId', np.nan )
fz_no_match = fz_no_match.drop(fz_h.index, errors='ignore')

fz_no_match.shape

In [None]:
# Rows where at least one match has similarity <= 0.9
# NOTE: the variable is named "..._60" but the threshold actually applied is 0.9
fz_lower_60 = lower_threshold(fz_df, 'Similarity', 0.9) # Can be changed here depending on the study
# Held-out rows are handled separately, so take them out here
fz_lower_60 = fz_lower_60.drop(fz_h.index, errors='ignore')

fz_lower_60.shape

In [None]:
# Flag the low-similarity rows that are already in fz_no_match,
# print how many overlap, and keep only the ones that are not
is_in_fz_no_match = fz_lower_60.index.isin(fz_no_match.index)
print(is_in_fz_no_match.sum())
mask_fz_lower_60 = fz_lower_60[~is_in_fz_no_match]

In [None]:
# concat 
fz_left = pd.concat([mask_fz_lower_60, fz_no_match], axis=0)
fz_left.shape

### Fuzzy Match [Correct] = fz_df - held out - no match

In [None]:
# Combine the indexes from fz_h and fz_no_match and fz_lower_60
combined_index_fz = fz_h.index.union(fz_left.index)

# Filter out the rows with combined_index
good_fz = fz_df[~fz_df.index.isin(combined_index_fz)]

print(good_fz.shape) 

## 3. J/K dictionary

In [None]:
# Look up each lower-cased medication name in the expert dictionary;
# names missing from the dictionary get NaN in 'expert'
fz_left['expert'] = fz_left['med_lower'].map(mydict)
# clean_parenthesis edits the frame it is given and returns it, so jk_df and
# fz_left are the same object after this call
jk_df = clean_parenthesis(fz_left, 'expert')

print(jk_df.shape)

### Absolute Match on the 'expert' column (Jason/Kebade dictionary)

In [None]:
#Run same absolute match function on Jason column in the subset subset
abs_j = db_my_matching_drug(jk_df, "expert") 

In [None]:
# subset again
expert_no_match = maskForValuesInColumnList( abs_j, 'DrugbankId', np.nan )
expert_no_match.shape

In [None]:
# Calculate the number of 'DrugName'
abs_j['numNames'] = abs_j['DrugName'].map(len)

# Capture drugs with matched multiple names
abs_mtp_j = abs_j[ abs_j['numNames'] > 1 ]

# Capture drugs with duplicate results
abs_mtp_j_set = find_unique(abs_mtp_j, 'DrugbankId')
abs_j_h = abs_mtp_j_set[abs_mtp_j_set['DrugbankIdset'].apply(lambda x: len(x) > 1)]

abs_mtp_j_set.shape

In [None]:
good_expert = abs_j[~abs_j.index.isin(expert_no_match.index)]

print(good_expert.shape)

### Fuzzy Match on the 'expert' column (Jason/Kebade dictionary) for the no-matches

In [None]:
%%time
# Apply 'my_fuzzy_match' function on the failed drugbankids
fz_j = my_fuzzy_match(expert_no_match, "expert")  
fz_j.shape

### Jason - Hold out: 1 to multiple matches

In [None]:
# Calculate the number of 'DrugName'
fz_j['numNames'] = fz_j['DrugName'].map(len)

# Capture drugs with matched multiple names
fz_mtp_j = fz_j[ fz_j['numNames'] > 1 ]

# Capture drugs without duplicate results
fz_mtp_j_set = find_unique(fz_mtp_j, 'DrugbankId')
fz_j_h = fz_mtp_j_set[fz_mtp_j_set['DrugbankIdset'].apply(lambda x: len(x) > 1)]

fz_j_h.shape

### expert - Find: 
 - no matches
 - similarity <= 0.9

In [None]:
j_lower_60 = lower_threshold(fz_j, 'Similarity', 0.9)
j_lower_60.shape

In [None]:
# Expert-column fuzzy results that still have no DrugBank ID
fz_j_no_match = maskForValuesInColumnList( fz_j, 'DrugbankId', np.nan )
fz_j_no_match.shape

# NOTE(review): the next two statements repeat the good_fz computation from the
# "Fuzzy Match [Correct]" cell verbatim — presumably a stray paste; confirm
# before removing.
combined_index_fz = fz_h.index.union(fz_left.index)

# Filter out the rows with combined_index
good_fz = fz_df[~fz_df.index.isin(combined_index_fz)]

### Fuzzy expert - Correct = fz_j - hold out - no matches

In [None]:
# Combine the indexes from fz_j_no_match and j_lower_60
combined_index_fz_j = fz_j_no_match.index.union(j_lower_60.index).union(fz_j_h.index)

# Filter out the rows with combined_index
good_fz_j = fz_j[~fz_j.index.isin(combined_index_fz_j)]

print(good_fz_j.shape) 

In [None]:
len(combined_index_fz_j)

## 4. Errors

In [None]:
# Rows that are still unresolved after the expert-dictionary pass, selected by index label.
# NOTE(review): they are read from fz_df, not fz_j, so they carry the first
# fuzzy-match columns and no 'expert' column — confirm this is intended.
errors = fz_df.loc[combined_index_fz_j]

In [None]:
errors.shape

In [None]:
# Merge human matched 'error's to exist 'errors'
merged_matched_er = pd.merge(
    errors.drop(columns=['DrugName', 'DrugbankId']),
    matched_manual[['map_id', 'otdate', 'Number', 'MedName', 'DrugName', 'DrugbankId', 'Correct_Name', 'CorrectDrugbankId']],
    how='left',
    on=['map_id', 'otdate', 'Number', 'MedName']
)

In [None]:
merged_matched_er.shape

In [None]:
# Apply function
merged_matched_er = convert_to_list(merged_matched_er, 'DrugbankId')
merged_matched_er = convert_to_list(merged_matched_er, 'DrugName')

### Corrected errors(corrected_error1)

In [None]:
# Those missing DrugName in errors
# Find those mismatches Drug Names with manually matched names
na_names = maskForValuesInColumnList(merged_matched_er, 'DrugName', np.nan )
na_names.shape

In [None]:
# Those corrected DrugName in 'errors'
corrected_error = merged_matched_er[~merged_matched_er.index.isin(na_names.index)]
corrected_error.shape

### left to correct errors

In [None]:
matched_left_error = na_names[['map_id', 'otdate', 'Number', 'MedName', 'Correct_Name', 'CorrectDrugbankId']]

# Only use REDCapMedName to merge
left_na_names_in_error_correct = pd.merge(
    matched_left_error[['map_id', 'otdate', 'Number', 'MedName']],
    matched_manual[['MedName', 'Correct_Name', 'CorrectDrugbankId', 'DrugName', 'DrugbankId']].drop_duplicates(subset=['MedName']),
    how='left',
    on=['MedName']
)

# Apply function convert to list
left_na_names_in_error_correct = convert_to_list(left_na_names_in_error_correct, 'DrugbankId')
left_na_names_in_error_correct = convert_to_list(left_na_names_in_error_correct, 'DrugName')
left_na_names_in_error_correct.shape

In [None]:
# Extract those missing DrugName in errors that needs to be human identified
need_identify_drugs = maskForValuesInColumnList(left_na_names_in_error_correct, 'DrugName', np.nan )
need_identify_drugs.shape

In [None]:
# Those corrected 'errors'
corrected_left_error = left_na_names_in_error_correct[~left_na_names_in_error_correct.index.isin(need_identify_drugs.index)]
corrected_left_error.shape

In [None]:
left_na_names_in_error_correct.head()

## 5. Concatenate all the correct drugs

In [None]:
# Concatenate the four "good" dataframes into one, keeping their original index labels.
# The expert-dictionary matches are stored in good_expert; the name good_jason
# used here before was never defined and raised NameError.
combined_good = pd.concat([good_fz_j, good_expert, good_fz, good_abs], ignore_index=False)

In [None]:
combined_good.shape # previously (3236, 13)

In [None]:
# Convert strings to list and keep the unique element of DrugbankId and DrugName
combined_good = clean_list(combined_good, 'DrugbankId')
combined_good = clean_list(combined_good, 'DrugName')

In [None]:
# Drop unneeded variables
combined_good = combined_good.drop(columns = ['med_lower',
 'Reversed_Med',
 'Results',
 'Similarity',
 'numNames',
 'fzy_Results',
 'expert'])

In [None]:
combined_good.shape

## 6. Concatenate all held-out sets and errors

In [None]:
matched_abs_h = matched_abs_h.drop(columns = ['med_lower',
 'Reversed_Med',
 'Results',
 'Similarity',
 'numNames',
 'DrugbankIdset',
 'Correct_Name'])

In [None]:
matched_fz_h = matched_fz_h[['map_id_x', 'otdate', 'Number_x', 'MedName', 'DrugName', 'DrugbankId']]
matched_fz_h = matched_fz_h.rename(columns={'map_id_x': 'map_id', 'Number_x': 'Number'})

In [None]:
corrected_error = corrected_error[['map_id', 'otdate', 'Number', 'MedName', 'DrugName', 'DrugbankId']]

In [None]:
unmatched_abs_h = unmatched_abs_h[['map_id', 'otdate', 'Number', 'MedName', 'Correct_Name']]

unmatched_fz_h = unmatched_fz_h[['map_id_x', 'otdate', 'Number_x', 'MedName', 'DrugName', 'DrugbankId']]
unmatched_fz_h = unmatched_fz_h.rename(columns={'map_id_x': 'map_id', 'Number_x': 'Number'})

In [None]:
matched_abs_h.shape

In [None]:
matched_fz_h.shape

In [None]:
corrected_error.shape

In [None]:
matched_left_error.shape

In [None]:
# Stack every held-out and error subset into one frame with a fresh 0..n-1 index
matched_h_er1 = pd.concat([matched_abs_h, # 47
                           matched_fz_h, # 17
                           corrected_error, # 354
                           matched_left_error, #13
                           need_identify_drugs,#
                           unmatched_abs_h, #
                           unmatched_fz_h #
                          ], ignore_index=True)

# Convert strings to list and keep the unique element of DrugbankId and DrugName.
# The second call takes the result of the first, so both columns are cleaned
# whether or not clean_list edits its argument in place.
combined_h_er = clean_list(matched_h_er1, 'DrugbankId')
combined_h_er = clean_list(combined_h_er, 'DrugName')

matched_h_er1.shape

In [None]:
combined_h_er.isna().sum()

In [None]:
# Where DrugName is NaN, fall back to Correct_Name
combined_h_er['DrugName'] = combined_h_er['DrugName'].fillna(combined_h_er['Correct_Name'])

# Where DrugbankId is NaN, fall back to CorrectDrugbankId
combined_h_er['DrugbankId'] = combined_h_er['DrugbankId'].fillna(combined_h_er['CorrectDrugbankId'])

# The two fallback columns are no longer needed once their values are merged in
matched_h_er = combined_h_er.drop(columns = ['Correct_Name', 'CorrectDrugbankId'])

#matched_h_er.drop_duplicates(inplace=True)  # Drop duplicates in place
print(matched_h_er.shape)  # Shape after the fill (duplicates are not dropped here)

In [None]:
# concat matched_h_er and combined_good
tmp_result = pd.concat([matched_h_er, combined_good], ignore_index=True)
tmp_result['DrugbankId'] = tmp_result['DrugbankId'].str.split(',')
tmp_result = tmp_result.explode('DrugbankId', ignore_index=True)

In [None]:
tmp_result.shape

In [None]:
result = tmp_result[['map_id', 'otdate', 'Source', 'Number', 'MedName', 'DrugName', 'DrugbankId']].drop_duplicates()
result.shape

In [None]:
result.isna().sum()

In [None]:
missing = result[result['DrugName'].isna()]
missing

In [None]:
result = result.sort_values([
    'map_id',
    'otdate',
    'Source',
])

fp_result = datetime.date.today().strftime('%Y%m%d') + '_drugs_aft_DER.csv'
result.to_csv(fp_cwd + fp_result, index=False)

In [None]:
result.shape

In [None]:
result[['DrugName']].nunique()

In [None]:
result.isna().sum()

# Total Execution Time

In [None]:
analysis_end = timer()
print(datetime.timedelta(seconds=(analysis_end - analysis_start)))