# **Medication Correction**

## Step 3

by Chen Chen (c.chen2@wustl.edu); David Brown (browndavid@wustl.edu)

# Imports

In [246]:
# General
import datetime
import os
from timeit import default_timer as timer # Preferred across OS; see https://stackoverflow.com/a/25823885

# Data handling
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt

# Drug recognition
from drug_named_entity_recognition import find_drugs

# Useful
import ast
import re
import string

# Functions 

In [248]:
def clean_label(label):
    """Return ``label`` with parenthesised text and a trailing ``d<digits>`` suffix removed.

    Parameters
    ----------
    label : str
        Label text to clean.

    Returns
    -------
    str
        The cleaned label, stripped of leading/trailing whitespace.

    Notes
    -----
    * Every ``(...)`` group is removed (shortest match, so nested parentheses
      are not balanced). Whitespace on either side of a removed group is kept,
      which can leave a double space inside the label.
    * The suffix is only removed when it is preceded by a whitespace character
      and sits at the very end, e.g. ``"name d12"`` -> ``"name"``.
    """
    # Drop "(...)" groups first so a suffix that was followed by a parenthesised
    # note ends up at the end of the string.
    without_parens = re.sub(r'\(.*?\)', '', label)
    without_parens = without_parens.strip()

    # Drop a final whitespace + 'd' + digits token.
    without_suffix = re.sub(r'\sd\d+$', '', without_parens)
    return without_suffix.strip()

# Input data

In [251]:
# Current working directory with a trailing "/" so file names can be appended directly.
fp_cwd = os.getcwd() + "/"

# The input file name is prefixed with today's date (YYYYMMDD).
# NOTE(review): this means the file must have been written on the same calendar
# day the notebook is run; re-running on a later day will look for a different file.
run_date_prefix = datetime.date.today().strftime('%Y%m%d')
fp_full_matched_extend = run_date_prefix + '_full_matched_name_extend.csv'

# low_memory=False makes pandas read the file in one pass instead of in chunks.
input_drugs = pd.read_csv(
    fp_cwd + fp_full_matched_extend,
    low_memory=False,
)

## Clean some misspellings & remove vitamins/supplements/OTCs

In [252]:
# Mapping from formulation-/brand-qualified mapped names to the plain ingredient name.
# Passed to Series.replace on 'mapped_name', so a value is only rewritten when it
# equals one of these keys exactly.
replace_dict = {
    'fluoxetine Delayed Release Oral Capsule': 'fluoxetine',
    'paroxetine Oral Suspension [Paxil]':      'paroxetine',
    'trazodone Oral Capsule':                  'trazodone',
}

In [253]:
# Terms identifying vitamins, supplements, OTC products and other entries that
# are excluded from the medication table. Each term is used as a regular-expression
# alternative and matched case-insensitively anywhere in the name, so short terms
# such as 'Oil' or 'wax' also match longer names that contain them.
EXCLUDED_TERMS = [
    'CoQ-10', 'melatonin', 'biotin', 'Milk', 'Lutein', 'Peppermint oil',
    'Red Yeast Rice', 'Levemir Flex Pen', 'Fiasp Flex Touch Injection',
    'ENERGIZE', 'Oil', 'B-12', 'softener', 'Colace', 'Docusate', 'Q10',
    'Enzyme', 'Glucosamine', 'wax', 'Hydroxocobalamin', 'Zinc sulfate',
    'Chondroitin sulfates', 'hyaluronate', 'methylsulfonylmethane',
    'Lactobacillus acidophilus', 'Melatonin / tryptophan', 'L-glutamine',
    'lecithin', 'phenazopyridine', 'arginine Extended Release Oral Tablet',
]
# Single source of truth for the pattern, applied to both name columns below.
EXCLUSION_PATTERN = '|'.join(EXCLUDED_TERMS)

# A row is excluded when either the raw name ('MedName') or the mapped name
# ('mapped_name') matches. na=False treats a missing name as "no match", so rows
# with a missing name are kept.
is_excluded = (
    input_drugs['MedName'].str.contains(EXCLUSION_PATTERN, case=False, na=False)
    | input_drugs['mapped_name'].str.contains(EXCLUSION_PATTERN, case=False, na=False)
)

# Fully duplicated rows share the same mask values, so filtering once and
# de-duplicating once keeps the same rows as filtering/de-duplicating per column.
# .copy() makes df independent of input_drugs so later column assignments on df
# do not write into a filtered slice (SettingWithCopyWarning).
# NOTE(review): in the recorded run the row count and the number of distinct
# mapped names were unchanged by this filter (23446 rows, 446 names before and
# after) - confirm whether these products are already removed upstream.
df = input_drugs[~is_excluded].drop_duplicates().copy()

In [255]:
# Distinct non-missing 'mapped_name' values in the unfiltered input (baseline for the filter check).
input_drugs['mapped_name'].nunique()

446

In [256]:
# Distinct non-missing 'mapped_name' values after the exclusion filters; compare with the input count.
df['mapped_name'].nunique()

446

In [257]:
# (rows, columns) of the unfiltered input.
input_drugs.shape

(23446, 12)

In [258]:
# (rows, columns) after the exclusion filters and de-duplication; compare with the input shape.
df.shape

(23446, 12)

In [259]:
# 1. Rewrite the formulation-/brand-qualified mapped names listed in replace_dict
#    to their plain ingredient name (exact-value replacement). In the recorded
#    run this reduced the number of distinct mapped names from 446 to 444.
# 2. Add 'med_lower': the raw 'MedName' lower-cased with surrounding whitespace
#    stripped.
# assign() builds a new frame instead of writing columns into df in place, so
# this cannot raise SettingWithCopyWarning when df is a filtered slice of
# input_drugs.
df = df.assign(
    mapped_name=df['mapped_name'].replace(replace_dict),
    med_lower=df['MedName'].str.lower().str.strip(),
)

In [260]:
# Distinct non-missing 'mapped_name' values after applying replace_dict.
df['mapped_name'].nunique()

444

In [None]:
# Reshape from long to wide: one row per combination of the key columns below,
# one column per distinct 'AHFSClassText' value.
row_keys = ['map_id', 'redcap_event_name', 'Number', 'RedCapMedName', 'mapped_name']

# The aggregation ignores the 'AHFSClassNum' values themselves and returns 1 for
# every (row keys, class) combination that occurs in the data.
class_flags = df.pivot_table(
    index=row_keys,
    columns='AHFSClassText',
    values='AHFSClassNum',
    aggfunc=lambda group: 1,
)

# Combinations that never occur come out of the pivot as NaN; turn them into 0
# and move the key columns back from the index into ordinary columns.
# NOTE(review): df is rebound to the wide table here, so this cell cannot be
# re-run without first re-running the cells that build the long-format df.
df = class_flags.fillna(0).reset_index()

# NOTE(review): each key combination appears once after the pivot, so this
# de-duplication is not expected to remove rows - confirm before relying on it.
df_wide = df.drop_duplicates()