# Data Exploration and Data Prep

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import csv

# Seed NumPy's legacy global random generator so that any random draws made
# through `np.random` later in the notebook are repeatable between runs.
RANDOM_SEED = 500
np.random.seed(RANDOM_SEED)

## Data Load MIMIC-III
Load the `PRESCRIPTIONS` and `NOTEEVENTS` CSV files into pandas DataFrames.

In [None]:
# Read both raw tables with every column typed as string (dtype=str), so
# identifier-like values are kept exactly as they are written in the files.
df_prescriptions, df_noteevents = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        '../data/real-mimic-iii-database/PRESCRIPTIONS.csv',
        '../data/real-mimic-iii-database/NOTEEVENTS.csv',
    )
)

In [None]:
# Preview the first five prescription rows.
df_prescriptions.head(n=5)

In [None]:
# Preview the first five note rows.
df_noteevents.head(n=5)

## Data Exploration

### Find most frequent prescriptions

In [None]:
# Count prescription rows per NDC, ignoring rows whose NDC is the string '0',
# and order the codes from most to least frequent. The result is indexed by
# NDC and its single `ROW_ID` column holds the row count.
has_nonzero_ndc = df_prescriptions['NDC'] != '0'
df_most_frequent_prescriptions = (
    df_prescriptions.loc[has_nonzero_ndc, ['ROW_ID', 'NDC']]
    .groupby('NDC')
    .count()
    .sort_values('ROW_ID', ascending=False)
)

In [None]:
# Keep only the 20 most frequent NDCs for now, and move NDC out of the index
# into a regular column so it can be selected by name in later cells.
# NOTE: this cell reassigns its own input; re-run the cell that builds the
# frequency table before re-running this one, otherwise `reset_index` will add
# an extra `index` column.
df_most_frequent_prescriptions = df_most_frequent_prescriptions.head(20).reset_index()
df_most_frequent_prescriptions

In [None]:
# Get drug names for the most frequent NDCs.
# The count column produced by the groupby is named after ROW_ID; call it FREQ.
df_most_frequent_prescriptions = df_most_frequent_prescriptions.rename(columns={'ROW_ID': 'FREQ'})
list_most_freq_prescription_ndcs = df_most_frequent_prescriptions['NDC'].to_list()

# Build one record per distinct (NDC, DRUG) pair. Despite its name, this
# variable is a list of {'NDC': ..., 'DRUG': ...} dicts.
# Grouping by DRUG and taking max(NDC) would keep only the largest NDC for each
# drug name, so any other frequent NDC sharing that name would lose it; keeping
# every distinct pair avoids that. Rows without a drug name are dropped, and
# the records are ordered by drug name for a stable, readable listing.
is_most_freq_ndc = df_prescriptions['NDC'].isin(list_most_freq_prescription_ndcs)
dict_most_freq_prescription_ndcs = (
    df_prescriptions.loc[is_most_freq_ndc, ['NDC', 'DRUG']]
    .dropna(subset=['DRUG'])
    .drop_duplicates()
    .sort_values(['DRUG', 'NDC'])
    .to_dict(orient='records')
)

In [None]:
# Map each NDC to the list of normalized drug names recorded for it.
ndc_to_drug_names = {}

for record in dict_most_freq_prescription_ndcs:
    drug, ndc = record['DRUG'], record['NDC']
    names_for_ndc = ndc_to_drug_names.setdefault(ndc, [])

    # Remove non alpha numeric characters and make lowercase
    drug = re.sub('[^A-Za-z0-9]+', '', drug).lower()

    # Different spellings can collapse to the same normalized string, so store
    # each one only once. Skip names that normalize to '' because an empty
    # string is a substring of every note and would match everything.
    if drug and drug not in names_for_ndc:
        names_for_ndc.append(drug)

ndc_to_drug_names


In [None]:
# Add a normalized copy of each note: strip every non-alphanumeric character
# and lowercase the rest, matching the normalization applied to drug names.
# The vectorized `.str` methods leave missing TEXT values as NaN, whereas
# calling `re.sub` on a missing value would raise a TypeError.
df_noteevents['TEXT_NORMALIZED'] = (
    df_noteevents['TEXT']
    .str.replace('[^A-Za-z0-9]+', '', regex=True)
    .str.lower()
)
df_noteevents.head()


In [None]:
# Persist the notes, including the TEXT_NORMALIZED column, with every field
# wrapped in double quotes.
# NOTE(review): the DataFrame index is written as a leading column because
# `index` is left at its default; confirm that downstream readers expect it.
df_noteevents.to_csv(
    path_or_buf='../data/processed/NOTEEVENTS.csv',
    quotechar='"',
    quoting=csv.QUOTE_ALL,
)

In [None]:
# Save new dataframe with `TEXT_NORMALIZED` - Done
# Loop over the NDC codes and find the `TEXT_NORMALIZED` values that contain their drug names
# Save the matches to a dataframe with ROW_ID, TEXT_NORMALIZED, NDC

Find all drug names for NDC `00338001702`

Discharge summary records contain medication lists in the `Medications on Admission` and `Medications on Transfer` sections. It might be better to take the 50 characters before and the 50 characters after each text match that falls outside of those sections.

In [None]:
# Keep the notes whose raw TEXT contains the literal phrase "5% Dextrose".
# `regex=False` matches the phrase as plain text rather than as a pattern, and
# `na=False` treats notes with missing TEXT as non-matching, which replaces the
# `== True` comparison on the mask.
df_noteevents_with_drug = df_noteevents[
    df_noteevents['TEXT'].str.contains("5% Dextrose", regex=False, na=False)
]

In [None]:
# Display the notes that matched the drug-name filter.
df_noteevents_with_drug

In [None]:
# Show the full raw text of the first matching note.
df_noteevents_with_drug['TEXT'].iloc[0]