# Data Exploration and Data Prep

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import csv
import string
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources that the stop-word list and word_tokenize rely on.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Seed NumPy's global random number generator.
np.random.seed(500)

## Data Load MIMIC-III
Load the `PRESCRIPTIONS` and `NOTEEVENTS` CSVs into pandas DataFrames.

In [None]:
# Every column is read as text (dtype=str), so codes such as NDC keep any
# leading zeros instead of being parsed as numbers.
prescriptions_csv = '../data/real-mimic-iii-database/PRESCRIPTIONS.csv'
noteevents_csv = '../data/real-mimic-iii-database/NOTEEVENTS.csv'

df_prescriptions = pd.read_csv(prescriptions_csv, dtype=str)
df_noteevents = pd.read_csv(noteevents_csv, dtype=str)

## Data Exploration

### Find most frequent prescriptions

In [None]:
# Count prescription rows per NDC code, ignoring the placeholder code '0',
# and order the codes from most to least frequent.
has_real_ndc = df_prescriptions['NDC'] != '0'
df_most_frequent_prescriptions = (
    df_prescriptions.loc[has_real_ndc, ['ROW_ID', 'NDC']]
    .groupby('NDC')
    .count()
    .sort_values('ROW_ID', ascending=False)
)

In [None]:
# Keep just the 20 most frequent NDC codes for now, and turn the NDC index
# back into a regular column.
df_most_frequent_prescriptions = (
    df_most_frequent_prescriptions
    .head(20)
    .reset_index()
)

In [None]:
# Look up the drug names recorded against the most frequent NDC codes.
# After the groupby/count above, ROW_ID holds the row count, so call it FREQ.
df_most_frequent_prescriptions = df_most_frequent_prescriptions.rename(columns={'ROW_ID': 'FREQ'})
list_most_freq_prescription_ndcs = df_most_frequent_prescriptions['NDC'].to_list()

# One record per distinct DRUG string, paired with the largest NDC it appears
# under; despite its name the result is a list of {'DRUG': ..., 'NDC': ...} dicts.
is_top_ndc = df_prescriptions['NDC'].isin(list_most_freq_prescription_ndcs)
ndc_per_drug = (
    df_prescriptions.loc[is_top_ndc, ['NDC', 'DRUG']]
    .groupby('DRUG')
    .max()
    .reset_index()
)
dict_most_freq_prescription_ndcs = ndc_per_drug.to_dict(orient='records')

In [None]:
# Map each NDC code to the list of normalized drug names recorded against it.
ndc_to_drug_names = {}

for entry in dict_most_freq_prescription_ndcs:
    # Strip everything except letters and digits, then lowercase the name.
    normalized_name = re.sub('[^A-Za-z0-9]+', '', entry['DRUG']).lower()
    ndc_to_drug_names.setdefault(entry['NDC'], []).append(normalized_name)

ndc_to_drug_names

In [None]:
# Take the first 10,000 notes and keep those whose text mentions "insulin"
# (case-sensitive). The mask is built on the 10,000-row head rather than on
# the full table, so the search does not scan every note and the boolean key
# already lines up with the frame it filters. `na=False` treats notes with
# missing text as non-matches. The `.copy()` makes the sample an independent
# frame, because later cells add columns to it.
_noteevents_head = df_noteevents.head(10000)
_mentions_insulin = _noteevents_head['TEXT'].str.contains("insulin", na=False)
df_noteevents_sample_insulin = _noteevents_head[_mentions_insulin].copy()

In [None]:
_NON_ALNUM = re.compile(r'[^A-Za-z0-9]+')


def clean_up_text(text):
    """Tokenize a note and return its normalized, non-trivial tokens.

    The text is lowercased and split with ``word_tokenize``. Tokens that are
    English stop words or single punctuation characters are dropped, every
    remaining token is stripped of non-alphanumeric characters, and tokens
    that end up empty (e.g. multi-character punctuation such as "--") are
    discarded.

    Args:
        text: the raw note text (a string).

    Returns:
        A list of non-empty, lowercase, alphanumeric token strings.
    """
    # Build the stop set once and reuse it; this function is applied to
    # every note, and the set does not change between calls.
    stopset = getattr(clean_up_text, '_stopset', None)
    if stopset is None:
        stopset = frozenset(stopwords.words('english')) | frozenset(string.punctuation)
        clean_up_text._stopset = stopset

    stripped = (
        _NON_ALNUM.sub('', token)
        for token in word_tokenize(text.lower())
        if token not in stopset
    )
    # Drop tokens that consisted solely of non-alphanumeric characters.
    return [token for token in stripped if token]

# Store the normalized token list of every insulin note next to its raw text.
normalized_insulin_notes = df_noteevents_sample_insulin['TEXT'].apply(clean_up_text)
df_noteevents_sample_insulin['TEXT_NORMALIZED'] = normalized_insulin_notes
df_noteevents_sample_insulin.head()

In [None]:
# Save new dataframe with `TEXT_NORMALIZED` - Done
# Loop over NDC codes and find `TEXT_NORMALIZED` with drug names
# Save to dataframe with ROW_ID, TEXT_NORMALIZED, NDC

test_ndc = '00088222033'
drug_name = 'insulin'


def extract_text_subset(text_normalized=None, drug=None):
    """Return a short context string around the first mention of a drug.

    Finds the first token that contains the drug name as a substring, then
    concatenates: the last 20 characters of the (up to) 10 preceding tokens,
    the drug name itself, and the first 20 characters of the (up to) 9
    following tokens. Tokens are joined without separators.

    Args:
        text_normalized: list of normalized token strings. ``None`` is
            treated as an empty list.
        drug: drug name to search for. Defaults to the module-level
            ``drug_name``.

    Returns:
        The context string, or ``""`` when no token mentions the drug (so a
        drug mention is never fabricated for text that does not contain it).
    """
    tokens = [] if text_normalized is None else text_normalized
    target = drug_name if drug is None else drug

    idx = next((i for i, token in enumerate(tokens) if target in token), None)
    if idx is None:
        return ""

    # Clamp the window start at 0: a negative start would be interpreted as
    # an offset from the end of the list and lose the real left context when
    # the drug appears within the first 10 tokens.
    subset_left = "".join(tokens[max(0, idx - 10): idx])
    subset_right = "".join(tokens[idx + 1: idx + 10])
    return subset_left[-20:] + target + subset_right[:20]

# Build the context snippet for every insulin note and label each row with
# the NDC code under test.
insulin_order_text = df_noteevents_sample_insulin['TEXT_NORMALIZED'].apply(extract_text_subset)
df_noteevents_sample_insulin['DRUG_ORDER_TEXT'] = insulin_order_text
df_noteevents_sample_insulin['NDC'] = test_ndc
df_noteevents_sample_insulin.head()

In [None]:
# Show the last five rows of the insulin sample.
df_noteevents_sample_insulin.tail(n=5)

In [None]:
# ROW_ID was loaded as text; convert it so it can be compared numerically.
df_noteevents['ROW_ID'] = df_noteevents['ROW_ID'].astype('int')

# Negative examples: the first 3000 notes with ROW_ID > 7956 that do NOT
# mention insulin. The match is case-insensitive because clean_up_text
# lowercases the text, so "Insulin" would otherwise slip into rows that get
# labelled NDC '0'. `.copy()` makes the result an independent frame, since
# later cells add columns to it.
_later_notes = df_noteevents[df_noteevents['ROW_ID'] > 7956]
_later_mentions_insulin = _later_notes['TEXT'].str.contains(
    "insulin", case=False, na=False, regex=False
)
df_noteevents_without_insulin = _later_notes[~_later_mentions_insulin].head(3000).copy()

In [None]:
# Normalize the negative-example notes the same way as the insulin notes.
df_noteevents_without_insulin['TEXT_NORMALIZED'] = (
    df_noteevents_without_insulin['TEXT'].apply(clean_up_text)
)

In [None]:
def extract_text_subset_random(text_normalized=None, idx=50, drug=None):
    """Return a context string taken from a fixed position in a note.

    Mirrors the window shape used for drug mentions: the last 20 characters
    of the (up to) 10 tokens before position ``idx`` followed by the first
    ``20 + len(drug)`` characters of the (up to) 9 tokens after it. The token
    at ``idx`` itself is skipped. Tokens are joined without separators.

    Note that, despite the name, the position is fixed rather than random.

    Args:
        text_normalized: list of normalized token strings. ``None`` is
            treated as an empty list.
        idx: token position the window is centred on.
        drug: drug name whose length pads the right-hand side. Defaults to
            the module-level ``drug_name``.

    Returns:
        The context string; ``""`` when the note has no tokens in the window.
    """
    tokens = [] if text_normalized is None else text_normalized
    pad = len(drug_name if drug is None else drug)

    # Clamp the start at 0 so a small idx cannot wrap around to the list end.
    subset_left = "".join(tokens[max(0, idx - 10): idx])
    subset_right = "".join(tokens[idx + 1: idx + 10])
    return subset_left[-20:] + subset_right[:20 + pad]

# Build a fixed-position snippet for every negative-example note and label
# the rows with the placeholder NDC '0'.
negative_order_text = df_noteevents_without_insulin['TEXT_NORMALIZED'].apply(extract_text_subset_random)
df_noteevents_without_insulin['DRUG_ORDER_TEXT'] = negative_order_text
df_noteevents_without_insulin['NDC'] = '0'
df_noteevents_without_insulin.head()

In [None]:
# Stack the insulin rows on top of the negative rows, keeping only the
# snippet and its NDC label.
feature_columns = ['DRUG_ORDER_TEXT', 'NDC']
df_feature = pd.concat([
    df_noteevents_sample_insulin[feature_columns],
    df_noteevents_without_insulin[feature_columns],
])

In [None]:
# Write the feature table without the index, quoting every field.
df_feature.to_csv(
    '../data/processed/FEATURE_INSULIN_SAMPLE.csv',
    index=False,
    quotechar='"',
    quoting=csv.QUOTE_ALL,
)