In [None]:
from pandarallel import pandarallel
import pandas as pd
import numpy as np
import csv
import re

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(500)
# Set up pandarallel; the parallel_apply calls later in this notebook use it.
pandarallel.initialize()

In [None]:
# Only the TEXT column is written out below, so restrict parsing to it
# (usecols) instead of materializing every NOTEEVENTS column as Python objects.
# `df` is not referenced again in this notebook after this cell.
df = pd.read_csv(
    '../data/real-mimic-iii-database/NOTEEVENTS.csv',
    usecols=['TEXT'],
    dtype=object,
)

# Persist the text-only extract. QUOTE_ALL wraps every field in double quotes.
df[['TEXT']].to_csv('../data/processed/NOTEEVENTS_TEXT_ONLY.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)

In [None]:
# Reload the text-only extract; dtype=object stops pandas from inferring column types.
# NOTE(review): pandas parses empty fields as NaN by default, so TEXT may hold non-string values — confirm.
df_notes = pd.read_csv('../data/processed/NOTEEVENTS_TEXT_ONLY.csv', dtype=object)

In [None]:
# English stop-word list from NLTK, held in a set; normalize_text tests membership against it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally — confirm.
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))

In [None]:
def element_exists(x):
    """Identity predicate intended for ``filter``.

    Returns ``x`` unchanged, so ``filter(element_exists, items)`` keeps the
    truthy items and drops falsy ones such as empty strings.
    """
    return x

def normalize_text(x):
    """Turn one raw note into a list of lower-case alphabetic tokens.

    Steps: lower-case the text, turn every whitespace run into a single
    space, delete every character that is not ``a-z`` or a space, split into
    words and drop the words found in ``english_stopwords``.

    Args:
        x: The note text. Any non-string value (e.g. the NaN that pandas
            produces for an empty field) is treated as an empty note.

    Returns:
        list[str]: The remaining tokens in their original order; empty when
        the note has no usable words.
    """
    # Missing notes are not strings and have no .lower(); treat them as empty.
    if not isinstance(x, str):
        return []

    # Map newlines/tabs to spaces BEFORE stripping non-letters. Otherwise the
    # character filter deletes them and glues the neighbouring words together
    # ("mg\ndaily" -> "mgdaily").
    spaced = re.sub(r'\s+', ' ', x.lower())
    letters_and_spaces = re.sub('[^a-z ]+', '', spaced)

    # split() without arguments never yields empty strings, so no separate
    # empty-token filter is needed.
    return [
        word
        for word in letters_and_spaces.split()
        if word not in english_stopwords
    ]


In [None]:
# Tokenize every note in parallel; each cell of the new column holds the list returned by normalize_text.
df_notes['TEXT_NORMALIZED_ARRAY'] = df_notes['TEXT'].parallel_apply(normalize_text)

In [None]:
# Preview the first rows of the tokenized column.
df_notes[['TEXT_NORMALIZED_ARRAY']].head()

In [None]:
# Load the extracted NDC code table; later cells read its DRUG_NORMALIZED and NDC columns.
df_ndc_codes = pd.read_csv('../data/processed/ndc_codes_extracted.csv')

In [None]:
# Tokenize each normalized drug name on single spaces, discarding the empty
# strings that leading, trailing or repeated spaces produce.
df_ndc_codes['DRUG_NAME_ARRAY'] = df_ndc_codes['DRUG_NORMALIZED'].apply(
    lambda name: [token for token in name.split(' ') if token]
)

In [None]:
# One class per row of the NDC table.
num_classes = len(df_ndc_codes['NDC'])

# Row-wise records: [{'DRUG_NAME_ARRAY': [...], 'NDC': ...}, ...]
ndc_codes_dict = df_ndc_codes[['DRUG_NAME_ARRAY', 'NDC']].to_dict(orient='records')

# Parallel lists indexed by class: the NDC code and the set of tokens in the
# drug's name (sets make the per-note intersection test cheap).
ndc_codes = [record['NDC'] for record in ndc_codes_dict]
drug_name_array = [set(record['DRUG_NAME_ARRAY']) for record in ndc_codes_dict]

In [None]:
# Spot check: tokens shared by the note at index label 0 and the drug-name token set at position 5.
set(df_notes['TEXT_NORMALIZED_ARRAY'][0]).intersection(drug_name_array[5])

In [None]:
def determine_labels(text_array):
    """Build the 0/1 label vector for one tokenized note.

    Position ``i`` of the result is 1 when the note shares at least one token
    with ``drug_name_array[i]`` and 0 otherwise.

    Args:
        text_array: Iterable of tokens for a single note.

    Returns:
        list[int]: A list of length ``num_classes`` containing only 0 and 1.
    """
    note_tokens = set(text_array)

    return [
        0 if note_tokens.isdisjoint(drug_name_array[class_idx]) else 1
        for class_idx in range(num_classes)
    ]

# Label every note in parallel; each cell of LABELS holds the list returned by determine_labels.
df_notes['LABELS'] = df_notes['TEXT_NORMALIZED_ARRAY'].parallel_apply(determine_labels)

In [None]:
# Display the LABELS column for inspection.
df_notes['LABELS']

In [None]:
# Save the notes together with their tokens and labels; every field is quoted (QUOTE_ALL).
# NOTE(review): list-valued columns are presumably written as their Python string form — confirm readers parse them back.
df_notes.to_csv('../data/processed/NOTEEVENTS_WITH_LABELS.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)

In [None]:
def contains_one_true_class(labels):
    """Report whether a label vector has at least one positive class.

    Args:
        labels: Sequence of numeric labels (0/1 as produced by
            ``determine_labels``).

    Returns:
        bool: True when the largest label equals 1; False otherwise,
        including for an empty sequence.
    """
    # Guard the empty case explicitly: max() raises ValueError on an empty
    # sequence, and a note with no classes has no positive class.
    return len(labels) > 0 and max(labels) == 1

# Flag, per note, whether its label vector has any positive class (computed in parallel).
df_notes['CONTAINS_ONE_TRUE_CLASS']= df_notes['LABELS'].parallel_apply(contains_one_true_class)

In [None]:
def join_text_array(text_array):
    """Rebuild a normalized note string from its tokens.

    Args:
        text_array: Iterable of string tokens.

    Returns:
        str: The tokens separated by single spaces; an empty string for an
        empty iterable.
    """
    # Join with a space, not the empty string: without a separator the tokens
    # fuse into one undelimited blob that cannot be split into words again.
    return " ".join(text_array)

# Build a single string per note from its token list via join_text_array.
df_notes['TEXT_NORMALIZED_JOINED'] = df_notes['TEXT_NORMALIZED_ARRAY'].apply(join_text_array)

In [None]:
# Keep only the notes whose CONTAINS_ONE_TRUE_CLASS flag equals True, then
# preview the first rows of the result.
df_notes_true_label = df_notes.loc[df_notes['CONTAINS_ONE_TRUE_CLASS'].eq(True)]
df_notes_true_label.head()

In [None]:
# Keep only the three columns that are written to the ML dataset file.
df_notes_final = df_notes_true_label[['TEXT_NORMALIZED_ARRAY', 'TEXT_NORMALIZED_JOINED', 'LABELS']]

In [None]:
# Write the final dataset to CSV with every field quoted (QUOTE_ALL) and no index column.
df_notes_final.to_csv('../data/processed/NOTEEVENTS_ML_DATASET.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)