In [2]:
from pandarallel import pandarallel
import pandas as pd
import numpy as np
import csv
import re

np.random.seed(500)
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
df = pd.read_csv('../data/real-mimic-iii-database/NOTEEVENTS.csv', dtype=object)

df[['TEXT']].to_csv('../data/processed/NOTEEVENTS_TEXT_ONLY.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)

In [3]:
df_notes = pd.read_csv('../data/processed/NOTEEVENTS_TEXT_ONLY.csv', dtype=object)

In [6]:
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))

In [7]:
def element_exists(x):
        return x

def normalize_text(x):
    # Make text lower case and remove non alpha and space characters.
    lower_alpha_with_space = re.sub('[^a-z ]+', '', x.lower()).strip()
    # Split on space
    word_arr = lower_alpha_with_space.split(' ')
    # Remove stop words
    word_arr = [word for word in word_arr if not word in english_stopwords]
    # Remove nulls from word array
    return list(filter(element_exists, word_arr))


In [8]:
df_notes['TEXT_NORMALIZED_ARRAY'] = df_notes['TEXT'].parallel_apply(normalize_text)

In [9]:
df_notes[['TEXT_NORMALIZED_ARRAY']].head()

Unnamed: 0,TEXT_NORMALIZED_ARRAY
0,"[admission, date, discharge, date, serviceadde..."
1,"[admission, date, discharge, date, date, birth..."
2,"[admission, date, discharge, date, service, ca..."
3,"[admission, date, discharge, date, service, me..."
4,"[admission, date, discharge, date, date, birth..."


In [10]:
df_ndc_codes = pd.read_csv('../data/processed/ndc_codes_extracted.csv')

In [15]:
df_ndc_codes['DRUG_NAME_ARRAY'] = df_ndc_codes['DRUG_NORMALIZED'].apply(lambda x: list(filter(element_exists, x.split(' '))))

In [34]:
num_classes = len(df_ndc_codes['NDC'])

ndc_codes_dict = df_ndc_codes[['DRUG_NAME_ARRAY', 'NDC']].to_dict(orient='records')

ndc_codes = list(map(lambda x: x['NDC'], ndc_codes_dict))
drug_name_array = list(map(lambda x: set(x['DRUG_NAME_ARRAY']), ndc_codes_dict))

In [45]:
set(df_notes['TEXT_NORMALIZED_ARRAY'][0]).intersection(drug_name_array[5])

set()

In [48]:
def determine_labels(text_array):
    text_set = set(text_array)

    labels = [0] * num_classes

    for i in range(num_classes):
        drug_name_set = drug_name_array[i]

        if len(text_set.intersection(drug_name_set)) > 0:
            labels[i] = 1

    return labels

df_notes['LABELS'] = df_notes['TEXT_NORMALIZED_ARRAY'].parallel_apply(determine_labels)

In [51]:
df_notes['LABELS']

0          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1          [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, ...
2          [1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, ...
3          [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, ...
4          [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...
                                 ...                        
2083175    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2083176    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2083177    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2083178    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2083179    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: LABELS, Length: 2083180, dtype: object

In [50]:
df_notes.to_csv('../data/processed/NOTEEVENTS_WITH_LABELS.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)

In [55]:
def contains_one_true_class(labels):
    if max(labels) == 1:
        return True
    else :
        return False

df_notes['CONTAINS_ONE_TRUE_CLASS']= df_notes['LABELS'].parallel_apply(contains_one_true_class)

In [63]:
def join_text_array(text_array):
    return "".join(text_array)

df_notes['TEXT_NORMALIZED_JOINED'] = df_notes['TEXT_NORMALIZED_ARRAY'].apply(join_text_array)

In [64]:
df_notes_true_label = df_notes[df_notes['CONTAINS_ONE_TRUE_CLASS'] == True]
df_notes_true_label.head()

Unnamed: 0,TEXT,TEXT_NORMALIZED_ARRAY,LABELS,CONTAINS_ONE_TRUE_CLASS,TEXT_NORMALIZED_JOINED
1,Admission Date: [**2118-6-2**] Discharg...,"[admission, date, discharge, date, date, birth...","[1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, ...",True,admissiondatedischargedatedatebirthsexfservice...
2,Admission Date: [**2119-5-4**] D...,"[admission, date, discharge, date, service, ca...","[1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, ...",True,admissiondatedischargedateservicecardiothoraci...
3,Admission Date: [**2124-7-21**] ...,"[admission, date, discharge, date, service, me...","[0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, ...",True,admissiondatedischargedateservicemedicinealler...
4,Admission Date: [**2162-3-3**] D...,"[admission, date, discharge, date, date, birth...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",True,admissiondatedischargedatedatebirthsexmservice...
5,Admission Date: [**2172-3-5**] D...,"[admission, date, discharge, date, date, birth...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True,admissiondatedischargedatedatebirthsexfservice...


In [66]:
df_notes_final = df_notes_true_label[['TEXT_NORMALIZED_ARRAY', 'TEXT_NORMALIZED_JOINED', 'LABELS']]

In [67]:
df_notes_final.to_csv('../data/processed/NOTEEVENTS_ML_DATASET.csv', encoding='utf-8', quotechar='"', quoting=csv.QUOTE_ALL, index=False)