#### Bag of Words Approaches:
In this notebook I am using bag of words (BoW) type of text vectorization. 

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import pickle
import warnings
warnings.filterwarnings('ignore')

In [22]:
df_adm_notes_clean = pd.read_pickle('Prepared_Data.pkl')

In [23]:
df_adm_notes_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51113 entries, 1 to 58975
Data columns (total 12 columns):
SUBJECT_ID             51113 non-null int64
HADM_ID                51113 non-null int64
ADMITTIME              51113 non-null datetime64[ns]
DISCHTIME              51113 non-null datetime64[ns]
DEATHTIME              5792 non-null datetime64[ns]
ADMISSION_TYPE         51113 non-null object
NEXT_ADMITTIME         11169 non-null datetime64[ns]
NEXT_ADMISSION_TYPE    11169 non-null object
DAYS_NEXT_ADMIT        11169 non-null float64
CATEGORY               49083 non-null object
TEXT                   49083 non-null object
OUTPUT_LABEL           51113 non-null int64
dtypes: datetime64[ns](4), float64(1), int64(3), object(4)
memory usage: 5.1+ MB


In [24]:
df_adm_notes_clean = df_adm_notes_clean.reset_index(drop = True)
print(len(df_adm_notes_clean))
print(df_adm_notes_clean['ADMISSION_TYPE'].value_counts())

51113
EMERGENCY    42071
ELECTIVE      7706
URGENT        1336
Name: ADMISSION_TYPE, dtype: int64


#### Save 20% of the data for validation:

In [25]:
df_valid = df_adm_notes_clean.sample(frac=0.2,random_state=42)
print(len(df_valid))
print(df_valid.head())
print(df_valid['OUTPUT_LABEL'].value_counts())
df_valid.to_pickle('df_valid.pkl')

10223
       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME DEATHTIME  \
20564       21284   126923 2168-05-03 07:15:00 2168-05-10 14:50:00       NaT   
9061         9344   116730 2199-05-21 02:59:00 2199-06-26 14:32:00       NaT   
4144         4308   109424 2141-04-04 18:38:00 2141-04-06 16:52:00       NaT   
45720       82935   157739 2183-01-17 20:44:00 2183-01-19 11:19:00       NaT   
26434       27266   165453 2197-10-17 15:27:00 2197-10-26 12:00:00       NaT   

      ADMISSION_TYPE      NEXT_ADMITTIME NEXT_ADMISSION_TYPE  DAYS_NEXT_ADMIT  \
20564       ELECTIVE 2171-02-04 04:04:00           EMERGENCY       999.551389   
9061       EMERGENCY                 NaT                 NaN              NaN   
4144       EMERGENCY                 NaT                 NaN              NaN   
45720      EMERGENCY                 NaT                 NaN              NaN   
26434      EMERGENCY 2197-11-24 19:46:00           EMERGENCY        29.323611   

                CATEGORY  

In [26]:
rows_pos = df_valid.OUTPUT_LABEL == 1
df_valid_pos = df_valid.loc[rows_pos]
print('Positive class ', len(df_valid_pos))
df_valid_neg = df_valid.loc[~rows_pos]
print('Negative class ', len(df_valid_neg))
df_valid = pd.concat([df_valid_pos, df_valid_neg.sample(n = len(df_valid_pos), random_state = 42)],axis = 0)
print(len(df_valid))

Positive class  588
Negative class  9635
1176


In [27]:
df_valid.to_pickle('df_valid_balanced.pkl')

In [28]:
df_train_all=df_adm_notes_clean.drop(df_valid.index)

#### As I mentioned in preparation section, data is highly imbalanced. I am going to do down sampling for negative cases:

In [29]:
rows_pos = df_train_all.OUTPUT_LABEL == 1
df_train_pos = df_train_all.loc[rows_pos]
print('Positive class ', len(df_train_pos))
df_train_neg = df_train_all.loc[~rows_pos]
print('Negative class ', len(df_train_neg))

Positive class  2416
Negative class  47521


#### Making a balanced dataset (50/50):

In [30]:
df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state = 42)],axis = 0)

#### Shuffling before training:

In [31]:
df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop = True)
len(df_train)

4832

In [32]:
df_train.to_pickle('df_train.pkl')

#### Preprocess for bag-of-words (BoF):

In [33]:
def preprocess_text(df):
    # This function preprocesses the text by filling not a number and replacing new lines ('\n') and carriage returns ('\r')
    df.TEXT = df.TEXT.fillna(' ')
    df.TEXT = df.TEXT.str.replace('\n',' ')
    df.TEXT = df.TEXT.str.replace('\r',' ')
    return df

In [34]:
df_train = preprocess_text(df_train)
df_valid = preprocess_text(df_valid)

In [35]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


# sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

from numpy import array
from scipy.sparse import csr_matrix
from time import time
import re
import string


# keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from keras.utils import np_utils
from keras.utils import to_categorical

[nltk_data] Downloading package punkt to /home/elena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elena/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/elena/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


#### Now lets look at one of the texts:

In [36]:
df_train.TEXT[0]

"Admission Date:  [**2110-12-7**]       Discharge Date:  [**2110-12-13**]  Date of Birth:   [**2061-12-15**]       Sex:  M  Service:  HISTORY OF PRESENT ILLNESS:  The patient is a 48 year old male with a history of hepatitis C virus and hepatocellular cancer status post chemoembo in [**5-6**] who presented to [**Hospital3 4298**] E.D. on [**2110-12-7**] with hematemesis since [**2110-12-5**].  He had one episode of hematemesis on [**12-5**]/ and two episodes on [**12-6**] as well as hematemesis and hematochezia on [**12-7**].  At [**Hospital3 4298**] E.D. patient was noted to have a hematocrit of 18 and was transfused two units of packed red blood cells and was noted to be having coagulopathy with INR of 2.1 and was given FFP as well as vitamin K.  Patient denies any past history of varices or upper GI bleed.  He denies recent alcohol or drug use.  He was feeling nauseated and began vomiting and continued to have hematemesis for three days until he was transferred to [**Hospital1 18**]

#### These texts need lots of cleaning:

In [37]:
def clean_text(text):

    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t).split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    text = " ".join(text)
    
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    
    return text

#### Now lets see how they look:

In [38]:
clean_text(df_train['TEXT'][0])

'admiss date discharg date date birth sex servic histori present ill patient year old male histori hepat virus hepatocellular cancer status post chemoembo present hospit hematemesi sinc one episod hematemesi two episod well hematemesi hematochezia hospit patient note hematocrit transfus two unit pack red blood cell note coagulopathi inr given ffp well vitamin patient deni past histori varic upper bleed deni recent alcohol drug use feel nauseat began vomit continu hematemesi three day transfer hospit definit care patient transfus total five unit pack red blood cell given vitamin counteract coagulopathi underw egd band sclerotherapi grade lower esophag varic lower mid esophagus patient also abdomin followup chemoembo show portal vein thrombosi present patient deni short breath chest pain headach nausea vomit fever chill state feel much better past medic histori signific hepat virus alcohol cirrhosi diagnos duoden ulcer status post perfor repair diagnos hepatocellular carcinoma radiothera

#### Looks clean and ready for BoW!

In [39]:
df_train['TEXT'] = df_train['TEXT'].map(lambda x: clean_text(x))
df_valid['TEXT'] = df_valid['TEXT'].map(lambda x: clean_text(x))

In [40]:
df_train.to_pickle('train_prepared.pkl')
df_valid.to_pickle('valid_prepared_balanced.pkl')

In [42]:
len(df_valid)

1176