#### Bag of Words Approaches:
In this notebook I use a bag-of-words (BoW) approach to text vectorization.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import pickle
import warnings
warnings.filterwarnings('ignore')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Load the pickled train / validation / test frames.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df_train_all, df_valid, df_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('train_a.pkl', 'valid_a.pkl', 'test_a.pkl')
)

In [5]:
# Split the training data into positive (OUTPUT_LABEL == 1) and negative rows.
rows_pos = df_train_all.OUTPUT_LABEL == 1
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

# Balance the classes: keep every positive row and draw an equally sized,
# seeded random sample of negative rows, then stack the two pieces.
df_train = pd.concat(
    [
        df_train_pos,
        df_train_neg.sample(n=len(df_train_pos), random_state=42),
    ],
    axis=0,
)

# Shuffle the balanced rows (seeded) and rebuild a clean 0..n-1 index.
df_train = df_train.sample(n=len(df_train), random_state=42).reset_index(drop=True)

#### Preprocess for bag-of-words (BoW):

In [6]:
def preprocess_text(df):
    """Normalise the ``TEXT`` column of ``df`` and return ``df``.

    Missing values become a single space, and every newline and carriage
    return is replaced by a space. The column is overwritten on the frame
    that was passed in, so the caller's DataFrame is modified.
    """
    text = df.TEXT.fillna(' ')
    for line_break in ('\n', '\r'):
        text = text.str.replace(line_break, ' ')
    df.TEXT = text
    return df

In [7]:
# Run the same NaN / line-break normalisation over all three splits.
df_train, df_valid, df_test = map(preprocess_text, (df_train, df_valid, df_test))

In [8]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


# sklearn
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

from numpy import array
from scipy.sparse import csr_matrix
from time import time
import re
import string


# keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from keras.utils import np_utils
from keras.utils import to_categorical

[nltk_data] Downloading package punkt to /home/elena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elena/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/elena/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


#### Now let's look at one of the texts:

In [9]:
# Display the first training note as it looks before cleaning.
df_train['TEXT'][0]

"Admission Date:  [**2168-2-19**]              Discharge Date:   [**2168-2-24**]   Service: MEDICINE  Allergies: Penicillins / Morphine  Attending:[**First Name3 (LF) 5755**] Chief Complaint: Melena and coffee ground emesis  Major Surgical or Invasive Procedure: Upper Endoscopy with clipping of duodenal ulcer  History of Present Illness: 87 F with anemia, GERD, DM2, HTN, hyperlipid, presents with black coffee ground vomit and black diarrhea x past 10 days. She has had 10 episodes of vomiting and 10 episodes of diarrhea per day. She also has abdominal pain that feels like pressure and bloating, that is in a band-like area across the middle of her abdomen. She presented to the ED today because she was speaking with her daughter and said she was so tired that she couldn't even hold the phone any longer, that she was very dizzy, and that she just did not feel well. She had been keeping her black vomit and diarrhea a secret from the family. She has never had such black vomit or diarrhea bef

#### These texts need lots of cleaning:

In [10]:
# Any run of characters other than lowercase ASCII letters (punctuation,
# digits, whitespace, control and non-ASCII characters) separates tokens.
_NON_LETTER_RUN = re.compile(r"[^a-z]+")

# Tokens shorter than this are discarded together with the stop words.
_MIN_TOKEN_LENGTH = 3

# Filled on first use so the NLTK stop-word list and the stemmer are built
# once per kernel instead of once per document.
_CLEAN_TEXT_DEFAULTS = {}


def _default_stops_and_stemmer():
    """Return the cached English stop-word set and Snowball stemmer."""
    if not _CLEAN_TEXT_DEFAULTS:
        # Both values are built before the cache is touched, so a failure
        # cannot leave it half-populated.
        _CLEAN_TEXT_DEFAULTS.update(
            stops=frozenset(stopwords.words("english")),
            stemmer=SnowballStemmer('english'),
        )
    return _CLEAN_TEXT_DEFAULTS["stops"], _CLEAN_TEXT_DEFAULTS["stemmer"]


def clean_text(text, stops=None, stemmer=None):
    """Turn one raw note into a space-separated string of stemmed tokens.

    Steps:
      1. lowercase the text;
      2. treat every character that is not ``a``-``z`` as a separator;
      3. drop stop words and tokens shorter than three characters;
      4. stem the remaining tokens and join them with single spaces.

    Parameters
    ----------
    text : str
        The raw document.
    stops : collection of str, optional
        Lowercase words to discard. Defaults to NLTK's English stop words.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to ``SnowballStemmer('english')``.

    Returns
    -------
    str
        The cleaned text; an empty string if no token survives.
    """
    if stops is None or stemmer is None:
        default_stops, default_stemmer = _default_stops_and_stemmer()
        if stops is None:
            stops = default_stops
        if stemmer is None:
            stemmer = default_stemmer

    # Non-letters are stripped *before* tokenising so that pieces of a word
    # split by e.g. an accented character still go through the length and
    # stop-word filter below instead of leaking into the output.
    words = _NON_LETTER_RUN.sub(" ", text.lower()).split()

    # No contraction / punctuation / number rewrites are applied here: once
    # punctuation and digits are gone none of them can match, and merging
    # across a space (e.g. "j k" -> "jk") would only fuse unrelated words.
    kept = [
        word for word in words
        if len(word) >= _MIN_TOKEN_LENGTH and word not in stops
    ]

    return " ".join(stemmer.stem(word) for word in kept)

#### Now let's see how the cleaned text looks:

In [11]:
# Clean the first training note to inspect what the cleaner produces.
_first_note = df_train['TEXT'][0]
clean_text(_first_note)

'admiss date discharg date servic medicin allergi penicillin morphin attend first name chief complaint melena coffe ground emesi major surgic invas procedur upper endoscopi clip duoden ulcer histori present ill anemia gerd htn hyperlipid present black coffe ground vomit black diarrhea past day episod vomit episod diarrhea per day also abdomin pain feel like pressur bloat band like area across middl abdomen present today speak daughter said tire even hold phone longer dizzi feel well keep black vomit diarrhea secret famili never black vomit diarrhea life granddaught report take daili aspirin frequent take medic believ famili ibuprofen alev although sent countri sbp moder tender abdomen melen stool lavag clear place high suction began show light pink fluid may ngt trauma hct hct day ago antibodi transfus difficult match receiv ppi contact name arriv icu plan perform egd patient receiv unit rbc cough dizzi diffus abd pain fatigu fever chill headach sore throat sob dysuria past medic histo

#### Looks clean and ready for BoW!

In [12]:
# Replace the TEXT column of every split with its cleaned, stemmed version.
# NOTE: this overwrites the raw text, so re-running the cell cleans it again.
for _split in (df_train, df_valid, df_test):
    _split['TEXT'] = _split['TEXT'].map(clean_text)

In [14]:
# Sanity check: row count of each split, plus a preview of the held-out ones.
print(len(df_train))
for _split in (df_valid, df_test):
    print(len(_split))
    print(_split.head(3))

4198
7667
       SUBJECT_ID  HADM_ID           ADMITTIME           DISCHTIME DEATHTIME  \
9061        86662   140257 2133-12-08 17:23:00 2133-12-15 14:30:00       NaT   
4144        23568   173420 2114-09-19 20:35:00 2114-10-02 15:16:00       NaT   
26434       26218   133475 2154-06-18 05:59:00 2154-06-20 16:25:00       NaT   

      ADMISSION_TYPE      NEXT_ADMITTIME NEXT_ADMISSION_TYPE  DAYS_NEXT_ADMIT  \
9061       EMERGENCY                 NaT                 NaN              NaN   
4144       EMERGENCY 2114-10-15 17:31:00           EMERGENCY         13.09375   
26434      EMERGENCY                 NaT                 NaN              NaN   

                CATEGORY                                               TEXT  \
9061   Discharge summary  admiss date discharg date date birth sex servi...   
4144   Discharge summary  admiss date discharg date date birth sex servi...   
26434                NaN                                                      

       OUTPUT_LABEL  
9061 

In [15]:
# Persist the cleaned splits to pickle files.
for _split, _pickle_path in (
    (df_train, 'train_prepared_a.pkl'),
    (df_valid, 'valid_prepared_a.pkl'),
    (df_test, 'test_prepared_a.pkl'),
):
    _split.to_pickle(_pickle_path)

In [16]:
# Number of rows in the validation split.
df_valid.shape[0]

7667