# Discharge Notes and Readmission Rates: Modeling

This notebook is for modeling the cleaned dataset.

## Import

In [31]:
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk import pos_tag
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
import string
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,\
HashingVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

## Train and Test Split

In [14]:
# Load the cleaned admissions table; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data/admissions_cleaned.csv')

In [15]:
# Column names, dtypes and non-null counts, to confirm the load and spot
# any remaining missing values before modeling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45591 entries, 0 to 45590
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ADMISSION_TYPE      45591 non-null  object
 1   ADMISSION_LOCATION  45591 non-null  object
 2   DISCHARGE_LOCATION  45591 non-null  object
 3   INSURANCE           45591 non-null  object
 4   LANGUAGE            45591 non-null  object
 5   RELIGION            45591 non-null  object
 6   MARITAL_STATUS      45591 non-null  object
 7   ETHNICITY           45591 non-null  object
 8   DIAGNOSIS           45591 non-null  object
 9   READMISSION         45591 non-null  int64 
 10  GENDER              45591 non-null  object
 11  AGE                 45591 non-null  int64 
 12  TEXT                45591 non-null  object
 13  CPT_CD              45591 non-null  object
 14  DIAG_ICD9_CODE      45590 non-null  object
 15  DRG_CODE            45591 non-null  object
 16  PROCED_ICD9_CODE    45

In [16]:
# Separate the READMISSION label from the remaining predictor columns.
y = df['READMISSION']
X = df.drop(columns='READMISSION')

In [17]:
# Hold out 25% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [19]:
# TF-IDF vectorizer with library defaults (no custom tokenizer or stop-word
# list is passed here).
vectorizer = TfidfVectorizer()

In [20]:
# Fit the vectorizer on the training notes only, then transform them, so
# nothing from the test split influences the fitted vocabulary/weights.
tf_idf_data_train = vectorizer.fit_transform(X_train['TEXT'])

In [21]:
# Transform (not re-fit) the test notes with the vectorizer fitted on the
# training split.
tf_idf_data_test = vectorizer.transform(X_test['TEXT'])

In [23]:
# (documents, vocabulary terms) of the training TF-IDF matrix.
tf_idf_data_train.shape

(34193, 185915)

In [26]:
# Two baseline classifiers for the TF-IDF features.
nb_classifier = MultinomialNB()
# Random forests are stochastic (bootstrap samples, random feature subsets).
# Seed the forest with the same value used for the train/test split so the
# reported scores are reproducible on a fresh kernel.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [28]:
# Train Naive Bayes on the training TF-IDF matrix and predict both splits.
# fit() returns the estimator itself, so the training prediction is chained.
nb_train_preds = nb_classifier.fit(tf_idf_data_train, y_train).predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [29]:
# Train the random forest on the training TF-IDF matrix and predict both
# splits. fit() returns the estimator itself, so the call is chained.
rf_train_preds = rf_classifier.fit(tf_idf_data_train, y_train).predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [32]:
# Accuracy of each model on the training and held-out splits.
nb_train_score, nb_test_score = (
    accuracy_score(y_train, nb_train_preds),
    accuracy_score(y_test, nb_test_preds),
)
rf_train_score, rf_test_score = (
    accuracy_score(y_train, rf_train_preds),
    accuracy_score(y_test, rf_test_preds),
)

# Emit the whole report in one call; sep="\n" places each item on its own
# line, with the empty strings producing the blank lines around the rule.
print(
    "Multinomial Naive Bayes",
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score),
    "",
    '-'*70,
    "",
    'Random Forest',
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score),
    sep="\n",
)

Multinomial Naive Bayes
Training Accuracy: 0.7387 		 Testing Accuracy: 0.7386

----------------------------------------------------------------------

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.7593


## Tokenizing

In [18]:
# English stop-word list from NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded locally — confirm in the environment setup.
sw = stopwords.words('english')

In [None]:
# Tokenize a single discharge note to sanity-check the regex tokenizer.
# `sample_document` was never assigned in this notebook, so the cell raised
# NameError on a fresh kernel; use the first training note as the sample.
sample_document = X_train['TEXT'].iloc[0]

# One or more ASCII letters, optionally followed by an apostrophe and
# lowercase letters (e.g. "patient's"); digits and punctuation are dropped.
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
tokenizer = RegexpTokenizer(pattern)
sample_doc = tokenizer.tokenize(sample_document)