# Note
Earlier drafts of this notebook evaluated several classification methods that are not included here, most notably SVMs. The SVM classifiers produced poor results and took a long time to run, so they are excluded from this analysis.

# Defining Parameters

* **Data**: Discharge Summary Notes
* **Model Imbalance**: ADASYN
* **Vectorizer**: Count
* **Vectorizer Max Features**: 3000
* **Dimension Reduction Method**: Truncated SVD
* **Grid Search Scoring Parameter**: f1

In [None]:
# Vectorizer / dimension-reduction settings
max_features = 3000    # vocabulary cap passed to CountVectorizer
svd_features = 600     # number of Truncated SVD components

# Model-selection settings
scoring_metric = "f1"  # scoring argument used by GridSearchCV
max_iter_log = 500     # max_iter for LogisticRegression

# Library Calls

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import feather
import nltk
import re
import string
import yellowbrick
import sklearn
import matplotlib.pyplot as plt

from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.features import RadViz

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, cross_validate

from imblearn.over_sampling import ADASYN 

from collections import Counter

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score,precision_recall_fscore_support
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Solver used for every LogisticRegression in this notebook
solver_log = 'saga'

# Shared, seeded random state reused by the sampling, SVD and classifier calls
rng = np.random.RandomState(seed=5590)

# Data Import and Processing
## !!!One Data Frame Import Must Be Commented Out!!!!

We initialize both data frames to `None` so that a subsequent check can tell which one was loaded.

In [None]:
# Placeholders: whichever frame is loaded below replaces its None
df_all = df_ds = None

In [None]:
# All available notes (keep commented out when analysing discharge summaries)
#df_all = pd.read_csv('./data/text_processed_all.csv.gz', compression='gzip', low_memory=False)

# Discharge summary notes only
df_ds = pd.read_csv(
    './data/text_processed_discharge_summary.csv.gz',
    low_memory=False,
    compression='gzip',
)

# Define Data Frame

Here we do a check to see which data frame we are analyzing, All Notes or Discharge Notes

In [None]:
# Analyse the all-notes frame when it was loaded; otherwise fall back to the
# discharge-summary frame.
df = df_all if df_all is not None else df_ds

# Convert hadm_id to string (through int64 first)
df['hadm_id'] = df['hadm_id'].astype('int64').astype(str)

# Convert the readmit_30 target column to int
df['readmit_30'] = df['readmit_30'].astype('int')

# Functions

## Modified Tokenizer
Define a modified tokenizer function. This function replaces digits and punctuation characters with spaces, converts all words to lower case, and then tokenizes the text.

In [None]:
def tokenize_note_events(text):
    """Tokenize note text after blanking out digits and punctuation.

    Every character in ``string.punctuation`` and every digit 0-9 is replaced
    by a space, the text is lower-cased, and the result is split with
    ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        Raw note text.

    Returns
    -------
    list
        Tokens produced by ``nltk.word_tokenize`` on the cleaned text.
    """
    # Characters to blank out: all punctuation plus the ten digits
    strip_chars = string.punctuation + '0123456789'

    # Two-argument maketrans: each listed character maps to a space
    blank_table = str.maketrans(strip_chars, ' ' * len(strip_chars))

    # Lower-case first, then apply the table, then tokenize
    cleaned = text.lower().translate(blank_table)
    return nltk.word_tokenize(cleaned)

## Custom Stop Words
Define custom stop words

In [None]:
# Custom stop words handed to CountVectorizer in text_processing_tr_v.
# The list includes a number of single-letter tokens.
stop_words = (
    'the and to of was with a on in for name '
    'is patient s he at as or one she his her am '
    'were you pt pm by be had your this date from '
    'there an that p are have has h but o '
    'namepattern which every also w d c l '
    'q r x t m'
).split()

## Text Processing Function with Truncated SVD
The following function performs the pre-processing steps: it cleans the note text, splits the data into training, validation, and test sets, and vectorizes the text using 1-grams and 2-grams. Class balancing is not done here; it is handled separately by the ADASYN step below. The function returns the vectorized predictor matrices and the target variables for the training, validation, and test data, along with the untransformed training data frame and the fitted vectorizer.

In [None]:
def text_processing_tr_v(df):
    """Split the notes into train/validation/test sets and vectorize the text.

    The input frame is copied before cleaning, so the caller's DataFrame is
    left unchanged. Missing notes become a single space, and newline and
    carriage-return characters are replaced by spaces. Rows are shuffled and
    split (roughly 60% train, 20% validation, 20% test) using the
    module-level ``rng``. A ``CountVectorizer`` (1- and 2-grams, the
    ``tokenize_note_events`` tokenizer, the custom ``stop_words`` and at most
    ``max_features`` terms) is fitted on the training text only and then
    applied to all three splits. No class balancing is performed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column and a ``readmit_30`` column.

    Returns
    -------
    tuple
        ``(x_tr_tf, x_v_tf, x_te_tf, y_tr, y_v, y_te, df_tr, vect)``: the
        vectorized train / validation / test matrices, the matching
        ``readmit_30`` targets, the training rows as a DataFrame, and the
        fitted vectorizer.
    """
    # Copy first: the cleaning below must not write back into the caller's frame
    df = df.copy()

    # Blank out missing notes and flatten line breaks (literal replacements)
    df['text'] = (
        df['text']
        .fillna(' ')
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
    )

    # Shuffle the rows and renumber the index
    df = df.sample(n=len(df), random_state=rng).reset_index(drop=True)

    # Hold out 40% of the rows, then halve the hold-out into test and validation
    df_v_te = df.sample(frac=0.40, random_state=rng)
    df_te = df_v_te.sample(frac=0.5, random_state=rng)
    df_v = df_v_te.drop(df_te.index)

    # Everything that was not held out is training data
    df_tr = df.drop(df_v_te.index)

    # Count vectorizer: modified tokenizer, 1- and 2-grams, custom stop words
    vect = CountVectorizer(
        max_features=max_features,
        tokenizer=tokenize_note_events,
        ngram_range=(1, 2),
        stop_words=stop_words,
    )

    # Fit the vocabulary on the training text only
    vect.fit(df_tr.text.values)

    # Transform each split with the training vocabulary
    x_tr_tf = vect.transform(df_tr.text.values)
    x_v_tf = vect.transform(df_v.text.values)
    x_te_tf = vect.transform(df_te.text.values)

    # Target variables
    y_tr = df_tr.readmit_30
    y_v = df_v.readmit_30
    y_te = df_te.readmit_30

    return x_tr_tf, x_v_tf, x_te_tf, y_tr, y_v, y_te, df_tr, vect

## ADASYN Over Sampling

In [None]:
def asadyn_sample(x_train, y_train):
    """Oversample the training data with ADASYN.

    Parameters
    ----------
    x_train : training predictors passed to ``ADASYN.fit_resample``
    y_train : training target passed to ``ADASYN.fit_resample``

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    # The sampler draws from the shared module-level random state
    sampler = ADASYN(random_state=rng)
    x_resampled, y_resampled = sampler.fit_resample(x_train, y_train)
    return x_resampled, y_resampled

## Dimension Reduction: Truncated SVD

In [None]:
def trunc_svd(x_tr, x_v, x_te, n_components):
    """Reduce the three feature matrices with a Truncated SVD.

    The SVD is fitted on ``x_tr`` only and then used to transform the
    training, validation and test matrices.

    Parameters
    ----------
    x_tr, x_v, x_te : training, validation and test feature matrices
    n_components : int
        Number of SVD components to keep.

    Returns
    -------
    tuple
        ``(x_tr_svd, x_v_svd, x_te_svd, svd)``: the three transformed
        matrices and the fitted ``TruncatedSVD`` instance.
    """
    reducer = TruncatedSVD(random_state=rng, n_components=n_components)

    # Components are learned from the training matrix only
    reducer.fit(x_tr)

    # Project every split with the fitted reducer, in train / valid / test order
    reduced_tr, reduced_v, reduced_te = (
        reducer.transform(matrix) for matrix in (x_tr, x_v, x_te)
    )

    return reduced_tr, reduced_v, reduced_te, reducer

## Model Output

### Scoring Metrics
The following functions calculate the scoring metrics on which the models will be evaluated.

In [None]:
def calc_specificity(y_actual, y_pred, thresh):
    """Return the fraction of actual negatives whose prediction is below ``thresh``.

    ``y_actual`` and ``y_pred`` must support element-wise comparison
    (e.g. NumPy arrays or pandas Series); negatives are entries equal to 0.
    """
    is_negative = y_actual == 0
    called_negative = y_pred < thresh

    # True negatives over all actual negatives
    true_negatives = sum(called_negative & is_negative)
    return true_negatives / sum(is_negative)

def calc_prevalence(y_actual):
    """Return the share of entries in ``y_actual`` that are equal to 1.

    ``y_actual`` must support element-wise comparison (e.g. a NumPy array
    or pandas Series).
    """
    positive_count = sum(y_actual == 1)
    total = len(y_actual)
    return positive_count / total

### Scoring Output
The following code generates the visual output of the scoring metrics.

In [None]:
def model_output(y_tr, y_tr_preds, y_tr_preds_prob, y_v, y_v_preds, y_v_preds_prob):
    """Print a train/validation score table and plot both ROC curves.

    AUC is computed from the predicted probabilities; accuracy, precision,
    recall, F1 and specificity are computed from the hard class predictions
    (specificity uses a 0.5 threshold on those predictions). Prevalence is
    the share of positive targets in each split.

    The AUC shown in the plot legend is the same probability-based AUC that
    appears in the table. It was previously computed from the hard class
    predictions, which did not match the plotted curve.

    Parameters
    ----------
    y_tr, y_v : true targets for the training and validation data
    y_tr_preds, y_v_preds : hard class predictions
    y_tr_preds_prob, y_v_preds_prob : scores for the positive class

    Returns
    -------
    None
        The table is printed and the ROC plot is shown.
    """
    thresh = 0.5

    def _summarise(y_true, y_hat, y_prob):
        # One column of the score table, in the row order used below
        auc = roc_auc_score(y_true, y_prob)
        values = (
            auc,
            accuracy_score(y_true, y_hat),
            precision_score(y_true, y_hat),
            recall_score(y_true, y_hat),
            f1_score(y_true, y_hat),
            calc_specificity(y_true, y_hat, thresh),
            calc_prevalence(y_true),
        )
        return auc, ['%.3f' % value for value in values]

    auc_tr, train_column = _summarise(y_tr, y_tr_preds, y_tr_preds_prob)
    auc_v, valid_column = _summarise(y_v, y_v_preds, y_v_preds_prob)

    scores = pd.DataFrame({
        'Score': ['AUC', 'Accuracy', 'Precision', 'Recall', 'F1', 'Specificity', 'Prevalence'],
        'Train': train_column,
        'Validation': valid_column,
    })
    print(scores)

    # ROC curves are traced from the probability scores
    fpr_tr, tpr_tr, _ = roc_curve(y_tr, y_tr_preds_prob)
    fpr_v, tpr_v, _ = roc_curve(y_v, y_v_preds_prob)

    # Legend AUC uses the same probability-based value as the table
    plt.plot(fpr_tr, tpr_tr, 'r-', label='Train AUC: %.2f' % auc_tr)
    plt.plot(fpr_v, tpr_v, 'b-', label='Valid AUC: %.2f' % auc_v)
    plt.plot([0, 1], [0, 1], '-k')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

## Model Execution

The following function trains a default model to the training data and generates training and validation scores.

In [None]:
def train_model(clf, x_tr, x_v, y_tr, y_v):
    """Fit ``clf`` on the training data and report train/validation scores.

    The classifier is fitted in place, hard predictions and positive-class
    probabilities are produced for both splits, and everything is handed to
    ``model_output`` for printing and plotting. Nothing is returned.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``predict_proba``
    x_tr, x_v : training and validation predictors
    y_tr, y_v : training and validation targets
    """
    clf.fit(x_tr, y_tr)

    # Hard class predictions
    hard_tr, hard_v = clf.predict(x_tr), clf.predict(x_v)

    # Probability of the positive class (second predict_proba column)
    prob_tr, prob_v = (clf.predict_proba(features)[:, 1] for features in (x_tr, x_v))

    model_output(y_tr, hard_tr, prob_tr, y_v, hard_v, prob_v)

The following function runs a 5-fold `GridSearchCV` over the supplied hyperparameter grid and returns the best estimator.

In [None]:
def grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyper_param_dict):
    """Run a 5-fold grid search and return the best estimator.

    The search is scored with the module-level ``scoring_metric`` and fitted
    on the training data only. The best parameters and the mean score
    (+/- two standard deviations) of every candidate are printed.

    Parameters
    ----------
    x_tr, y_tr : training predictors and target used for the search
    x_v, y_v : accepted for signature symmetry; not used by this function
    clf : estimator to tune
    hyper_param_dict : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=clf,
        param_grid=hyper_param_dict,
        scoring=scoring_metric,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(x_tr, y_tr)

    print("Best parameters set found on training set:")
    print()
    print(search.best_params_)
    print()

    # One line per candidate: mean CV score, two standard deviations, parameters
    results = search.cv_results_
    candidates = zip(results['mean_test_score'],
                     results['std_test_score'],
                     results['params'])
    for mean_score, std_score, candidate in candidates:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, std_score * 2, candidate))
    print()

    return search.best_estimator_

# Data Set: Discharge Notes
The following analysis will be of the data frame with all discharge notes.

## Data Processing

First the data is processed and tokenized. This will take a while.

In [None]:
# Shuffle the rows, then split into train / validation / test and vectorize.
# NOTE(review): text_processing_tr_v shuffles its input again, so this extra
# shuffle looks redundant apart from advancing `rng` — confirm it is needed.
df = df.sample(frac = 1, random_state = rng)
x_tr_pre, x_v_pre, x_te_pre, y_tr_pre, y_v_pre, y_te_pre, df_tr_non_xfrm, vect = text_processing_tr_v(df)

### Imbalance Data Set Method
Use ADASYN Oversampling to Correct the imbalance in the training data set.

In [None]:
# Oversample the vectorized training split only; the validation and test
# splits are not passed to the sampler.
x_tr_adasyn, y_tr_adasyn = asadyn_sample(x_tr_pre, y_tr_pre)

### Dimension Reduction

Then we reduce the dimensions of the sparse matrices to improve model run time.

For Discharge Summary, SVD is only used for surveying the default models. For All Notes, SVD is used for final outputs.

In [None]:
# Fit the SVD on the oversampled training matrix and project all three
# splits onto `svd_features` components; `svd_n` is the fitted SVD object.
x_tr_adasyn_svd, x_v_svd, x_te_svd, svd_n = trunc_svd(x_tr_adasyn, x_v_pre, x_te_pre, svd_features)

Check how much of the variance is explained by the reduced representation.

In [None]:
# Total share of variance retained by the fitted SVD components
explained_variance = svd_n.explained_variance_ratio_.sum()
svd_var = '%.1f' % explained_variance

print('The explained variance of the Truncatved SVD dimension reduction using',
      svd_features,
      'features is %.3f.' % explained_variance)

## Modeling - Default Settings

### Naive Bayes

#### Pre-SVD
Naive Bayes will not run with Truncated SVD due to negative matrix values.

In [None]:
# # Define X/Y Values
# x_tr = x_tr_adasyn
# x_v = x_v_pre
# y_tr = y_tr_adasyn
# y_v = y_v_pre

# train_model(clf, x_tr, x_v, y_tr, y_v)

### Logistic Regression

In [None]:
# # Define Classifier - Default Settings
# clf = LogisticRegression(random_state = rng, solver = solver_log, max_iter = max_iter_log)

#### SVD

In [None]:
# # Define X/Y Values
# x_tr = x_tr_adasyn_svd
# x_v = x_v_svd
# y_tr = y_tr_adasyn
# y_v = y_v_pre

# train_model(clf, x_tr, x_v, y_tr, y_v)

###  Random Forest

In [None]:
# Define Classifier - Default Settings
# clf = RandomForestClassifier(random_state = rng, n_estimators = 100)

#### SVD

In [None]:
# # Define X/Y Values
# x_tr = x_tr_adasyn_svd
# x_v = x_v_svd
# y_tr = y_tr_adasyn
# y_v = y_v_pre

# train_model(clf, x_tr, x_v, y_tr, y_v)

### Ada Boost Classifier

In [None]:
# # Define Classifier - Default Settings
# clf = AdaBoostClassifier(random_state = rng)

#### SVD

In [None]:
# x_tr = x_tr_adasyn_svd
# x_v = x_v_svd
# y_tr = y_tr_adasyn
# y_v = y_v_pre

# train_model(clf, x_tr, x_v, y_tr, y_v)

### XGBoost

In [None]:
# # Define Classifier - Default Settings
# clf = XGBClassifier(random_state = rng)

#### SVD

In [None]:
# x_tr = x_tr_adasyn_svd
# x_v = x_v_svd
# y_tr = y_tr_adasyn
# y_v = y_v_pre

# train_model(clf, x_tr, x_v, y_tr, y_v)

## Parameter Optimization

### Define X / Y Training and Validation Values

In [None]:
# The grid searches below use the pre-SVD count matrices: the oversampled
# training data and the validation data as produced by the vectorizer.
x_tr, y_tr = x_tr_adasyn, y_tr_adasyn
x_v, y_v = x_v_pre, y_v_pre

### Naive Bayes

Naive Bayes cannot use the dimensionally reduced matrices due to negative values.

#### Define Function Inputs

In [None]:
# Establish Classifier
clf = MultinomialNB()

#Define Parameters
alpha = np.logspace(1, 5, 4)

# Create hyperparameter options
hyperparameters = dict(alpha = alpha)

In [None]:
# Tune alpha with the 5-fold grid search; `clf` is rebound to the best estimator
clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# Refit the tuned model on the training data and report train/validation scores
train_model(clf, x_tr, x_v, y_tr, y_v)

# Define for Test Analysis
clf_nb_opt = clf

### KNNeighbors

#### Define Function Inputs

In [None]:
# Establish Classifier
clf = KNeighborsClassifier()

#Define Parameters
n_neighbors  = np.linspace(3,33,11).astype('int') 
weights = ('uniform', 'distance')
metric = ('euclidean', 'manhattan')

# Create hyperparameter options
hyperparameters = dict(n_neighbors = n_neighbors, weights = weights, metric = metric)

#### Pre SVD

In [None]:
# Tune the KNN grid with the 5-fold grid search; `clf` becomes the best estimator
clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# Refit the tuned model on the training data and report train/validation scores
train_model(clf, x_tr, x_v, y_tr, y_v)

# Define for Test Analysis
clf_knn_opt = clf

### Logistic Regression

#### Define Function Inputs

In [None]:
# Establish Classifier
clf = LogisticRegression(random_state = rng, solver = solver_log, max_iter = max_iter_log)

#Define Parameters
penalty = ['l1','l2']       
C = np.logspace(-5, 1, 7)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty = penalty)

In [None]:
# Tune C and the penalty with the 5-fold grid search; `clf` becomes the best estimator
clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# Refit the tuned model on the training data and report train/validation scores
train_model(clf, x_tr, x_v, y_tr, y_v)

# Define for Test Analysis
clf_logreg_opt = clf

### Random Forest

Random Forest models have so far performed so-so, and have taken a very long time to run. For the training data it often overfits to match 100%. I recommend commenting out these models until final analysis

#### Define Function Inputs

In [None]:
# # Establish Classifier
# clf = RandomForestClassifier(random_state = rng)

# #Define Parameters
# n_estimators = [100, 300, 500, 800, 1200]
# max_depth = [5, 8, 15, 25, 30]
# min_samples_split = [2, 5, 10, 15, 100]
# min_samples_leaf = [1, 2, 5, 10] 


# # Create hyperparameter options
# hyperparameters = dict(n_estimators=n_estimators, 
#                        max_depth=max_depth,
#                        min_samples_split = min_samples_split,
#                        min_samples_leaf = min_samples_leaf)

#### Pre SVD

In [None]:
# clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# train_model(clf, x_tr, x_v, y_tr, y_v)

# # Define for Test Analysis
# clf_rf_opt_pre = clf

### Ada Boost

#### Define Function Inputs

In [None]:
# Establish Classifier
clf = AdaBoostClassifier(random_state = rng)

#Define Parameters
n_estimators = [50, 100, 150, 200, 250]
learning_rate = np.logspace(-4, 2, 7)

# Create hyperparameter options
hyperparameters = dict(n_estimators=n_estimators, 
                       learning_rate=learning_rate)

#### Pre SVD

In [None]:
# Tune the AdaBoost grid with the 5-fold grid search; `clf` becomes the best estimator
clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# Refit the tuned model on the training data and report train/validation scores
train_model(clf, x_tr, x_v, y_tr, y_v)

# Define for Test Analysis
clf_ada_opt = clf

### XGBoost

#### Define Function Inputs

In [None]:
# Establish Classifier
clf = XGBClassifier(random_state = 5590)

#Define Parameters
n_estimators = [25, 50, 100, 150, 200]
learning_rate = np.logspace(-4, 1, 6)

# Create hyperparameter options
hyperparameters = dict(n_estimators=n_estimators, learning_rate=learning_rate)

#### Pre SVD

In [None]:
# Tune the XGBoost grid with the 5-fold grid search; `clf` becomes the best estimator
clf = grid_search_opt(x_tr, x_v, y_tr, y_v, clf, hyperparameters)

# Refit the tuned model on the training data and report train/validation scores
train_model(clf, x_tr, x_v, y_tr, y_v)

# Define for Test Analysis
clf_xbg_opt = clf

# Test Set Scoring

### Define X/Y Values
For Naive Bayes, the pre-SVD values will be used and are inserted manually. Global values for x and y are defined here for the remaining models.

In [None]:
# Test-split predictors and target (pre-SVD) shared by the model sections below
x, y = x_te_pre, y_te_pre

## General Function

In [None]:
def pretty_cm(y_pred, y_truth, labels, y_score=None):
    '''
    'Pretty' implementation of a confusion matrix with some evaluation statistics.

    Draws the confusion matrix as a heatmap and prints accuracy, AUC,
    precision, recall, F1 and prevalence.

    Input:
    y_pred - object with class predictions from the model
    y_truth - object with actual classes
    labels - list containing label names
    y_score - optional scores/probabilities for the positive class. When
              given, the AUC is computed from them. When omitted, the AUC is
              computed from the hard class predictions (the original
              behaviour), which reflects only a single operating point.
    '''

    cm = confusion_matrix(y_truth, y_pred)
    ax = plt.subplot()
    # Draw on the axes created above rather than relying on the current axes
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True,
                cmap='BuGn_r', ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('Actual label')
    ax.set_title('Confusion Matrix', size=15)
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels)

    # Prefer continuous scores for the AUC when the caller supplies them
    auc_input = y_pred if y_score is None else y_score

    print('#######################')
    print('Evaluation metrics ####')
    print('#######################')
    print('Accuracy: {:.3f}'.format(accuracy_score(y_truth, y_pred)))
    print('AUC Score: {:.3f}'.format(roc_auc_score(y_truth, auc_input)))
    print('Precision: {:.3f}'.format(precision_score(y_truth, y_pred)))
    print('Recall: {:.3f}'.format(recall_score(y_truth, y_pred)))
    print('F1: {:.3f}'.format(f1_score(y_truth, y_pred)))
    print('Prevelance: {:.3f}'.format(calc_prevalence(y_truth)))

In [None]:
def test_score_cv(clf, x, y):
    """Print 5-fold cross-validated scores for ``clf`` on ``x`` / ``y``.

    Runs ``cross_validate`` with ROC AUC, accuracy, balanced accuracy,
    precision, recall and F1, prints the average of every entry in the
    returned results, and finally prints the positive-class prevalence of
    ``y``. Nothing is returned.

    NOTE(review): ``cross_validate`` fits the estimator on folds of the data
    passed in, so when this is called with the test split the scores appear
    to come from models refit on test folds rather than from the already
    fitted ``clf`` — confirm this is intended.
    """
    metric_names = ['roc_auc', 'accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1']

    cv_results = cross_validate(estimator=clf, X=x, y=y, cv=5, scoring=metric_names)

    # Average each result array across the folds
    for key, fold_values in cv_results.items():
        print('%s : %.3f' % (key, np.average(fold_values)))

    print('positive target prevalence: ', '%.3f' % calc_prevalence(y))

## Naive Bayes

In [None]:
# Select the tuned Naive Bayes model for the test-set cells below
clf = clf_nb_opt

### Overall Metrics

In [None]:
# Confusion matrix and summary metrics on the test split
y_true = y_te_pre
y_pred = clf.predict(x_te_pre)
pretty_cm(y_pred, y_true, [0, 1])

### Cross Validation

In [None]:
# 5-fold cross-validated metrics computed on the test split
test_score_cv(clf, x_te_pre, y_te_pre)

## KNNeighbors

In [None]:
# Select the tuned KNN model for the test-set cells below
clf = clf_knn_opt

### Overall Metrics

In [None]:
# Confusion matrix and summary metrics on the test split
y_true = y
y_pred = clf.predict(x)
pretty_cm(y_pred, y_true, [0, 1])

### Cross Validation

In [None]:
# 5-fold cross-validated metrics computed on the test split
test_score_cv(clf, x, y)

## Logistic Regression

In [None]:
# Select the tuned logistic regression model for the test-set cells below
clf = clf_logreg_opt

### Overall Metrics

In [None]:
# Confusion matrix and summary metrics on the test split
y_true = y
y_pred = clf.predict(x)
pretty_cm(y_pred, y_true, [0, 1])

### Cross Validation

In [None]:
# 5-fold cross-validated metrics computed on the test split
test_score_cv(clf, x, y)

## Ada Boost

In [None]:
# Select the tuned AdaBoost model for the test-set cells below
clf = clf_ada_opt

### Overall Metrics

In [None]:
# Confusion matrix and summary metrics on the test split
y_true = y
y_pred = clf.predict(x)
pretty_cm(y_pred, y_true, [0, 1])

### Cross Validation

In [None]:
# 5-fold cross-validated metrics computed on the test split
test_score_cv(clf, x, y)

## XGBoost

In [None]:
# Select the tuned XGBoost model for the test-set cells below
clf = clf_xbg_opt

### Overall Metrics

In [None]:
# Confusion matrix and summary metrics on the test split
y_true = y
y_pred = clf.predict(x)
pretty_cm(y_pred, y_true, [0, 1])

### Cross Validation

In [None]:
# 5-fold cross-validated metrics computed on the test split
test_score_cv(clf, x, y)