# Length of Stay Bi-LSTM

In [None]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import numpy as np
import pandas as pd
import ast
import csv
import plotly.plotly as py
import plotly.graph_objs as go

from random import random
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dropout
from keras.layers import Activation
from keras import optimizers
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import TimeDistributed
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import max_norm

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from common_utility.IOUtility import spark_read_parquet
from pyspark.sql import SparkSession
from etl.Transformation import *

from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
# Initialise plotly's offline notebook mode; the ROC cells below render their
# figures through plotly.offline.iplot.
init_notebook_mode()

# Seed NumPy's global RNG. undersampling() draws from it via np.random.choice;
# presumably train_test_split (called without random_state) relies on it too.
np.random.seed(7)

## Reading in Dataset

In [None]:
# Create (or reuse) a local Spark session; it is used below to read the parquet dataset.
spark = (
    SparkSession.builder
    .master("local")
    .appName("read parquet")
    .getOrCreate()
)

In [None]:
# Read the parquet dataset, keep only the needed columns and pull the result
# into a pandas DataFrame.
# The chain is wrapped in parentheses: a line that starts with ".select(...)"
# without an enclosing bracket (or a trailing backslash) is a SyntaxError.
df = (
    spark.read
    .parquet('/data/AppsFiles/batch/parquetDatabase/los/nullImputationDFV14.parquet')
    .select("Column Names needed to be removed due to issues regarding intellectual property")
    .toPandas()
)
# Alternative loader through the project helper:
# df = spark_read_parquet(spark, "LSTMNullImputation")

In [None]:
# Time-series columns: create_lstm_feature() passes each of these names to
# create_lstm_numerical_array_feature(). The real names are redacted in this copy.
num_ts_list = ["Column Names needed to be due to for issues regarding intellectual property"]

# Non-time-series numeric columns: create_lstm_feature() passes each of these
# names to create_lstm_numerical_non_array_feature().
num_list = ['Age']

## Functions

In [None]:
def create_lstm_feature(df, num_ts_list, num_list, general_feature=None):
    '''
    Generate the flat feature frame for the LSTM model.

    Each time-series column is expanded by create_lstm_numerical_array_feature
    and each non-time-series column by create_lstm_numerical_non_array_feature;
    the expanded frames are joined column-wise with the pass-through columns.

    Column order of the result (kept identical to the historical behaviour,
    because downstream reshaping depends on it): the most recently processed
    feature comes first, i.e. num_list features in reverse order, then
    num_ts_list features in reverse order, then the general_feature columns.

    NOTE(review): pd.concat(axis=1) aligns rows on the index, and
    create_lstm_numerical_array_feature returns frames with a fresh default
    index, so df is expected to carry a default 0..n-1 index as well.

    :param df: pandas df holding the raw columns
    :param num_ts_list: list of numerical time-series column names
    :param num_list: list of numerical non-time-series column names
    :param general_feature: columns copied through unchanged (ids, label);
        defaults to ["VISIT_ID", "label"]
    :return: pandas df
    '''
    # A list literal as a default argument is shared between calls; build it per call.
    if general_feature is None:
        general_feature = ["VISIT_ID", "label"]

    # Collect every piece first and concatenate once, instead of re-copying the
    # growing frame on each loop iteration.
    frames = [df[general_feature]]
    frames.extend(create_lstm_numerical_array_feature(df, feature)
                  for feature in num_ts_list)
    frames.extend(create_lstm_numerical_non_array_feature(df, feature)
                  for feature in num_list)

    # Reverse so the last processed feature ends up left-most, as before.
    return pd.concat(frames[::-1], axis=1)

def create_lstm_numerical_array_feature(df, feature):
    '''
    Expand one vector-valued column into seven numeric columns.

    Every cell of df[feature] must expose a toArray() method (e.g. a Spark ML
    vector - presumably, verify against the parquet schema) yielding 7 values.
    The output columns are named <feature>_t1 ... <feature>_t7.

    :param df: raw pandas df for the LSTM model
    :param feature: name of the vector-valued column to expand
    :return: pandas df with one row per input row and a fresh default index
    '''
    # One NumPy row per record, stacked into an (n_rows, 7) matrix.
    per_row = [np.array(vector.toArray()) for vector in df[feature]]
    matrix = np.vstack(per_row)

    column_names = [feature + ('_t%d' % step) for step in range(1, 8)]
    return pd.DataFrame(matrix, columns=column_names)

def lstm_feature_generator(x, y, timesteps=7):
    '''
    Turn the flat feature frame into the 3-D array an LSTM layer consumes.

    The columns of x are expected in the feature-major layout produced by
    create_lstm_feature: all time steps of one feature side by side
    (<f>_t1 ... <f>_tN), followed by all time steps of the next feature.
    A plain reshape to (samples, timesteps, features) would read that layout
    as time-major and mix time steps with features whenever there is more
    than one feature, so the data is reshaped per feature first and the last
    two axes are then swapped.

    :param x: pandas df of training features, shape (samples, features * timesteps)
    :param y: pandas df/series of training labels
    :param timesteps: number of time steps per feature (default 7)
    :return: (input_x, input_y) - np.array of shape (samples, timesteps, features)
             and the label values unchanged (no one-hot encoding is applied)
    :raises ValueError: if the column count is not a multiple of timesteps
    '''
    n_samples, n_columns = x.shape
    n_features, leftover = divmod(n_columns, timesteps)
    if leftover:
        raise ValueError(
            'expected a multiple of %d feature columns, got %d' % (timesteps, n_columns))

    # (samples, features, timesteps) -> (samples, timesteps, features);
    # copy() makes the transposed result contiguous.
    per_feature = x.values.reshape(n_samples, n_features, timesteps)
    input_x = per_feature.transpose(0, 2, 1).copy()

    input_y = y.values
    return input_x, input_y

def create_grid_search_dict(model):
    '''
    Build the hyper-parameter grid for a model from resources/application.conf.

    The config section named after the model must contain the option
    "grid_search_var_list" (a Python-literal list of hyper-parameter names)
    and one option per listed name holding a Python-literal list of values.

    :param model: config section name of the model
    :return: dict mapping hyper-parameter name -> list of candidate values
    '''
    # Local import: ConfigParser is not among this file's explicit imports.
    from configparser import ConfigParser

    # module_path comes from os.path.abspath and has no trailing separator, so
    # the path must be joined rather than formatted into '{}resources/...'.
    conf_path = os.path.join(module_path, 'resources', 'application.conf')

    parser = ConfigParser(allow_no_value=True)
    # NOTE: read() silently skips a missing file; the get() below then fails
    # with a "no section" error.
    parser.read(conf_path)

    # literal_eval only accepts Python literals, so config text is never executed.
    hyper_params = ast.literal_eval(parser.get(model, "grid_search_var_list"))
    return {
        hyper_param: ast.literal_eval(parser.get(model, hyper_param))
        for hyper_param in hyper_params
    }

def create_early_stopping():
    '''
    Build the early-stopping callback used while fitting the model.
    :return: a single EarlyStopping callback (not a list) that monitors
             'val_acc' with a patience of 100 epochs, silent, mode 'auto'
    '''
    return EarlyStopping(
        monitor='val_acc',
        mode='auto',
        patience=100,
        verbose=0,
    )

def undersampling(y_df, x_df):
    '''
    Balance a binary-labelled data set by randomly undersampling the larger class.

    Rows of the majority class are drawn without replacement (NumPy global
    RNG) until both classes have the same number of rows. When label 1 is the
    majority - the only case handled previously - the draw and the row order
    (all label-0 rows, then the sampled label-1 rows) are unchanged. When
    label 0 is the majority, which used to raise ValueError, the label-0 rows
    are sampled instead.

    :param y_df: pandas df with a 'label' column holding 0 / 1
    :param x_df: pandas df of features sharing y_df's index
    :return: (under_sample_y_df, under_sample_x_df) restricted to the kept rows
    '''
    labels = y_df['label']
    neg_label_indices = y_df[labels == 0].index
    pos_label_indices = y_df[labels == 1].index
    num_neg_labels = len(neg_label_indices)
    num_pos_labels = len(pos_label_indices)

    if num_pos_labels >= num_neg_labels:
        # Historical path: shrink the positive class to the size of the negative one.
        sampled_pos = np.random.choice(pos_label_indices, num_neg_labels, replace=False)
        under_sample_indices = np.concatenate([neg_label_indices, sampled_pos])
    else:
        # Negatives are the majority: shrink them instead of failing.
        sampled_neg = np.random.choice(neg_label_indices, num_pos_labels, replace=False)
        under_sample_indices = np.concatenate([sampled_neg, pos_label_indices])

    under_sample_y_df = y_df.loc[under_sample_indices]
    under_sample_x_df = x_df.loc[under_sample_indices]
    return under_sample_y_df, under_sample_x_df

# Build the ROC-curve trace for one set of predictions
def create_roc_trace(df, label_col, pred_col):
    '''
    Create a plotly line trace of the ROC curve for one set of scores.
    :param df: pandas df holding the true labels and the predicted scores
    :param label_col: name of the column with the true labels
    :param pred_col: name of the column with the predicted scores
    :return: go.Scatter trace (FPR on x, TPR on y) whose legend entry shows the AUC
    '''
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(df[label_col], df[pred_col])
    area = auc(fpr, tpr)
    legend = 'ROC curve (area = %0.2f)' % area

    return go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        line=dict(width=2),
        name=legend,
    )

def create_overlay_roc_curve(trace_list):
    '''
    Plot one or several ROC traces over the dashed chance diagonal.

    Despite its name the parameter used to be wrapped as [trace_list], so only
    a single trace worked and an actual list produced a nested, invalid data
    list. Both forms are accepted now.

    :param trace_list: a single plotly trace, or a list/tuple of traces
                       (e.g. from create_roc_trace)
    :return: whatever iplot returns for the rendered figure
    '''
    if isinstance(trace_list, (list, tuple)):
        traces = list(trace_list)
    else:
        traces = [trace_list]

    # Diagonal from (0, 0) to (1, 1), kept out of the legend.
    chance_line = go.Scatter(x=[0, 1], y=[0, 1],
                             mode='lines',
                             line=dict(color='navy', width=2, dash='dash'),
                             showlegend=False)
    data = traces + [chance_line]

    # Both axes share the same range and tick spacing.
    layout = go.Layout(title='<b>Receiver Operating Characteristic Curve',
                       height=500,
                       width=700,
                       xaxis=dict(title='False Positive Rate',
                                  range=[0, 1],
                                  tick0=0,
                                  dtick=0.1),
                       yaxis=dict(title='True Positive Rate',
                                  range=[0, 1],
                                  tick0=0,
                                  dtick=0.1))
    fig = go.Figure(data=data, layout=layout)
    return iplot(fig, filename='overlaid histogram')


## Some Preprocessing

In [None]:
# Build the flat feature frame (time series exploded into one column per step),
# keeping only the label as pass-through column, then drop the Age columns
# for steps 8-11.
feature_df = (
    create_lstm_feature(df, num_ts_list, num_list, general_feature=["label"])
    .drop(['Age_t8', 'Age_t9', 'Age_t10', 'Age_t11'], axis=1)
)

In [None]:
# Separate inputs from the target: every column except 'label' is an input,
# the 'label' column alone (kept as a one-column DataFrame) is the output.
inputs_x = feature_df.loc[:, feature_df.columns != 'label']
outputs_y = feature_df.loc[:, feature_df.columns == 'label']
# Train / validation / test split: 20% of all rows are held out as the test set,
# then 20% of the remaining rows become the validation set.
# No random_state is passed to either call.
x_train1, x_test1, y_train1, y_test1 = train_test_split(inputs_x, outputs_y, test_size=0.2)
x_train2, x_val1, y_train2, y_val1 = train_test_split(x_train1, y_train1, test_size=0.2)

In [None]:
# Balance the two label classes in the training and validation sets by random
# undersampling. Note that undersampling() returns (y, x), in that order.
# The test set is not passed through undersampling().
y_train3, x_train3 = undersampling(y_train2, x_train2)
y_val2, x_val2 = undersampling(y_val1, x_val1)

In [None]:
# Convert each (features, labels) pair of DataFrames into NumPy arrays:
# lstm_feature_generator reshapes the features into the 3-D array required as
# LSTM input and returns the label values alongside.
x_train4, y_train4 = lstm_feature_generator(x_train3, y_train3)
x_val3, y_val3 = lstm_feature_generator(x_val2, y_val2)
x_test2, y_test2 = lstm_feature_generator(x_test1, y_test1)

In [None]:
# Give the final arrays short names for the modelling cells below
x_train, y_train = x_train4, y_train4
x_val, y_val = x_val3, y_val3
x_test, y_test = x_test2, y_test2
# Cell output: the shapes of all six arrays
(x_train.shape, x_val.shape, x_test.shape,
 y_train.shape, y_val.shape, y_test.shape)

## The Model

In [None]:
# Builds, compiles and fits the Bi-LSTM model
def create_BiLSTM(x_train, y_train, x_val, y_val, neurons, merge_mode, dropout, lr, batch_size, epochs):
    '''
    Build, compile and fit a one-layer bidirectional LSTM binary classifier.

    Architecture: Bidirectional(LSTM) -> Dropout -> Dense(1) -> sigmoid,
    trained with binary cross-entropy and the Nadam optimizer, tracking
    accuracy and using the callback from create_early_stopping().
    (Previously tried alternatives: Adam / RMSprop optimizers and a plain
    LSTM layer with a max_norm(2) kernel constraint.)

    :param x_train: training features; axes 1 and 2 define the layer's input_shape
    :param y_train: training labels
    :param x_val: validation features
    :param y_val: validation labels
    :param neurons: number of units of the LSTM layer
    :param merge_mode: how the forward and backward outputs are combined
    :param dropout: rate of the Dropout layer
    :param lr: learning rate handed to the optimizer
    :param batch_size: batch size used by fit
    :param epochs: maximum number of epochs used by fit
    :return: the fitted keras Sequential model
    '''
    step_count, feature_count = x_train.shape[1], x_train.shape[2]
    recurrent = Bidirectional(
        LSTM(neurons),
        merge_mode=merge_mode,
        input_shape=(step_count, feature_count),
    )
    model = Sequential([
        recurrent,
        Dropout(dropout),
        Dense(1),
        Activation('sigmoid'),
    ])

    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizers.Nadam(lr=lr),
        metrics=['acc'],
    )

    model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        callbacks=[create_early_stopping()],
    )
    return model


In [None]:
# Train the Bi-LSTM with the chosen hyper-parameters
model = create_BiLSTM(
    x_train, y_train, x_val, y_val,
    neurons=200,
    merge_mode='ave',
    dropout=0.01,
    lr=0.0005,
    batch_size=500,
    epochs=1000,
)

## Model Evaluation

In [None]:
# Test set: specificity tn/(tn+fp) and sensitivity tp/(tp+fn)
y_pred_test = model.predict_classes(x_test)
# Unpack the 2x2 confusion matrix row by row, in the same order as ravel()
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred_test)
test_spec = tn / (tn + fp)
test_sens = tp / (tp + fn)
test_spec, test_sens

In [None]:
# Train set: specificity tn/(tn+fp) and sensitivity tp/(tp+fn)
y_pred_train = model.predict_classes(x_train)
# Unpack the 2x2 confusion matrix row by row, in the same order as ravel()
(tn, fp), (fn, tp) = confusion_matrix(y_train, y_pred_train)
train_spec = tn / (tn + fp)
train_sens = tp / (tp + fn)
train_spec, train_sens

In [None]:
# Test set: F1 score of the class predictions (y_pred_test is computed in the
# test-set specificity/sensitivity cell, which must have been run first).
f1_test = f1_score(y_test, y_pred_test)
f1_test

In [None]:
# Train set: F1 score of the class predictions (y_pred_train is computed in the
# train-set specificity/sensitivity cell, which must have been run first).
f1_train = f1_score(y_train, y_pred_train)
f1_train

In [None]:
# Test set: ROC curve built from the model's raw prediction scores
y_test_score = model.predict(x_test)
# Flatten labels and scores to plain lists, one entry per sample
d = {
    'test_labels': y_test.reshape(len(y_test)).tolist(),
    'test_probabilities': y_test_score.reshape(len(y_test_score)).tolist(),
}
test_prob_df = pd.DataFrame(data=d)
roc_trace = create_roc_trace(test_prob_df, 'test_labels', 'test_probabilities')
create_overlay_roc_curve(roc_trace)

In [None]:
# Train set: ROC curve built from the model's raw prediction scores
y_train_score = model.predict(x_train)
# Flatten labels and scores to plain lists, one entry per sample
d = {
    'train_labels': y_train.reshape(len(y_train)).tolist(),
    'train_probabilities': y_train_score.reshape(len(y_train_score)).tolist(),
}
train_prob_df = pd.DataFrame(data=d)
roc_trace = create_roc_trace(train_prob_df, 'train_labels', 'train_probabilities')
create_overlay_roc_curve(roc_trace)