# Notebook Description
Initial notebook for a larger analysis over multiple reruns. Includes random-search hyperparameter tuning and evaluation over multiple reruns for all data types: baseline, random, and balanced.

In [None]:
from helper_functions.nn import create_nn
from helper_functions.notebook_utils import BinaryExpectedCalibrationError, balance_classes, encode_and_split_new, equate_weights, generate_one_hot, paired_t_test, train_simultaneously, two_sample_t_test, verify_proportions
from helper_functions.testing import encode_and_split


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io

import tensorflow as tf

from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import ttest_rel
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

from keras.optimizers import Adam
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dropout
from tensorflow.keras.initializers import GlorotUniform

import keras.backend as K
from sklearn.calibration import calibration_curve

import h5py
import shutil

In [None]:
file_path = 'combined_w_age_and_insurance.csv'  # Update with your file path

# Load the dataset. Each failure is reported in a readable form and then
# re-raised: without `merged_df` nothing below can run, and swallowing the
# error would only surface later as an unrelated NameError.
try:
    merged_df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise
except pd.errors.ParserError:
    print(f"Error: Unable to parse the CSV file at {file_path}. Check the file format.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

# Remove the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was saved -- confirm). errors='ignore' keeps the cell re-runnable on a
# file that does not carry that column.
merged_df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
# --- Race bucketing -------------------------------------------------------
# Every race label outside the frequent set is collapsed into 'OTHER'.
frequent_races = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN']
is_infrequent_race = ~merged_df['race'].isin(frequent_races)
merged_df.loc[is_infrequent_race, 'race'] = 'OTHER'

columns = merged_df.columns

# --- Numeric features -----------------------------------------------------
# Columns at positions 5..12 plus 'age' are treated as numeric.
numerical_columns = [*merged_df.columns[5:13], 'age']

# Min-max scale each numeric column independently, in place.
for column in numerical_columns:
    scaler = MinMaxScaler()
    values = merged_df[[column]].values  # 2-D (n_rows, 1), as the scaler expects
    merged_df[column] = scaler.fit_transform(values)

copy = merged_df.copy()

# --- Categorical features -------------------------------------------------
categorical_columns = ['gender', 'arrival_transport', 'race']
numeric_values = copy[numerical_columns].values

# One-hot encode the categorical columns and densify the result.
encoder = OneHotEncoder()
categorical_values = encoder.fit_transform(copy[categorical_columns]).toarray()

# Numeric block first, one-hot block second. 'race' is the last categorical
# column, so its one-hot columns sit at the far right of the matrix.
X_num_cat = np.hstack((numeric_values, categorical_values))

# --- Target ---------------------------------------------------------------
disposition_column = merged_df['disposition'].values

label_encoder = LabelEncoder()
Y_encoded = label_encoder.fit_transform(disposition_column)
# Flip the encoded labels (0 <-> 1); assumes 'disposition' is binary -- TODO confirm.
Y = 1 - Y_encoded

# --- Text + tabular features combined ---------------------------------------
text = merged_df['chiefcomplaint'].values

# The raw complaint text becomes column 0, followed by the numeric/one-hot block.
string_array_reshaped = text[:, np.newaxis]
X_combined = np.hstack((string_array_reshaped, X_num_cat))

# Race labels in the order the trailing one-hot columns are expected to follow
# (presumably the encoder's category order -- verify against encoder.categories_).
races = ['ASIAN', 'BLACK', 'HISPANIC', 'OTHER', 'WHITE']

In [None]:
# Sanity check: the trailing one-hot columns of X_combined must encode the same
# race as merged_df['race'], row by row (rows compared by position, so the
# check does not depend on merged_df having a default 0..n-1 index).
n_races = len(races)
race_is_hot = X_combined[:, -n_races:] == 1

# Position of the hot column per row, and whether the row is a valid one-hot
# vector at all. A row with zero or several hot columns is reported as a
# mismatch rather than silently reusing an index from a previous row.
encoded_index = race_is_hot.argmax(axis=1)
has_single_hot = race_is_hot.sum(axis=1) == 1

# Labels that are not in `races` map to -1, which can never equal a column index.
race_position = {race: position for position, race in enumerate(races)}
expected_index = np.array(
    [race_position.get(label, -1) for label in merged_df['race']],
    dtype=int,
)

mismatched_rows = ~has_single_hot | (encoded_index != expected_index)
n_mismatched = int(mismatched_rows.sum())
if n_mismatched:
    print(f'Something went wrong: {n_mismatched} row(s) have a race one-hot '
          f'encoding that does not match the race column')

# Share of the dataset belonging to each race, in the order of `races`.
proportions = []

for race in races:
    print(f'Race: {race}')
    pct = len(merged_df[merged_df['race'] == race]) / len(merged_df)
    proportions.append(pct)
    print(f'- {round(pct*100, 2)}% of dataset')

# Baseline Model Hyperparameter Tuning & Testing

In [None]:
# Hold out 30% of the data as the test set (fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(X_combined, Y, test_size=0.3, random_state=42)

# Carve the validation set out of the training portion only. Splitting
# X_combined a second time would put test rows into the training/validation
# sets and make every validation score optimistic.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.3, random_state=42)

# Dictionary of boolean masks, one per race, marking which rows of the final
# X_train belong to that race. Built AFTER the last split so the masks line up
# with the X_train that is actually used for training.
masks = {
    race: X_train[:, -len(races) + position] == 1
    for position, race in enumerate(races)
}

Y_train = Y_train.astype('float32')
Y_val = Y_val.astype('float32')

# Project helper; presumably encodes the text column and separates it from the
# numeric/categorical columns for both splits -- confirm against its definition.
X_num_train, X_num_val, X_enc_train, X_enc_val = encode_and_split_new(X_train, X_val)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from sklearn.model_selection import train_test_split
import numpy as np
import random


def create_model(shape_text, shape_num, layer1_values, layer2_values, num_bins=10, dropout=0):
    """Build and compile the two-branch (text + numeric) binary classifier.

    The encoded-text input passes through its own stack of ReLU Dense layers,
    is concatenated with the raw numeric/categorical input, and the result
    passes through a second Dense stack ending in a single sigmoid unit.

    Args:
        shape_text: Width of the encoded-text input vector.
        shape_num: Width of the numeric/categorical input vector.
        layer1_values: Iterable of unit counts, one Dense layer each, for the
            text branch.
        layer2_values: Iterable of unit counts, one Dense layer each, applied
            after concatenation.
        num_bins: Forwarded to ``BinaryExpectedCalibrationError`` (project
            helper used as the loss).
        dropout: Dropout rate inserted after every hidden Dense layer; no
            Dropout layers are added when it is 0.

    Returns:
        A compiled Keras ``Model`` whose inputs are ordered
        ``[numeric_input, encoded_text_input]``.
    """
    # Define input shapes
    text_input = Input(shape=(shape_text,), name='encoded_text_input')
    combined_input = Input(shape=(shape_num,), name='numeric_input')

    # Text branch. Each layer gets its own initializer instance: a single
    # unseeded instance reused across layers can produce identical initial
    # weights for layers of the same shape.
    text_model = text_input
    for units in layer1_values:
        text_model = Dense(units, activation='relu', kernel_initializer=GlorotUniform())(text_model)
        if dropout > 0:
            text_model = Dropout(dropout)(text_model)

    # Concatenate text branch output with the numeric/categorical input
    combined_with_text = Concatenate()([combined_input, text_model])

    # Shared layers after concatenation, with optional dropout
    for units in layer2_values:
        combined_with_text = Dense(units, activation='relu', kernel_initializer=GlorotUniform())(combined_with_text)
        if dropout > 0:
            combined_with_text = Dropout(dropout)(combined_with_text)

    # Output layer
    output = Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform())(combined_with_text)

    # Define model; input order must match the list passed to fit()/predict()
    model = Model(inputs=[combined_input, text_input], outputs=output)

    loss = BinaryExpectedCalibrationError(num_bins=num_bins)
    opt = tf.keras.optimizers.Adam()

    # The metric is named 'auc', not 'val_auc': Keras adds the 'val_' prefix to
    # validation metrics itself, so the training value is logged as 'auc' and
    # the validation value as 'val_auc'. Naming the metric 'val_auc' would make
    # 'val_auc' the TRAINING AUC and hide the validation one under 'val_val_auc'.
    model.compile(optimizer=opt,
                  loss=loss,
                  metrics=['accuracy', AUC(name='auc')])

    return model

# Custom callback to track overall best AUC across all iterations
class PrintOnBestAUC(Callback):
    """Report and checkpoint whenever the validation AUC beats a shared best.

    The best value lives in a one-element list supplied by the caller, so it
    survives across several ``fit`` calls (each with a fresh callback
    instance) and can be read back by the caller afterwards.

    Args:
        best_auc_global_ref: One-element mutable list; index 0 holds the best
            AUC seen so far and is updated in place.
        min_diff: Minimum improvement over the stored best required before a
            new best is recorded.
        save_path: File the improved model is written to.
    """

    def __init__(self, best_auc_global_ref, min_diff=0.001, save_path='best_model.keras'):
        super().__init__()
        self.best_auc_global_ref = best_auc_global_ref
        self.min_diff = min_diff
        self.save_path = save_path

    def on_epoch_end(self, epoch, logs=None):
        """Compare this epoch's validation AUC with the shared best and save on improvement."""
        logs = logs or {}

        # Keras prefixes validation metrics with 'val_'. If the metric itself
        # was named 'val_auc' at compile time, the validation value is logged
        # as 'val_val_auc' and plain 'val_auc' is the training value, so the
        # doubly-prefixed key is preferred when it exists.
        current_auc = logs.get("val_val_auc", logs.get("val_auc"))

        print(f"Epoch {epoch + 1}:")
        if current_auc is None:
            # No validation data or no AUC metric: nothing to compare against.
            print("Validation AUC not found in logs; skipping best-model check\n")
            return

        best_so_far = self.best_auc_global_ref[0]
        # Check against global best AUC stored in the list
        if current_auc > best_so_far + self.min_diff:
            print(f" Best Validation Set AUC improved from {best_so_far} to {current_auc}")
            self.best_auc_global_ref[0] = current_auc
            print(f"  Training loss: {logs['loss']:.4f} - accuracy: {logs['accuracy']:.4f}")
            print(f"  Validation loss: {logs['val_loss']:.4f} - accuracy: {logs['val_accuracy']:.4f} - AUC: {current_auc:.4f}")
            # Save the model this callback is attached to (set by Keras during
            # fit), not whatever a global named `model` happens to point at.
            print("Saving best model...\n")
            self.model.save(self.save_path)  # Save only when overall best AUC improves
        else:
            print(f"AUC did not improve from {best_so_far}\n")

In [None]:
# Report how many GPUs TensorFlow can see
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

# Report the default GPU device name; an empty name means no GPU is in use
default_gpu_name = tf.test.gpu_device_name()
if not default_gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device: {}'.format(default_gpu_name))

In [None]:
# Define hyperparameter ranges
layer1_units_range = [32, 64, 128]
num_layer1_range = [1, 2, 3]
layer2_units_range = [32, 64, 128]
num_layer2_range = [1, 2, 3]
dropout_range = [0.0, 0.25]
num_bins_range = [4, 8, 16]
batch_size_range = [32, 64, 128, 1024]

# Define number of random search iterations
n_iterations = 100
best_loss = float("inf")  # Lowest validation loss seen in ANY iteration (reporting only)
best_auc_global = [-np.inf]  # Use a list for best AUC to allow for updates inside the callback

# Hyperparameters of the configuration that produced the best AUC so far,
# i.e. the configuration of the model the callback last saved to disk.
best_hyperparams = {}

# Random search: sample a configuration, train it, keep it if it set a new best AUC
for i in range(n_iterations):
    print(f"\n ---------- Random search iteration {i + 1}/{n_iterations} ----------\n")

    # Randomly select hyperparameters
    layer1_units = random.choice(layer1_units_range)
    num_layer1 = random.choice(num_layer1_range)
    layer2_units = random.choice(layer2_units_range)
    num_layer2 = random.choice(num_layer2_range)
    dropout = random.choice(dropout_range)
    num_bins = random.choice(num_bins_range)
    batch_size = random.choice(batch_size_range)

    print(f"Hyperparameters: \n- layer1_units={layer1_units}, \n- num_layer1={num_layer1}, \n- layer2_units={layer2_units}, \n- num_layer2={num_layer2}, \n- dropout={dropout}, \n- num_bins={num_bins}, \n- batch_size={batch_size}\n")

    # Drop Keras global state left over from the previous iteration's model;
    # otherwise memory use grows with every model built in this loop.
    tf.keras.backend.clear_session()

    # Create the model
    model = create_model(
        shape_text=X_enc_train.shape[1],
        shape_num=X_num_train.shape[1],
        layer1_values=[layer1_units] * num_layer1,
        layer2_values=[layer2_units] * num_layer2,
        num_bins=num_bins,
        dropout=dropout
    )

    # Callback for early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    # Create the callback with a reference to the global best AUC (using list)
    print_callback = PrintOnBestAUC(best_auc_global)

    # Remember the best AUC before training so we can tell afterwards whether
    # THIS configuration is the one that raised it (and was therefore saved).
    auc_before_fit = best_auc_global[0]

    # Train the model
    history = model.fit(
        [X_num_train, X_enc_train], Y_train,
        validation_data=([X_num_val, X_enc_val], Y_val),
        epochs=20,  # You can adjust the number of epochs
        batch_size=batch_size,
        callbacks=[early_stopping, print_callback],
        verbose=0  # Set verbose to 0 to suppress default logging
    )

    # Best validation loss reached by this configuration
    val_loss = min(history.history['val_loss'])
    best_loss = min(best_loss, val_loss)

    # Selection uses a single criterion, AUC, the same one that decides which
    # model is saved. Mixing "loss improved OR AUC improved" lets a worse loss
    # overwrite best_loss and lets the stored hyperparameters drift away from
    # the saved model.
    if best_auc_global[0] > auc_before_fit:
        best_hyperparams = {
            'layer1_units': layer1_units,
            'num_layer1': num_layer1,
            'layer2_units': layer2_units,
            'num_layer2': num_layer2,
            'dropout': dropout,
            'num_bins': num_bins,
            'batch_size': batch_size,
            'best_loss': val_loss,  # validation loss of this configuration
            'best_auc': best_auc_global[0]
        }
        print(f"Best model updated with validation loss: {val_loss:.4f} and AUC: {best_auc_global[0]:.4f}")

# Print the best hyperparameters and final results at the end of the search
print(f"\nRandom search complete. Best validation loss: {best_loss:.4f}, Best AUC: {best_auc_global[0]:.4f}")
print(f"Best hyperparameters: {best_hyperparams}")