In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Methods

- We will build models reflecting three levels of available data:
    - "Easy" is mostly data already available to the emergency dispatcher before the notification comes in, such as month, day of week, hour, weather, and urban/rural setting.

# Setup

## Import Libraries

In [2]:
print ('Install Packages')

import sys, copy, math, time, os, gc

print ('Python version: {}'.format(sys.version))

#from collections import Counter

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)

import scipy as sc
print ('SciPy version:  {}'.format(sc.__version__))

import tensorflow as tf
print ('TensorFlow version:  {}'.format(tf.__version__))
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from tensorflow import keras
print ('Keras version:  {}'.format(keras.__version__))

from keras import layers
import keras.backend as K
#from keras.layers import IntegerLookup
#from keras.layers import Normalization
#from keras.layers import StringLookup
#from keras.utils import get_custom_objects
#from keras.utils import tf_utils

from keras.models import Sequential
from keras.layers import Dense
from keras import callbacks

#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
#    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

import matplotlib.pyplot as plt
%matplotlib inline

# Library for reading Microsoft Access files
#import pandas_access as mdb

import sklearn
print ('SciKit-Learn version: {}'.format(sklearn.__version__))
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
#from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import imblearn
print ('Imbalanced-Learn version: {}'.format(imblearn.__version__))
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

#!pip install pydot

# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
# One shared seed is applied to the Python, NumPy and TensorFlow random number
# generators.  Note that the driver loop further down reassigns `random_seed`
# and re-seeds all three generators for every run.
import random
random_seed = 123
print ('random_seed =', random_seed)
np.random.seed(random_seed) # NumPy
random.seed(random_seed) # Python
tf.random.set_seed(random_seed) # Tensorflow

import warnings
# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries imported above).
warnings.filterwarnings('ignore')

print ('Finished Installing Packages')

Install Packages
Python version: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:51:49) [Clang 16.0.6 ]
NumPy version: 1.26.4
SciPy version:  1.13.1
TensorFlow version:  2.16.2
Keras version:  3.4.1
Pandas version:  2.2.2
SciKit-Learn version: 1.5.0
Imbalanced-Learn version: 0.12.3
random_seed = 123
Finished Installing Packages


## Get Data

In [3]:
def Get_Data(filename):
    """Load a CSV file into a pandas DataFrame and report its shape.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    print ('Get_Data()')
    # low_memory=False makes pandas read the file in one pass instead of chunks.
    frame = pd.read_csv(filename, low_memory=False)
    print ('data.shape: ', frame.shape)
    print ('End Get_Data()')
    print ()
    return frame



# Models

# Five-Fold Cross Validation

In [4]:
def Five_Fold_Cross_Validation(data, model, filename, title):
    """Run stratified five-fold cross-validation and save the pooled predictions.

    For each fold the estimator is re-fitted on the training rows and asked
    for class probabilities on the held-out rows.  The true labels, the
    probability of class 1 and the rounded prediction of every held-out row
    are collected (in fold order) and written to
    ``'../../Big_Files/CRSS_05_' + filename + '.csv'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the label column ``'HOSPITAL'`` (the label column name
        is fixed inside this function).
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    filename : str
        Tag printed as a banner and embedded in the output CSV name.
    title : str
        Currently unused; only the disabled plotting call referred to it.

    Returns
    -------
    None
    """
    for banner_line in ('', '------------------------', '', filename, ''):
        print (banner_line)

    target = 'HOSPITAL'
    splitter = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=random.randint(1,100),
    )
    labels = data.loc[:, target]

    # Pooled results over all folds, appended in fold order.
    pooled_truth = []
    pooled_proba = []
    pooled_pred = []

    for fold_number, (train_rows, test_rows) in enumerate(splitter.split(data, labels)):
        print ('K-fold iteration = ', fold_number)

        train_part = data.iloc[train_rows]
        test_part = data.iloc[test_rows]

        features_train = train_part.drop(columns=[target])
        features_test = test_part.drop(columns=[target])
        labels_train = train_part[target].squeeze()
        labels_test = test_part[target].squeeze()

        model.fit(features_train, labels_train.values.ravel())
        fold_scores = model.predict_proba(features_test)

        # Column 1 holds the probability of the positive class.
        positive_proba = [row[1] for row in fold_scores]
        # np.around rounds to the nearest integer (exact halves go to the even value).
        rounded = list(np.around(np.array(positive_proba), 0))

        pooled_truth.extend(labels_test.to_list())
        pooled_proba.extend(positive_proba)
        pooled_pred.extend(rounded)

    results = pd.DataFrame(np.array(pooled_truth), columns=['y_test'])
    results['y_proba'] = np.array(pooled_proba)
    results['y_pred'] = np.array(pooled_pred)
    results.to_csv('../../Big_Files/CRSS_05_' + filename + '.csv')

#    Chart_and_Plots(y_test, y_proba, y_pred, filename, title)

    print ()


In [5]:
def BRFC_5_Fold(data, target, alpha, filename):
    """Cross-validate a class-weighted BalancedRandomForestClassifier.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set forwarded to ``Five_Fold_Cross_Validation``.
    target : str
        Not used by this function.
    alpha : float
        Weight given to class 1; class 0 receives ``1 - alpha``.
    filename : str
        Output tag forwarded to ``Five_Fold_Cross_Validation``.
    """
    forest_settings = dict(
        bootstrap=True,
        ccp_alpha=0.0,
        criterion='gini',
        max_depth=None,            # alternative tried: 40
        max_features='sqrt',
        max_leaf_nodes=None,       # alternative tried: 10000
        max_samples=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=100,          # alternative tried: 1000
        n_jobs=None,
        oob_score=False,
        random_state=random.randint(1,100),
        replacement=False,
        sampling_strategy='auto',
        verbose=0,
        warm_start=False,
        class_weight={0: 1-alpha, 1: alpha},
    )
    balanced_forest = BalancedRandomForestClassifier(**forest_settings)
    Five_Fold_Cross_Validation(data, balanced_forest, filename, '')



In [6]:
def RFC_5_Fold(data, target, filename):
    """Cross-validate a depth-2 RandomForestClassifier.

    ``target`` is accepted but not used here; ``data`` and ``filename`` are
    forwarded to ``Five_Fold_Cross_Validation`` with an empty title.
    """
    seed = random.randint(1,100)
    shallow_forest = RandomForestClassifier(max_depth=2, random_state=seed)
    Five_Fold_Cross_Validation(data, shallow_forest, filename, '')


In [7]:
def AdaBoost_5_Fold(data, target, filename):
    """Cross-validate an AdaBoostClassifier with 100 estimators.

    ``target`` is accepted but not used here; ``data`` and ``filename`` are
    forwarded to ``Five_Fold_Cross_Validation`` with an empty title.
    """
    seed = random.randint(1,100)
    booster = AdaBoostClassifier(n_estimators=100, random_state=seed)
    Five_Fold_Cross_Validation(data, booster, filename, '')


In [8]:
def RUSBoost_5_Fold(data, target, filename):
    """Cross-validate a RUSBoostClassifier built on depth-1 decision trees.

    ``target`` is accepted but not used here; ``data`` and ``filename`` are
    forwarded to ``Five_Fold_Cross_Validation`` with an empty title.
    """
    # Weak learner: a single-split decision tree.
    # Disabled option: class_weight={0:(1+r_target)/(2*r_target), 1:(1+r_target)/(2*1)}
    stump = DecisionTreeClassifier(max_depth=1)

    # NOTE(review): 'SAMME.R' is reported as deprecated in recent scikit-learn
    # releases -- confirm before upgrading the environment.
    booster = RUSBoostClassifier(
        estimator=stump,
        n_estimators=1000,
        algorithm='SAMME.R',
        random_state=random.randint(1,100),
    )
    Five_Fold_Cross_Validation(data, booster, filename, '')


In [9]:
def BalancedBagging_5_Fold(data, target, filename):
    """Cross-validate a BalancedBaggingClassifier with library defaults.

    Only ``random_state`` is set explicitly.  ``target`` is accepted but not
    used here; ``data`` and ``filename`` are forwarded to
    ``Five_Fold_Cross_Validation`` with an empty title.
    """
    seed = random.randint(1,100)
    bagger = BalancedBaggingClassifier(random_state=seed)
    Five_Fold_Cross_Validation(data, bagger, filename, '')
    

In [10]:
def EasyEnsemble_5_Fold(data, target, filename):
    """Cross-validate an EasyEnsembleClassifier of ten 10-estimator AdaBoost models.

    ``target`` is accepted but not used here; ``data`` and ``filename`` are
    forwarded to ``Five_Fold_Cross_Validation`` with an empty title.
    """
    # Draw the seeds in the same order as the objects are configured:
    # first the inner AdaBoost learner, then the surrounding ensemble.
    inner_seed = random.randint(1,100)
    outer_seed = random.randint(1,100)

    inner_booster = AdaBoostClassifier(n_estimators=10, random_state=inner_seed)
    ensemble = EasyEnsembleClassifier(
        n_estimators=10,
        estimator=inner_booster,
        random_state=outer_seed,
    )
    Five_Fold_Cross_Validation(data, ensemble, filename, '')

    

In [11]:
def LogisticRegression_5_Fold(data, target, alpha, filename):
    """Cross-validate a class-weighted LogisticRegression.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set forwarded to ``Five_Fold_Cross_Validation``.
    target : str
        Not used by this function.
    alpha : float
        Weight given to class 1; class 0 receives ``1 - alpha``.
    filename : str
        Output tag forwarded to ``Five_Fold_Cross_Validation``.
    """
    # Disabled option: class_weight={0:(1+r_target)/(2*r_target), 1:(1+r_target)/(2*1)}
    weights = {0: 1-alpha, 1: alpha}
    seed = random.randint(1,100)
    classifier = LogisticRegression(
        class_weight=weights,
        max_iter=1000,
        random_state=seed,
    )
    Five_Fold_Cross_Validation(data, classifier, filename, '')


In [12]:
def KBFC_5_Fold(data, target, alpha, gamma, filename):
    """Five-fold cross-validation of a Keras network trained with binary focal loss.

    A fresh one-hidden-layer network (60 ReLU units, sigmoid output) is built
    and trained for every fold.  The true labels, the probability of class 1
    and the rounded prediction of every held-out row are pooled in fold order,
    rows containing NaN are dropped, and the result is written to
    ``'../../Big_Files/CRSS_05_' + filename + '.csv'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Features plus the label column named by ``target``.
    target : str
        Name of the label column.  (Previously this argument was overwritten
        with ``'HOSPITAL'`` inside the function and therefore ignored.)
    alpha : float
        ``alpha`` of ``BinaryFocalCrossentropy`` (class balancing is enabled).
    gamma : float
        ``gamma`` (focusing parameter) of ``BinaryFocalCrossentropy``.
    filename : str
        Tag printed as a banner and embedded in the output CSV name.

    Returns
    -------
    None
    """
    print ()
    print ('------------------------')
    print ()
    print (filename)
    print ()

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random.randint(1,100))
    target_column = data.loc[:, target]
    # Every column except the label is a model input.
    n_features = data.shape[-1] - 1

    y_test = []
    y_proba = []
    y_pred = []

    for iteration, (train_index, test_index) in enumerate(skf.split(data, target_column)):
        print ()
        print ()
        print ('K-fold iteration = ', iteration)

        train_fold = data.iloc[train_index]
        test_fold = data.iloc[test_index]

        X_train_fold = train_fold.drop(columns=[target])
        X_test_fold = test_fold.drop(columns=[target])
        y_train_fold = train_fold[target].squeeze()
        y_test_fold = test_fold[target].squeeze()

        loss_function = tf.keras.losses.BinaryFocalCrossentropy(
            apply_class_balancing=True,
            alpha=alpha,
            gamma=gamma,
        )

        # Build and compile a new, untrained network for this fold.
        model = Sequential()
        model.add(Dense(60, input_shape=(n_features,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        metrics = [
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall"),
        ]
        model.compile(loss=loss_function, optimizer=tf.keras.optimizers.Adam(), metrics=metrics)

        # Stop once the training-set recall has not improved by at least
        # 0.001 for two epochs.  The direction is stated explicitly: recall
        # must be maximised, and with mode="auto" the direction is left to
        # Keras' own inference for this metric name.
        earlystopping = callbacks.EarlyStopping(
            monitor="recall",
            min_delta=0.001,
            mode="max",
            patience=2,
            restore_best_weights=True
        )

        estimator = KerasClassifier(
            model=model,
            random_state=random.randint(1,100),
            metrics=metrics,
            batch_size=128,
            verbose=0,
            epochs=30,
            callbacks=[earlystopping]
        )

        estimator.fit(X_train_fold, y_train_fold.values.ravel())
        y_proba_fold = estimator.predict_proba(X_test_fold)
        # Column 1 holds the probability of the positive class.
        y_proba_fold = [x[1] for x in y_proba_fold]
        print ('len(y_proba_fold) = ', len(y_proba_fold))
        y_pred_fold = list(np.around(np.array(y_proba_fold), 0))

        y_test.extend(y_test_fold.to_list())
        y_proba.extend(y_proba_fold)
        y_pred.extend(y_pred_fold)

    DF = pd.DataFrame(np.array(y_test), columns=['y_test'])
    DF['y_proba'] = np.array(y_proba)
    DF['y_pred'] = np.array(y_pred)
    # Rows with NaN (e.g. from diverged training) are dropped; the two counts
    # make any such loss visible in the log.
    print ('Length before dropna(): ',len(DF))
    DF = DF.dropna()
    print ('Length after dropna(): ',len(DF))
    DF.to_csv('../../Big_Files/CRSS_05_' + filename + '.csv')

#    Chart_and_Plots(y_test, y_proba, y_pred, filename, '')

    print ()


# Run

In [13]:
def Build_Models(Features, run):
    """Load one prepared data set and cross-validate the enabled models on it.

    The input file is ``'../../Big_Files/CRSS_04' + run + '_' + Features + '.csv'``.
    All columns are cast to ``int64``; the label column is ``'HOSPITAL'``.
    Only the Keras focal-loss configurations listed in ``KBFC`` are active;
    the other model families are kept below as commented-out calls.

    Parameters
    ----------
    Features : str
        Feature-set tag used in the input and output file names
        (the driver passes ``'Hard'``).
    run : str
        Run identifier used in the input and output file names.

    Returns
    -------
    None
    """
    read_filename = '../../Big_Files/CRSS_04' + run + '_' + Features + '.csv'
    print ('random_seed = %d' % random_seed)
    data = Get_Data(read_filename)
    data = data.astype('int64')
    target = 'HOSPITAL'
    write_filename_features = run + '_' + Features

    # Share of negative samples; used as the "balanced" positive-class weight.
    y = data[target]
    N = len(y)
    n = len(y[y==1])
    alpha_balanced = 1 - n/N
    print ('alpha_balanced = ', alpha_balanced)

    # Disabled experiment:
    #    filename = 'RFC' + write_filename_features
    #    RFC_5_Fold(data, target, filename)

    # Balanced random forest sweep; every setting is currently switched off,
    # so the loop below does not execute.
    alphas = [
#        [0.5, '_0_5'],
#        [0.6, '_0_6'],
#        [0.7, '_0_7'],
#        [0.8, '_0_8'],
#        [0.85, '_0_85'],
#        [0.90, '_0_9'],
#        [0.95, '_0_95'],
    ]
    for alpha, alpha_string in alphas:
        filename = 'BRFC_alpha' + alpha_string + write_filename_features
        BRFC_5_Fold(data, target, alpha, filename)

    # Disabled experiments:
    #    alpha = 0.5
    #    filename = 'LogReg_alpha_0_5' + write_filename_features
    #    LogisticRegression_5_Fold(data, target, alpha, filename)
    #
    #    alpha = alpha_balanced
    #    filename = 'LogReg_alpha_balanced' + write_filename_features
    #    LogisticRegression_5_Fold(data, target, alpha, filename)
    #
    #    AdaBoost_5_Fold(data, target, 'AdaBoost' + write_filename_features)
    #    BalancedBagging_5_Fold(data, target, 'BalBag' + write_filename_features)
    #    EasyEnsemble_5_Fold(data, target, 'EEC' + write_filename_features)
    #    RUSBoost_5_Fold(data, target, 'RUSBoost' + write_filename_features)

    # Keras binary focal cross-entropy settings: (alpha, tag, gamma, tag).
    KBFC = [
        [0.5, '_alpha_0_5', 0.0, '_gamma_0_0'],
#        [alpha_balanced, '_alpha_balanced', 0.0, '_gamma_0_0'],
#        [0.5, '_alpha_0_5', 1.0, '_gamma_1_0'],
#        [0.5, '_alpha_0_5', 2.0, '_gamma_2_0'],
    ]
    for alpha, alpha_str, gamma, gamma_str in KBFC:
        filename = 'KBFC' + alpha_str + gamma_str + write_filename_features
        KBFC_5_Fold(data, target, alpha, gamma, filename)

    # Release the large frame before the next run is loaded.
    # (The original line read `gc collect`, which is not valid Python.)
    del data
    gc.collect()
        


In [14]:
%%time
# Run identifiers to process.  Each one selects an input file read by
# Build_Models and is embedded in the names of the result files.
Runs = [
#    '_0_0_0_0',
#    '_0_0_0_1',
#    '_0_0_1_0',
#    '_0_0_1_1',
#    '_0_1_0_0',
#    '_0_1_0_1',
#    '_0_1_1_0',
#    '_0_1_1_1',
#    '_1_0_0_0',
#    '_1_0_0_1',
#    '_1_0_1_0',
    '_1_0_1_1',
    '_1_1_0_0',
    '_1_1_0_1',
    '_1_1_1_0',
    '_1_1_1_1',
]

#Runs = ['_0_0_1_0_1']

for run in Runs:
    Run = run
    print ()
    print ('------------------------------------')
    print ('Run = ', Run)

    # The seed is the digit that follows the leading underscore of the run id.
    random_seed = int(Run[1])
    print ('random_seed = ', random_seed)
    # Re-seed Python, NumPy and TensorFlow (in that order) before each run.
    for reseed in (random.seed, np.random.seed, tf.random.set_seed):
        reseed(random_seed)

    Build_Models('Hard', run)
#    Build_Models('Medium', run)
#    Build_Models('Easy', run)
    
# CPU times: user 4h 45min 20s, sys: 7min 22s, total: 4h 52min 43s
# Wall time: 4h 30min 47s


------------------------------------
Run =  _1_0_1_1
random_seed =  1
random_seed = 1
Get_Data()
data.shape:  (802700, 129)
End Get_Data()

alpha_balanced =  0.8427108508782858

------------------------

KBFC_alpha_0_5_gamma_0_0_1_0_1_1_Hard



K-fold iteration =  0
len(y_proba_fold) =  160540


K-fold iteration =  1
len(y_proba_fold) =  160540


K-fold iteration =  2
len(y_proba_fold) =  160540


K-fold iteration =  3
len(y_proba_fold) =  160540


K-fold iteration =  4
len(y_proba_fold) =  160540
Length before dropna():  802700
Length after dropna():  802700


------------------------------------
Run =  _1_1_0_0
random_seed =  1
random_seed = 1
Get_Data()
data.shape:  (802700, 139)
End Get_Data()

alpha_balanced =  0.8427108508782858

------------------------

KBFC_alpha_0_5_gamma_0_0_1_1_0_0_Hard



K-fold iteration =  0
len(y_proba_fold) =  160540


K-fold iteration =  1
len(y_proba_fold) =  160540


K-fold iteration =  2
len(y_proba_fold) =  160540


K-fold iteration =  3
len(y_pr