In [1]:
%%latex
\tableofcontents

<IPython.core.display.Latex object>

# Methods

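- We will build models reflecting three levels of available data:
    - "Easy" is mostly data already available to the emergency dispatcher before the notification comes in, such as month, day of week, hour, weather, and urban/rural setting.
    - "Medium" adds roadway and junction descriptors (for example `VSPD_LIM` and `TYP_INT`) together with the person's `AGE` and `SEX`; see `Thin_to_Medium_Features`.
    - "Hard" adds the remaining crash, vehicle, and person fields (for example `LGT_COND`, `BODY_TYP`, `MAKE`, and `PER_TYP`) and all engineered features; see `Thin_to_Hard_Features`.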

# Setup

## Import Libraries

In [2]:
print ('Install Packages')

import sys, copy, math, time, os

print ('Python version: {}'.format(sys.version))

#from collections import Counter

import numpy as np
print ('NumPy version: {}'.format(np.__version__))
np.set_printoptions(suppress=True)

import scipy as sc
print ('SciPy version:  {}'.format(sc.__version__))

import tensorflow as tf
print ('TensorFlow version:  {}'.format(tf.__version__))
tf.config.run_functions_eagerly(True)
tf.data.experimental.enable_debug_mode()

from tensorflow import keras
print ('Keras version:  {}'.format(keras.__version__))

from keras import layers
import keras.backend as K
#from keras.layers import IntegerLookup
#from keras.layers import Normalization
#from keras.layers import StringLookup
#from keras.utils import get_custom_objects
#from keras.utils import tf_utils

from keras.models import Sequential
from keras.layers import Dense

#from keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

import pandas as pd
print ('Pandas version:  {}'.format(pd.__version__))
pd.set_option('display.max_rows', 500)

import matplotlib
matplotlib.use("pgf")
matplotlib.rcParams.update({
#    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
})

import matplotlib.pyplot as plt
%matplotlib inline

# Library for reading Microsoft Access files
#import pandas_access as mdb

import sklearn
print ('SciKit-Learn version: {}'.format(sklearn.__version__))
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
#from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

import imblearn
print ('Imbalanced-Learn version: {}'.format(imblearn.__version__))
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import CondensedNearestNeighbour
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import RUSBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

#!pip install pydot

# Set Randomness.  Copied from https://www.kaggle.com/code/abazdyrev/keras-nn-focal-loss-experiments
import random
#np.random.seed(42) # NumPy
#random.seed(42) # Python
#tf.random.set_seed(42) # Tensorflow

import warnings
warnings.filterwarnings('ignore')

print ('Finished Installing Packages')

Install Packages
Python version: 3.10.9 | packaged by conda-forge | (main, Feb  2 2023, 20:26:08) [Clang 14.0.6 ]
NumPy version: 1.24.2
SciPy version:  1.7.3




TensorFlow version:  2.11.0
Keras version:  2.11.0
Pandas version:  1.5.3
SciKit-Learn version: 1.2.2
Imbalanced-Learn version: 0.10.1
Finished Installing Packages


## Get Data

In [3]:
def Get_Data(Features=1):
    """Load the CRSS_Imputed.csv table from disk.

    Parameters
    ----------
    Features : int, optional
        Selector for the dataset to load.  Only ``1`` is supported; it reads
        ``../../Big_Files/CRSS_Imputed.csv``.  The default of ``1`` lets the
        function be called without arguments.

    Returns
    -------
    pandas.DataFrame
        The full contents of the CSV file.

    Raises
    ------
    ValueError
        If ``Features`` is anything other than ``1``.
    """
    print ('Get_Data()')

    # Fail with a clear message instead of reaching `data.shape` with `data`
    # never having been assigned.
    if Features != 1:
        raise ValueError(
            'Get_Data: unsupported Features value {!r}; only 1 is available'.format(Features)
        )

    data = pd.read_csv(
        '../../Big_Files/CRSS_Imputed.csv',
        low_memory=False
    )
    print ('data.shape: ', data.shape)

    print ('End Get_Data()')
    print ()
    return data

def Test_Get_Data():
    """Load the dataset and display its first rows as a quick visual check."""
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    display (data.head())
    
#Test_Get_Data()

# Tools

## Engineer Features
- AGE_x_SEX
    - We found that the relationship between age and hospitalization varied by sex, so we created a crossed feature that combines each person's age and sex into a single category.
- AGE_x_SCH_BUS
    - We also found that those on a school bus had different rates of hospitalization based on age, so we created this more complex feature.

In [4]:
def Feature_Engineering_Cross_Two(data):
    """Add crossed categorical features built from pairs of existing columns.

    For each pair, both columns are converted to strings and joined with
    ``'_x_'``; the result is stored under the crossed name.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``AGE``, ``SEX`` and ``SCH_BUS``.  The frame
        is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now with the columns
        ``AGE_x_SEX`` and ``AGE_x_SCH_BUS``.
    """
    print ('Feature_Engineering_Cross_Two')

    # (first column, second column, name of the crossed column)
    crosses = (
        ('AGE', 'SEX', 'AGE_x_SEX'),
        ('AGE', 'SCH_BUS', 'AGE_x_SCH_BUS'),
    )
    for left, right, combined in crosses:
        left_text = data[left].map(str)
        right_text = data[right].map(str)
        data[combined] = left_text + '_x_' + right_text

    print ()
    return data
        

## Thin Features 
### Thin Features to only "Hard" Level

In [5]:
def Thin_to_Hard_Features(data):
    """Keep only the columns that make up the "Hard" feature level.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to thin; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The retained columns in alphabetical order.  Names listed here that
        do not occur in ``data`` are skipped without error by
        ``DataFrame.filter``.
    """
    print ('Thin_to_Hard_Features()')

    # The record identifiers CASENUM, VEH_NO and PER_NO are not part of the
    # selection.
    keep_by_group = {
        # Left out: PEDS, PERNOTMVIT (pedestrians, which have been taken out)
        'Accident': [
            'DAY_WEEK', 'HOUR', 'INT_HWY', 'LGT_COND', 'MONTH', 'PERMVIT',
            'PJ', 'PSU', 'PVH_INVL', 'REGION', 'REL_ROAD', 'RELJCT1',
            'RELJCT2', 'SCH_BUS', 'TYP_INT', 'URBANICITY', 'VE_FORMS',
            'VE_TOTAL', 'WEATHER', 'WRK_ZONE', 'YEAR',
        ],
        # Left out: MOD_YEAR, VSURCOND
        'Vehicle': [
            'BODY_TYP', 'BUS_USE', 'EMER_USE', 'MAKE', 'MODEL', 'NUMOCCS',
            'VALIGN', 'VNUM_LAN', 'VPROFILE', 'VSPD_LIM', 'VTRAFCON',
            'VTRAFWAY',
        ],
        # Left out: LOCATION (pedestrian location; taken out)
        'Person': ['AGE', 'PER_TYP', 'SEX', 'HOSPITAL'],
        'Engineered': ['VEH_AGE', 'AGE_x_SEX', 'AGE_x_SCH_BUS'],
    }

    # Flatten the groups and put the names in alphabetical order.
    wanted = sorted(
        name for group in keep_by_group.values() for name in group
    )
    thinned = data.filter(wanted, axis=1)

    print ('data.shape: ', thinned.shape)

    print ('End Thin_to_Hard_Features()')
    print ()

    return thinned

def Test_Thin_to_Hard_Features():
    """Display the value counts of every column kept at the "Hard" level."""
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = Thin_to_Hard_Features(data)
    for feature in data:
        display(data[feature].value_counts())
        
#Test_Thin_to_Hard_Features()

### Thin Features to "Medium" Level

In [6]:
def Thin_to_Medium_Features(data):
    """Keep only the columns that make up the "Medium" feature level.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to thin; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The retained columns in alphabetical order.  Names listed here that
        do not occur in ``data`` are skipped without error by
        ``DataFrame.filter``.
    """
    print ('Thin_to_Medium_Features()')

    # The record identifiers CASENUM, VEH_NO and PER_NO are not part of the
    # selection.
    keep_by_group = {
        # Left out: LGT_COND, PEDS, PERMVIT, PERNOTMVIT, PVH_INVL, RELJCT2,
        #           SCH_BUS, VE_FORMS, VE_TOTAL, WRK_ZONE
        'Accident': [
            'DAY_WEEK', 'HOUR', 'INT_HWY', 'MONTH', 'PJ', 'PSU', 'REGION',
            'REL_ROAD', 'RELJCT1', 'TYP_INT', 'URBANICITY', 'WEATHER',
            'YEAR',
        ],
        # Left out: BODY_TYP, BUS_USE, EMER_USE, MAKE, MOD_YEAR, MODEL,
        #           NUMOCCS, VSURCOND
        'Vehicle': [
            'VALIGN', 'VNUM_LAN', 'VPROFILE', 'VSPD_LIM', 'VTRAFCON',
            'VTRAFWAY',
        ],
        # Left out: LOCATION, PER_TYP
        'Person': ['AGE', 'SEX', 'HOSPITAL'],
        # Left out: VEH_AGE, AGE_x_SCH_BUS
        'Engineered': ['AGE_x_SEX'],
    }

    # Flatten the groups and put the names in alphabetical order.
    wanted = sorted(
        name for group in keep_by_group.values() for name in group
    )
    thinned = data.filter(wanted, axis=1)

    print ('data.shape: ', thinned.shape)

    print ('End Thin_to_Medium_Features()')
    print ()

    return thinned

def Test_Thin_to_Medium_Features():
    """Display the value counts of every column kept at the "Medium" level."""
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = Thin_to_Medium_Features(data)
    for feature in data:
        display(data[feature].value_counts())
        
#Test_Thin_to_Medium_Features()

### Thin Features to "Easy" Level

In [7]:
def Thin_to_Easy_Features(data):
    """Keep only the columns that make up the "Easy" feature level.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to thin; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The retained columns in alphabetical order.  Names listed here that
        do not occur in ``data`` are skipped without error by
        ``DataFrame.filter``.
    """
    print ('Thin_to_Easy_Features()')

    keep_by_group = {
        # Left out: INT_HWY, LGT_COND, PEDS, PERMVIT, PERNOTMVIT, PVH_INVL,
        #           REL_ROAD, RELJCT1, RELJCT2, SCH_BUS, TYP_INT, VE_FORMS,
        #           VE_TOTAL, WRK_ZONE
        'Accident': [
            'DAY_WEEK', 'HOUR', 'MONTH', 'PJ', 'PSU', 'REGION',
            'URBANICITY', 'WEATHER', 'YEAR',
        ],
        # No vehicle columns are kept at this level.
        'Vehicle': [],
        # Left out: AGE, LOCATION, PER_TYP, SEX
        'Person': ['HOSPITAL'],
        # No engineered columns are kept at this level.
        'Engineered': [],
    }

    # Flatten the groups and put the names in alphabetical order.
    wanted = sorted(
        name for group in keep_by_group.values() for name in group
    )
    thinned = data.filter(wanted, axis=1)

    print ('data.shape: ', thinned.shape)

    print ('End Thin_to_Easy_Features()')
    print ()

    return thinned

def Test_Thin_to_Easy_Features():
    """Display the value counts of every column kept at the "Easy" level."""
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = Thin_to_Easy_Features(data)
    for feature in data:
        display(data[feature].value_counts())
        
#Test_Thin_to_Easy_Features()

## Get Dummies
- Transform categorical data into one-hot-encoded features
- For each value in the category, make a new feature that is "1" when the feature has that value, "0" otherwise.  

In [8]:
def Get_Dummies(data, target):
    """One-hot encode every column except ``target``.

    All columns, the target included, are first cast to ``category`` dtype.
    Each remaining column is expanded into indicator columns named
    ``<column>_<value>``, and the target column is appended last.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to encode; the caller's frame is not modified.
    target : str
        Name of the column to carry through without encoding.

    Returns
    -------
    pandas.DataFrame
        Indicator columns followed by the (categorical) target column.
    """
    print ('Get_Dummies')
    as_category = data.astype('category')

    # Set the target aside so it is not expanded into indicator columns.
    target_values = as_category[target]
    predictors = as_category.drop(columns=[target])

    indicators = pd.get_dummies(predictors, prefix = predictors.columns)
    encoded = indicators.join(target_values)
    print ()

    return encoded

def Test_Get_Dummies():
    """Display the result of Get_Dummies on a three-row toy frame.

    Columns 'A' and 'B' are encoded; column 'C' is passed as the target.
    """
    print ('Test_Get_Dummies')
    toy = pd.DataFrame({
        'A': list('aba'),
        'B': list('bac'),
        'C': [1, 2, 3],
    })
    display(Get_Dummies(toy, 'C'))
    print ()

#Test_Get_Dummies()

# Models

# Five-Fold Cross Validation

In [23]:
def Five_Fold_Cross_Validation(data, model, filename, title, target='HOSPITAL'):
    """Run stratified five-fold cross-validation and save the pooled predictions.

    ``model`` is fitted on each training split and scored on the matching
    held-out split.  The held-out labels, positive-class probabilities and
    rounded predictions of all five folds are concatenated (in fold order,
    not in the original row order) and written to
    ``../../Big_Files/<filename>.csv`` with the columns ``y_test``,
    ``y_proba`` and ``y_pred``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the ``target`` column.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``.  Column 1 of
        the returned probabilities is used as the score.
    filename : str
        Base name (without extension) of the CSV file to write; also printed
        as a progress header.
    title : str
        Not used by this function; kept so existing callers keep working.
    target : str, optional
        Name of the label column.  Defaults to ``'HOSPITAL'``.

    Returns
    -------
    None
    """
    print ()
    print ('------------------------')
    print ()
    print (filename)
    print ()

    # The shuffle seed is drawn from `random` on every call, so the fold
    # assignment differs between runs unless `random` has been seeded.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state = random.randint(1,100))
    target_column = data.loc[:, target]
    y_test = []
    y_proba = []
    y_pred = []

    for iteration, (train_index, test_index) in enumerate(skf.split(data, target_column)):
        print ('K-fold iteration = ', iteration)

        train_fold = data.iloc[train_index]
        test_fold = data.iloc[test_index]

        X_train_fold = train_fold.drop(columns=[target])
        X_test_fold = test_fold.drop(columns=[target])
        y_train_fold = train_fold[target]
        y_test_fold = test_fold[target]

        model.fit(X_train_fold, y_train_fold.values.ravel())

        # Take the second probability column for every row in one slice
        # rather than looping over the rows in Python.
        y_proba_fold = np.asarray(model.predict_proba(X_test_fold))[:, 1]
        # Rounding to zero decimals turns the probability into a 0./1. label.
        y_pred_fold = np.around(y_proba_fold, 0)

        y_test.extend(y_test_fold.to_list())
        y_proba.append(y_proba_fold)
        y_pred.append(y_pred_fold)

    DF = pd.DataFrame(np.array(y_test), columns=['y_test'])
    DF['y_proba'] = np.concatenate(y_proba)
    DF['y_pred'] = np.concatenate(y_pred)
    DF.to_csv('../../Big_Files/' + filename + '.csv')

    print ()


In [24]:
def BRFC_5_Fold(data, target, alpha, filename):
    """Cross-validate a BalancedRandomForestClassifier with class weights.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed unchanged to ``Five_Fold_Cross_Validation``.
    target : str
        Accepted for a uniform signature; not used in this function.
    alpha : float
        Weight given to class 1; class 0 receives ``1 - alpha``.
    filename : str
        Base name of the results file, forwarded to
        ``Five_Fold_Cross_Validation``.
    """
    settings = dict(
        # Forest size and tree shape.
        n_estimators = 100,          # alternative kept for reference: 1000
        criterion = 'gini',
        max_depth = None,            # alternative kept for reference: 40
        max_features = 'sqrt',
        max_leaf_nodes = None,       # alternative kept for reference: 10000
        min_samples_leaf = 1,
        min_samples_split = 2,
        min_weight_fraction_leaf = 0.0,
        min_impurity_decrease = 0.0,
        ccp_alpha = 0.0,
        # Sampling.
        bootstrap = True,
        max_samples = None,
        replacement = False,
        sampling_strategy = 'auto',
        oob_score = False,
        # Class weighting controlled by the caller.
        class_weight = {0: 1 - alpha, 1: alpha},
        # Execution.
        n_jobs = None,
        verbose = 0,
        warm_start = False,
        random_state = random.randint(1,100),
    )
    forest = BalancedRandomForestClassifier(**settings)
    Five_Fold_Cross_Validation(data, forest, filename, '')



In [25]:
def RFC_5_Fold(data, target, filename):
    """Cross-validate a RandomForestClassifier limited to depth 2.

    ``target`` is accepted for a uniform signature but is not used here;
    ``data`` and ``filename`` are forwarded to ``Five_Fold_Cross_Validation``.
    """
    seed = random.randint(1,100)
    forest = RandomForestClassifier(random_state = seed, max_depth=2)
    Five_Fold_Cross_Validation(data, forest, filename, '')


In [26]:
def AdaBoost_5_Fold(data, target, filename):
    """Cross-validate an AdaBoostClassifier with 100 estimators.

    ``target`` is accepted for a uniform signature but is not used here;
    ``data`` and ``filename`` are forwarded to ``Five_Fold_Cross_Validation``.
    """
    seed = random.randint(1,100)
    booster = AdaBoostClassifier(random_state = seed, n_estimators=100)
    Five_Fold_Cross_Validation(data, booster, filename, '')


In [27]:
def RUSBoost_5_Fold(data, target, filename):
    """Cross-validate a RUSBoostClassifier built on depth-1 decision trees.

    ``target`` is accepted for a uniform signature but is not used here;
    ``data`` and ``filename`` are forwarded to ``Five_Fold_Cross_Validation``.
    """
    # Base learner: a single-split tree, without class weights.
    stump = DecisionTreeClassifier(max_depth=1)
    booster = RUSBoostClassifier(
        estimator=stump,
        n_estimators=1000,
        algorithm='SAMME.R',
        random_state = random.randint(1,100),
    )
    Five_Fold_Cross_Validation(data, booster, filename, '')


In [28]:
def BalancedBagging_5_Fold(data, target, filename):
    """Cross-validate a BalancedBaggingClassifier with library defaults.

    Only ``random_state`` is set explicitly.  ``target`` is accepted for a
    uniform signature but is not used here; ``data`` and ``filename`` are
    forwarded to ``Five_Fold_Cross_Validation``.
    """
    seed = random.randint(1,100)
    bagger = BalancedBaggingClassifier(random_state = seed)
    Five_Fold_Cross_Validation(data, bagger, filename, '')
    

In [29]:
def EasyEnsemble_5_Fold(data, target, filename):
    """Cross-validate an EasyEnsembleClassifier of ten AdaBoost learners.

    Each AdaBoost learner itself uses ten estimators.  ``target`` is accepted
    for a uniform signature but is not used here; ``data`` and ``filename``
    are forwarded to ``Five_Fold_Cross_Validation``.
    """
    # Draw the inner seed first, then the outer one.
    inner_seed = random.randint(1,100)
    outer_seed = random.randint(1,100)
    booster = AdaBoostClassifier(n_estimators=10, random_state = inner_seed)
    ensemble = EasyEnsembleClassifier(
        n_estimators=10,
        estimator=booster,
        random_state = outer_seed,
    )
    Five_Fold_Cross_Validation(data, ensemble, filename, '')

    

In [30]:
def LogisticRegression_5_Fold(data, target, alpha, filename):
    """Cross-validate a class-weighted LogisticRegression.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed unchanged to ``Five_Fold_Cross_Validation``.
    target : str
        Accepted for a uniform signature; not used in this function.
    alpha : float
        Weight given to class 1; class 0 receives ``1 - alpha``.
    filename : str
        Base name of the results file, forwarded to
        ``Five_Fold_Cross_Validation``.
    """
    weights = {0: 1 - alpha, 1: alpha}
    seed = random.randint(1,100)
    classifier = LogisticRegression(
        max_iter=1000,
        class_weight = weights,
        random_state = seed,
    )
    Five_Fold_Cross_Validation(data, classifier, filename, '')


In [38]:
def KBFC_5_Fold(data, target, alpha, gamma, filename):
    """Cross-validate a small Keras network trained with binary focal loss.

    For each of five stratified folds a fresh network (one 60-unit ReLU
    hidden layer and a single sigmoid output) is compiled with
    ``BinaryFocalCrossentropy(apply_class_balancing=True)`` and the Adam
    optimizer, wrapped in a scikeras ``KerasClassifier`` (batch size 128,
    20 epochs) and fitted on the training split.  The held-out labels,
    probabilities and rounded predictions of all folds are concatenated in
    fold order and written to ``../../Big_Files/<filename>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the ``target`` column; every column other than
        ``target`` becomes a network input.
    target : str
        Name of the label column.  The argument is honored rather than being
        replaced by a hard-coded ``'HOSPITAL'``; all callers in this notebook
        pass ``'HOSPITAL'``.
    alpha : float
        Class-balancing weight passed to the focal loss.
    gamma : float
        Focusing parameter passed to the focal loss.
    filename : str
        Base name (without extension) of the CSV file to write; also printed
        as a progress header.

    Returns
    -------
    None
    """
    print ()
    print ('------------------------')
    print ()
    print (filename)
    print ()

    # The shuffle seed is drawn from `random` on every call, so the fold
    # assignment differs between runs unless `random` has been seeded.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state  = random.randint(1,100))
    target_column = data.loc[:, target]
    # Every column except the target feeds the network.
    n_inputs = data.shape[-1] - 1
    y_test = []
    y_proba = []
    y_pred = []

    for iteration, (train_index, test_index) in enumerate(skf.split(data, target_column)):
        print ()
        print ()
        print ('K-fold iteration = ', iteration)

        train_fold = data.iloc[train_index]
        test_fold = data.iloc[test_index]

        X_train_fold = train_fold.drop(columns=[target])
        X_test_fold = test_fold.drop(columns=[target])
        y_train_fold = train_fold[target]
        y_test_fold = test_fold[target]

        loss_function = tf.keras.losses.BinaryFocalCrossentropy(
            apply_class_balancing=True,
            alpha=alpha,
            gamma=gamma,
        )

        # Build and compile a new network for every fold so that no weights
        # carry over from the previous fold.
        model = Sequential()
        model.add(Dense(60, input_shape=(n_inputs,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        metrics = [
            keras.metrics.Precision(name="precision"),
            keras.metrics.Recall(name="recall"),
        ]
        model.compile(loss=loss_function, optimizer=tf.keras.optimizers.Adam(), metrics=metrics)
        estimator = KerasClassifier(
            model=model,
            random_state = random.randint(1,100),
            metrics=metrics,
            batch_size=128,
            verbose=0,
            epochs=20,
        )

        estimator.fit(X_train_fold, y_train_fold.values.ravel())

        # Take the second probability column for every row in one slice
        # rather than looping over the rows in Python.
        y_proba_fold = np.asarray(estimator.predict_proba(X_test_fold))[:, 1]
        print ('len(y_proba_fold) = ', len(y_proba_fold))
        # Rounding to zero decimals turns the probability into a 0./1. label.
        y_pred_fold = np.around(y_proba_fold, 0)

        y_test.extend(y_test_fold.to_list())
        y_proba.append(y_proba_fold)
        y_pred.append(y_pred_fold)

    DF = pd.DataFrame(np.array(y_test), columns=['y_test'])
    DF['y_proba'] = np.concatenate(y_proba)
    DF['y_pred'] = np.concatenate(y_pred)

    # Rows containing NaN are removed before saving; the two printed lengths
    # show how many rows were lost.
    print ('Length before dropna(): ',len(DF))
    DF = DF.dropna()
    print ('Length after dropna(): ',len(DF))
    DF.to_csv('../../Big_Files/' + filename + '.csv')

    print ()


# Run

In [32]:
def Run_with_Hard_Features(run):
    """Run the currently enabled experiment on the "Hard" feature level.

    The data are loaded, cast to ``int64``, extended with the crossed
    features, thinned to the Hard columns and one-hot encoded.  Only the
    focal-loss network with the balanced alpha and ``gamma = 0`` is run; the
    other experiments are listed below as disabled comments.

    Parameters
    ----------
    run : str
        Suffix appended to every output file name (for example ``'_Run_3'``).

    Returns
    -------
    None
    """
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = data.astype('int64')
    target = 'HOSPITAL'
    data = Feature_Engineering_Cross_Two(data)
    data = Thin_to_Hard_Features(data)
    write_filename_features = '_Hard' + run
    data = Get_Dummies(data, target)

    # p is the ratio of rows whose target is not 1 to rows whose target is 1;
    # alpha_balanced = p/(p+1) is then used as the weight of class 1.
    y = data[target]
    N = len(y)
    n = len(y[y==1])
    p = (N-n)/n
    alpha_balanced = p/(p+1)
    print ('p = ', p)
    print ('alpha_balanced = ', alpha_balanced)

    # Disabled experiments (uncomment to run):
    #   RFC_5_Fold(data, target, 'RFC' + write_filename_features)
    #   BRFC_5_Fold(data, target, 0.5, 'BRFC_alpha_0_5' + write_filename_features)
    #   BRFC_5_Fold(data, target, alpha_balanced, 'BRFC_alpha_balanced' + write_filename_features)
    #   LogisticRegression_5_Fold(data, target, 0.5, 'LogReg_alpha_0_5' + write_filename_features)
    #   LogisticRegression_5_Fold(data, target, alpha_balanced, 'LogReg_alpha_balanced' + write_filename_features)
    #   AdaBoost_5_Fold(data, target, 'AdaBoost' + write_filename_features)
    #   BalancedBagging_5_Fold(data, target, 'BalBag' + write_filename_features)
    #   EasyEnsemble_5_Fold(data, target, 'EEC' + write_filename_features)
    #   RUSBoost_5_Fold(data, target, 'RUSBoost' + write_filename_features)
    #   KBFC_5_Fold(data, target, 0.5, 0.0, 'KBFC_alpha_0_5_gamma_0_0' + write_filename_features)
    #   KBFC_5_Fold(data, target, 0.5, 1.0, 'KBFC_alpha_0_5_gamma_1_0' + write_filename_features)
    #   KBFC_5_Fold(data, target, 0.5, 2.0, 'KBFC_alpha_0_5_gamma_2_0' + write_filename_features)

    alpha = alpha_balanced
    gamma = 0.0
    filename = 'KBFC_alpha_balanced_gamma_0_0' + write_filename_features
    KBFC_5_Fold(data, target, alpha, gamma, filename)
    


In [33]:
def Run_with_Medium_Features(run):
    """Run every model on the "Medium" feature level.

    The data are loaded, cast to ``int64``, extended with the crossed
    features, thinned to the Medium columns and one-hot encoded.  Each model
    is then cross-validated and writes its own results file whose name ends
    in ``'_Medium' + run``.

    Parameters
    ----------
    run : str
        Suffix appended to every output file name (for example ``'_Run_3'``).

    Returns
    -------
    None
    """
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = data.astype('int64')
    target = 'HOSPITAL'
    data = Feature_Engineering_Cross_Two(data)
    data = Thin_to_Medium_Features(data)
    write_filename_features = '_Medium' + run
    data = Get_Dummies(data, target)

    # p is the ratio of rows whose target is not 1 to rows whose target is 1;
    # alpha_balanced = p/(p+1) is then used as the weight of class 1.
    y = data[target]
    N = len(y)
    n = len(y[y==1])
    p = (N-n)/n
    alpha_balanced = p/(p+1)
    print ('p = ', p)
    print ('alpha_balanced = ', alpha_balanced)

    # (file-name label, class-1 weight) pairs shared by the weighted models.
    alpha_settings = (('0_5', 0.5), ('balanced', alpha_balanced))

    RFC_5_Fold(data, target, 'RFC' + write_filename_features)

    for label, alpha in alpha_settings:
        BRFC_5_Fold(data, target, alpha, 'BRFC_alpha_' + label + write_filename_features)

    for label, alpha in alpha_settings:
        LogisticRegression_5_Fold(data, target, alpha, 'LogReg_alpha_' + label + write_filename_features)

    AdaBoost_5_Fold(data, target, 'AdaBoost' + write_filename_features)
    BalancedBagging_5_Fold(data, target, 'BalBag' + write_filename_features)
    EasyEnsemble_5_Fold(data, target, 'EEC' + write_filename_features)
    RUSBoost_5_Fold(data, target, 'RUSBoost' + write_filename_features)

    # (alpha label, alpha, gamma label, gamma) for the focal-loss network.
    focal_settings = (
        ('0_5', 0.5, '0_0', 0.0),
        ('balanced', alpha_balanced, '0_0', 0.0),
        ('0_5', 0.5, '1_0', 1.0),
        ('0_5', 0.5, '2_0', 2.0),
    )
    for alpha_label, alpha, gamma_label, gamma in focal_settings:
        filename = 'KBFC_alpha_' + alpha_label + '_gamma_' + gamma_label + write_filename_features
        KBFC_5_Fold(data, target, alpha, gamma, filename)
    


In [34]:
def Run_with_Easy_Features(run):
    """Run every model on the "Easy" feature level.

    The data are loaded, cast to ``int64``, extended with the crossed
    features, thinned to the Easy columns and one-hot encoded.  Each model
    is then cross-validated and writes its own results file whose name ends
    in ``'_Easy' + run``.

    Parameters
    ----------
    run : str
        Suffix appended to every output file name (for example ``'_Run_3'``).

    Returns
    -------
    None
    """
    # Pass the dataset selector explicitly; 1 selects CRSS_Imputed.csv.
    data = Get_Data(1)
    data = data.astype('int64')
    target = 'HOSPITAL'
    data = Feature_Engineering_Cross_Two(data)
    data = Thin_to_Easy_Features(data)
    write_filename_features = '_Easy' + run
    data = Get_Dummies(data, target)

    # p is the ratio of rows whose target is not 1 to rows whose target is 1;
    # alpha_balanced = p/(p+1) is then used as the weight of class 1.
    y = data[target]
    N = len(y)
    n = len(y[y==1])
    p = (N-n)/n
    alpha_balanced = p/(p+1)
    print ('p = ', p)
    print ('alpha_balanced = ', alpha_balanced)

    # (file-name label, class-1 weight) pairs shared by the weighted models.
    alpha_settings = (('0_5', 0.5), ('balanced', alpha_balanced))

    RFC_5_Fold(data, target, 'RFC' + write_filename_features)

    for label, alpha in alpha_settings:
        BRFC_5_Fold(data, target, alpha, 'BRFC_alpha_' + label + write_filename_features)

    for label, alpha in alpha_settings:
        LogisticRegression_5_Fold(data, target, alpha, 'LogReg_alpha_' + label + write_filename_features)

    AdaBoost_5_Fold(data, target, 'AdaBoost' + write_filename_features)
    BalancedBagging_5_Fold(data, target, 'BalBag' + write_filename_features)
    EasyEnsemble_5_Fold(data, target, 'EEC' + write_filename_features)
    RUSBoost_5_Fold(data, target, 'RUSBoost' + write_filename_features)

    # (alpha label, alpha, gamma label, gamma) for the focal-loss network.
    focal_settings = (
        ('0_5', 0.5, '0_0', 0.0),
        ('balanced', alpha_balanced, '0_0', 0.0),
        ('0_5', 0.5, '1_0', 1.0),
        ('0_5', 0.5, '2_0', 2.0),
    )
    for alpha_label, alpha, gamma_label, gamma in focal_settings:
        filename = 'KBFC_alpha_' + alpha_label + '_gamma_' + gamma_label + write_filename_features
        KBFC_5_Fold(data, target, alpha, gamma, filename)
    


In [39]:
%%time
Run_with_Hard_Features('_Run_3')
# Timing below was recorded from a different execution of this cell than the output shown:
# CPU times: user 4h 45min 20s, sys: 7min 22s, total: 4h 52min 43s
# Wall time: 4h 30min 47s

Get_Data()
data.shape:  (713566, 78)
End Get_Data()

Feature_Engineering_Cross_Two

Thin_to_Hard_Features()
data.shape:  (713566, 39)
End Thin_to_Hard_Features()

Get_Dummies

p =  5.609785468153692
alpha_balanced =  0.84870915934896

------------------------

KBFC_alpha_balanced_gamma_0_0_Hard_Run_3



K-fold iteration =  0
len(y_proba_fold) =  142714


K-fold iteration =  1
len(y_proba_fold) =  142713


K-fold iteration =  2
len(y_proba_fold) =  142713


K-fold iteration =  3
len(y_proba_fold) =  142713


K-fold iteration =  4
len(y_proba_fold) =  142713
Length before dropna():  713566
Length after dropna():  713566

CPU times: user 58min 54s, sys: 49.5 s, total: 59min 43s
Wall time: 1h 3min 2s


In [36]:
%%time
#Run_with_Medium_Features('_Run_3')
# CPU times: user 2h 21min 16s, sys: 3min 36s, total: 2h 24min 52s
# Wall time: 2h 19min 48s

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.05 µs


In [37]:
%%time
#Run_with_Easy_Features('_Run_3')
# CPU times: user 2h 2min 53s, sys: 1min 36s, total: 2h 4min 29s
# Wall time: 2h 3min 30s

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.81 µs
