In [48]:
# Import our dependencies
# !pip install keras_tuner
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import keras_tuner as kt
import datetime
import shutil
import keras
import ast
import re


#  Import and read the charity_data.csv.
# (pandas is already imported in the import block above; this second import is redundant but harmless.)
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
# Accumulator for tuning results: run_hyperparameters appends one dict per run to this list.
final_results_all = []
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


# Define Functions

In [49]:

# Relabel selected values of one column as "Other" (typically used for rare values)
def otherize_column(df, column, list_of_values_to_replace):
    """Return a copy of ``df`` in which the given ``column`` values become "Other".

    Args:
        df: Source DataFrame. It is copied first and is not modified.
        column: Name of the column whose values are relabelled.
        list_of_values_to_replace: Iterable of values; each one is passed in turn
            to ``Series.replace`` with "Other" as the replacement.

    Returns:
        The relabelled copy of ``df``.
    """
    binned = df.copy()

    for rare_value in list_of_values_to_replace:
        binned[column] = binned[column].replace(rare_value, "Other")

    return binned

In [50]:
# Apply otherize_column to several columns, one after another
def otherizations(df, list_of_columns_to_otherize, list_of_list_of_values_to_replace):
    """Relabel values as "Other" across multiple columns.

    Args:
        df: Source DataFrame.
        list_of_columns_to_otherize: Column names to process, in order.
        list_of_list_of_values_to_replace: For each column (same position), the
            values to relabel. Must be at least as long as the column list.

    Returns:
        The DataFrame produced by the chained ``otherize_column`` calls.
    """
    for position, column in enumerate(list_of_columns_to_otherize):
        values_for_column = list_of_list_of_values_to_replace[position]
        df = otherize_column(df, column, values_for_column)

    return df

In [51]:
# Creates a new Sequential model with hyperparameter options
def create_model(hp):
    """Build and compile a binary classifier whose shape is chosen by keras_tuner.

    Hyperparameters registered on ``hp``, in this order:
      * ``num_layers`` (1-6): dense layers including the input layer, excluding the output layer
      * ``activation_layer_input`` / ``neurons_layer_input``: input layer settings
      * ``activation_layer_{i}`` / ``neurons_layer_{i}`` for each further hidden layer
      * ``learning_rate``: Adam learning rate, log-sampled between 1e-4 and 1e-2

    The input width is read from the module-level ``X``, so ``X`` must be
    assigned before the tuner calls this function.

    Args:
        hp: The keras_tuner hyperparameter container.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, accuracy metric).
    """
    n_features = X.shape[1]
    widest_layer = round(n_features * 2)

    nn_model = tf.keras.models.Sequential()

    # number of layers including input layer but excluding output layer
    num_layers = hp.Int('num_layers', 1, 6)

    # Input layer: the tuner picks the activation and the width
    input_choice = hp.Choice('activation_layer_input', ['relu', 'tanh', 'leaky_relu'])
    if input_choice == 'leaky_relu':
        # leaky_relu is supplied as a layer object; the other choices are passed by name
        input_activation = tf.keras.layers.LeakyReLU(alpha=0.01)
    else:
        input_activation = input_choice
    input_units = hp.Int('neurons_layer_input', min_value=n_features, max_value=widest_layer, step=5)
    nn_model.add(tf.keras.layers.Dense(units=input_units, activation=input_activation, input_dim=n_features))

    # Remaining hidden layers: the tuner picks activation and width for each one
    for i in range(1, num_layers):
        hidden_choice = hp.Choice(f'activation_layer_{i}', ['relu', 'leaky_relu'])
        if hidden_choice == 'leaky_relu':
            hidden_activation = tf.keras.layers.LeakyReLU(alpha=0.01)
        else:
            hidden_activation = hidden_choice
        hidden_units = hp.Int(f'neurons_layer_{i}', min_value=2, max_value=widest_layer, step=1)
        nn_model.add(tf.keras.layers.Dense(units=hidden_units, activation=hidden_activation))

    # Single sigmoid unit for the binary target
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    lr = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='log')
    optimizer = keras.optimizers.Adam(learning_rate=lr)
    nn_model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    return nn_model

In [52]:
# Split off rows whose z-score in any of the given columns exceeds a limit
def remove_outliers(df, threshold, columns_to_remove_outliers_from):
    """Separate ``df`` into outlier rows and the remaining rows.

    A row counts as an outlier when the absolute z-score of at least one of the
    listed columns is greater than ``threshold``.

    Args:
        df: Source DataFrame.
        threshold: Absolute z-score above which a value is an outlier.
        columns_to_remove_outliers_from: List of column names to score.

    Returns:
        ``(outlier_rows, df_cleaned)``: the flagged rows and ``df`` without them.
        The index of the flagged rows is also printed.
    """
    abs_z_scores = np.abs(stats.zscore(df[columns_to_remove_outliers_from]))
    row_has_outlier = (abs_z_scores > threshold).any(axis=1)

    # Rows are selected by index label, so a row sharing its label with a
    # flagged row is flagged as well.
    flagged_labels = df.index[row_has_outlier]
    is_flagged = df.index.isin(flagged_labels)

    outlier_rows = df[is_flagged]
    df_cleaned = df[~is_flagged]

    print("Outlier rows:")
    print(outlier_rows.index)

    return outlier_rows, df_cleaned

In [53]:
# Remove outliers using the 1.5 InterQuartile Range Rule as the cutoff values
def remove_outliers_1_5_IQR_Rule(df, columns=None):
    """Drop rows that fall outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns.

    Caution: a near-constant column (for example a one-hot indicator that is
    rarely 1) has an IQR of 0, so every row holding the minority value is
    dropped. Pass ``columns`` to restrict the rule to genuinely continuous
    columns.

    Args:
        df: Source DataFrame. It is not modified; filtering creates new frames.
        columns: Optional iterable of column names to apply the rule to.
            Defaults to every column of ``df``, which is the original behavior.

    Returns:
        The filtered DataFrame.
    """
    if columns is None:
        columns = df.columns

    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Keep only the rows inside the fences for this column
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    return df

In [54]:
# Split a dataframe into its feature columns and its target column
def extract_X_y(df, y_column_name):
    """Return ``(X, y)`` for the given target column.

    Args:
        df: Preprocessed DataFrame containing features and target.
        y_column_name: Name of the target column.

    Returns:
        A tuple of the DataFrame without the target column and the target Series.
    """
    features = df.drop(columns=[y_column_name])
    target = df[y_column_name]
    return features, target

In [55]:
# Create a specific model instead of relying on keras_tuner.Hyperband
def create_specific_model(list_of_activations, num_layers, list_of_number_of_nodes_hidden_layer, input_dim, output_activation, filepath_to_model_weights):
    """Build a Sequential model with an explicitly specified architecture.

    Args:
        list_of_activations: Activation per dense layer, input layer first.
            The string 'leaky_relu' is turned into ``LeakyReLU(alpha=0.01)``;
            any other entry is passed to ``Dense`` unchanged.
        num_layers: Number of dense layers to add before the output layer.
            Both lists must hold at least this many entries.
        list_of_number_of_nodes_hidden_layer: Unit count per dense layer.
        input_dim: Number of input features, applied to the first layer.
        output_activation: Activation of the single-unit output layer.
        filepath_to_model_weights: Path of saved weights to load. An empty
            string or None skips loading.

    Returns:
        The (uncompiled) model. Its summary is printed as a side effect.
    """
    nn = tf.keras.models.Sequential()

    for layer_number in range(num_layers):
        activation = list_of_activations[layer_number]
        if activation == 'leaky_relu':
            activation = tf.keras.layers.LeakyReLU(alpha=0.01)

        layer_options = {
            'units': list_of_number_of_nodes_hidden_layer[layer_number],
            'activation': activation,
        }
        if layer_number == 0:
            # Only the first layer declares the input width
            layer_options['input_dim'] = input_dim
        nn.add(tf.keras.layers.Dense(**layer_options))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation=output_activation))

    # Truthiness check so that None, like '', means "no weights to load"
    if filepath_to_model_weights:
        nn.load_weights(filepath_to_model_weights)

    # summary() prints the table itself and returns None, so it is not wrapped in print()
    nn.summary()
    return nn

In [56]:
# Standardize the train and test features
def scale_it(X_train, X_test):
    """Scale both splits with a StandardScaler fitted on the training split only.

    Args:
        X_train: Training features; used to fit the scaler.
        X_test: Test features; transformed with the training statistics.

    Returns:
        ``(X_train_scaled, X_test_scaled)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return fitted_scaler.transform(X_train), fitted_scaler.transform(X_test)

In [57]:
# Delete the old directory used by keras_tuner.Hyperband before a new keras_tuner.Hyperband is made
def delete_directory_and_its_content(directory_name):
    """Recursively delete ``directory_name``, reporting the outcome on stdout.

    An ``OSError`` (for example when the directory does not exist) is printed
    rather than raised, so the call is safe on a first run.

    Args:
        directory_name: Path of the directory to remove.
    """
    try:
        shutil.rmtree(directory_name)
        success_message = f"Directory '{directory_name}' has been deleted successfully."
        print(success_message)
    except OSError as error:
        print(f"Error: {error}")

In [58]:
# Expose Random Seed of Tensorflow
class RandomSeedLogger(tf.keras.callbacks.Callback):
    """Keras callback that prints TensorFlow's random seed when training begins."""
    def on_train_begin(self, logs=None):
        """Print the seed once at the start of training; ``logs`` is unused."""
        # Access the TensorFlow random seed and log it
        # NOTE(review): tf.random.get_seed may not exist in the public TF2 API (a seed
        # getter appears to live under tf.compat.v1 and takes an op-level seed) — confirm
        # before attaching this callback. No visible fit/search call passes it in.
        random_seed = tf.random.get_seed()
        print(f"Random Seed: {random_seed}")

# Create an instance of the custom callback
random_seed_logger = RandomSeedLogger()

In [59]:
# Otherize by popularity rank instead of by explicit value (e.g. 11 keeps the 11 most common values)
def otherizer2(df, column, starting_from_least_column):
    """Relabel every value ranked below a popularity cutoff as "Other".

    Values are ranked with ``value_counts`` (most common first); everything from
    position ``starting_from_least_column`` onward is replaced.

    Note: ``df`` is modified in place; the same object is returned.

    Args:
        df: DataFrame to modify.
        column: Name of the column to relabel.
        starting_from_least_column: Number of most-common values to keep.

    Returns:
        ``df`` itself, after the replacement.
    """
    rare_values = df[column].value_counts().index[starting_from_least_column:]
    print(f'len(to_replace): {len(rare_values.tolist())}')

    # The whole Index of rare values is replaced in a single call
    df[column] = df[column].replace(rare_values, "Other")

    return df

In [60]:
# Optimization by otherizing across a range of values
def optimization2(optimization_title, df, columns, start, finish, step, reset_df,
                  columns_to_remove_outliers_from=None,
                  restrict_columns_to_interquartile=False):
    """Tune models while sweeping the "Other" cutoff of high-cardinality columns.

    For every column in ``columns`` with more than 5 distinct values, and for
    every cutoff in ``range(start, finish, step)``, the column is otherized with
    ``otherizer2`` and ``run_hyperparameters`` is run on the result
    (z-score threshold 8 only, no column removed, max_epochs 20).

    Args:
        optimization_title: Prefix of the title recorded with each run.
        df: Frame to otherize. Ignored when ``reset_df`` is true.
        columns: Column names to sweep.
        start, finish, step: ``range`` arguments for the cutoff.
        reset_df: When true, start every run from the global ``application_df``
            without 'EIN' and 'NAME'. When false, the cutoffs are applied
            cumulatively to ``df``.
        columns_to_remove_outliers_from: Columns passed on for z-score outlier
            removal. Defaults to ``['ASK_AMT']``, the column used by this
            notebook's own driver cell.
        restrict_columns_to_interquartile: Passed on to ``run_hyperparameters``.
    """
    # run_hyperparameters takes eight arguments; the two added here were
    # previously not supplied, which made the call fail with a TypeError.
    if columns_to_remove_outliers_from is None:
        columns_to_remove_outliers_from = ['ASK_AMT']

    if reset_df:
        df = application_df.drop(columns=['EIN', 'NAME'])

    for column in columns:
        # Only sweep columns with enough distinct values to be worth binning
        if len(df[column].value_counts()) <= 5:
            continue

        print(f'start: {start}')
        for starting_from_least_column in range(start, finish, step):
            if reset_df:
                df = application_df.drop(columns=['EIN', 'NAME'])

            print(f'start: {start}')
            print(f'starting_from_least_column: {starting_from_least_column}')
            print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
            df = otherizer2(df, column, starting_from_least_column)
            print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
            run_hyperparameters(
                optimization_title + f': otherize columns from {starting_from_least_column}',
                df,
                columns_to_remove_outliers_from,
                8,
                9,
                '',
                20,
                restrict_columns_to_interquartile)

In [61]:
# Return the last n lines of a file
def read_last_n_lines(file_name, n):
    """Return the last ``n`` lines of a text file, newline characters included.

    Args:
        file_name: Path of the file to read.
        n: Number of trailing lines wanted. When the file has fewer lines, all
            of them are returned. Zero or a negative number yields an empty list.

    Returns:
        A list of at most ``n`` lines.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_name, 'r') as file:
        lines = file.readlines()

    # lines[-0:] is lines[0:], i.e. the whole file, so n <= 0 needs its own case
    if n <= 0:
        return []
    return lines[-n:]

In [111]:
# Read the outlier-row count recorded by the most recent run, if there is one
def _previous_outlier_row_count(file_path):
    """Return the 'number of outlier rows' logged on the last line of ``file_path``.

    Returns None when the file cannot be read, is empty, or its last line holds
    no such entry, so the first run against a fresh results file does not crash.
    """
    try:
        last_lines = read_last_n_lines(file_path, 1)
    except OSError:
        return None
    if not last_lines:
        return None
    match = re.search(r"'number of outlier rows': (\d+)", last_lines[-1])
    return int(match.group(1)) if match else None


# Create hyperband models for multiple z-score thresholds
def run_hyperparameters(optimization_title, df_new, columns_to_remove_outliers_from, begin_threshold, end_threshold, column_to_remove, max_epochs, restrict_columns_to_interquartile):
    """Run a Hyperband search per z-score threshold and record the best model.

    For each threshold in ``range(begin_threshold, end_threshold)`` the data is
    one-hot encoded, outliers are removed, the data is split and scaled, a
    Hyperband search is run, and the best model's weights and a result record
    are saved.

    Side effects: assigns the globals ``X``, ``y``, ``X_train``, ``X_test``,
    ``y_train``, ``y_test``, ``X_train_scaled`` and ``X_test_scaled`` (``X`` is
    read by ``create_model``); deletes ``./untitled_project/``; writes a weights
    file and appends a line to ``./DataFiles/optimization_results.txt``; appends
    to ``final_results_all`` and prints it at the end.

    Args:
        optimization_title: Free-text label stored with each result.
        df_new: Preprocessed frame that still contains 'IS_SUCCESSFUL'.
        columns_to_remove_outliers_from: Columns scored for z-score outliers.
            Unused when ``restrict_columns_to_interquartile`` is true.
        begin_threshold: First z-score threshold (inclusive).
        end_threshold: Last z-score threshold (exclusive).
        column_to_remove: Column dropped before encoding if it exists; a name
            that is not a column (such as '') drops nothing.
        max_epochs: ``max_epochs`` given to Hyperband.
        restrict_columns_to_interquartile: When true, use the 1.5*IQR rule on
            every encoded column instead of z-scores. The threshold is then
            only recorded, so every iteration filters identically.
    """
    global X, y, final_results_all, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled

    results_file_path = "./DataFiles/optimization_results.txt"
    numpy_and_tf_seed = 42
    hyperband_seed = 1

    if column_to_remove in df_new.columns:
        df_new_removed_column = df_new.drop(columns=[column_to_remove])
    else:
        df_new_removed_column = df_new

    df_new_removed_column = pd.get_dummies(df_new_removed_column)

    for z_threshold in range(begin_threshold, end_threshold):

        # Re-seed for every threshold so that each search starts from the same state
        np.random.seed(numpy_and_tf_seed)
        tf.random.set_seed(numpy_and_tf_seed)
        number_of_rows_before_outliers_removed = df_new_removed_column.shape[0]

        if restrict_columns_to_interquartile:
            no_outliers_df = remove_outliers_1_5_IQR_Rule(df_new_removed_column)
        else:
            _, no_outliers_df = remove_outliers(df_new_removed_column, z_threshold, columns_to_remove_outliers_from)

        number_of_outlier_rows = number_of_rows_before_outliers_removed - no_outliers_df.shape[0]

        # Informational only: show what the previous run recorded. Skipping a
        # threshold that removes the same number of rows is intentionally disabled.
        prev_number_of_outlier_rows = _previous_outlier_row_count(results_file_path)
        print(prev_number_of_outlier_rows)

        # Split our preprocessed data into our features and target array
        X, y = extract_X_y(no_outliers_df, 'IS_SUCCESSFUL')

        # Split the preprocessed data into a training and testing dataset
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=3,
                                                            stratify=y)

        # Standardize X_train and X_test
        X_train_scaled, X_test_scaled = scale_it(X_train, X_test)

        # Hyperband would otherwise resume from the trials cached in its project directory
        delete_directory_and_its_content('./untitled_project/')

        tuner = kt.Hyperband(
            create_model,
            objective="val_accuracy",
            max_epochs=max_epochs,
            hyperband_iterations=8,
            seed=hyperband_seed)

        # Run the kerastuner search for best hyperparameters
        # NOTE(review): the test split doubles as validation data, so the accuracy
        # reported below is measured on the data that selected the model.
        tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))
        first_model = tuner.get_best_models(num_models=1)[0]
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        model_loss, model_accuracy = first_model.evaluate(X_test_scaled,y_test,verbose=2)
        first_model.save_weights(f'./DataFiles/model_weights_{timestamp}_{float(model_accuracy*100):.2f}.h5')

        # Key order matters: the dict is written to the results file as text
        dict_results = {}
        dict_results['model_accuracy'] = model_accuracy
        dict_results['z_threshold'] = z_threshold
        dict_results['Optimization Title'] = optimization_title
        dict_results['number of columns'] = len(df_new_removed_column.columns)
        dict_results['param'] = tuner.get_best_hyperparameters(1)[0].values
        dict_results['columns'] = df_new_removed_column.columns
        dict_results['removed column'] = column_to_remove
        dict_results['number of outlier rows'] = number_of_outlier_rows
        dict_results['number of rows'] = no_outliers_df.shape[0]
        dict_results['max_epoch'] = max_epochs
        dict_results['hyperband seed'] = hyperband_seed
        dict_results['np.rand.seed and tf.random.set_seed'] = numpy_and_tf_seed
        final_results_all.append(dict_results)

        # Append this run's record as one line of the results file
        with open(results_file_path, "a") as file:
            file.write(str(dict_results) + "\n")
    print(final_results_all)

# Preprocessing: Remove Non-Beneficial Columns

In [63]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
# drop() returns a new frame, so application_df itself keeps both columns.
a1pplication_df = application_df.drop(columns=['EIN', 'NAME'])
a1pplication_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


# Preprocessing: Create Buckets

In [64]:
# Mean IS_SUCCESSFUL per CLASSIFICATION value, sorted ascending, to spot values sharing the same average
groupby_classification = (
    a1pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']]
    .groupby('CLASSIFICATION')
    .mean()
    .sort_values('IS_SUCCESSFUL')
)
groupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
C5200,0.0
C6100,0.0
C2600,0.0
C1732,0.0
C1820,0.0
...,...
C2150,1.0
C2170,1.0
C2500,1.0
C1283,1.0


In [65]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is exactly 0.
# The Index is wrapped in a list because the binning cell iterates over the bucket.
bucket_0 = [
    groupby_classification
    .loc[lambda rates: rates['IS_SUCCESSFUL'] == 0]
    .index
]
bucket_0

[Index(['C5200', 'C6100', 'C2600', 'C1732', 'C1820', 'C1236', 'C2380', 'C2190'], dtype='object', name='CLASSIFICATION')]

In [66]:
# Combine all CLASSIFICATION values that had a 0.0 average IS_SUCCESSFUL value
print(a1pplication_df['CLASSIFICATION'].value_counts())
a2pplication_df = a1pplication_df.copy()
# Replace in dataframe. Read from a2pplication_df (not a1pplication_df) so that every
# pass builds on the previous one instead of restarting from the unbinned column.
for cls in bucket_0:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_0")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64


C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C4500        1
C2561        1
C2150        1
Name: CLASSIFICATION, Length: 64, dtype: int64

In [67]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is exactly 1
bucket_1 = [
    groupby_classification
    .loc[lambda rates: rates['IS_SUCCESSFUL'] == 1]
    .index
]
bucket_1

[Index(['C3700', 'C7210', 'C4120', 'C4200', 'C4500', 'C4100', 'C1900', 'C2561',
        'C1234', 'C1235', 'C1245', 'C1246', 'C1248', 'C1256', 'C1257', 'C2570',
        'C1278', 'C1370', 'C1570', 'C1580', 'C1728', 'C2150', 'C2170', 'C2500',
        'C1283', 'C8210'],
       dtype='object', name='CLASSIFICATION')]

In [68]:
# Merge every CLASSIFICATION value with a mean IS_SUCCESSFUL of 1 into "bucket_1"
# Counts before binning, for comparison
print(a2pplication_df['CLASSIFICATION'].value_counts())
for labels in bucket_1:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(to_replace=labels, value="bucket_1")

# Number of distinct CLASSIFICATION values left after binning
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C4500        1
C2561        1
C2150        1
Name: CLASSIFICATION, Length: 64, dtype: int64


39

In [69]:
# Recompute the mean IS_SUCCESSFUL per CLASSIFICATION on the binned frame to look for more shared averages
g2roupby_classification = (
    a2pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']]
    .groupby('CLASSIFICATION')
    .mean()
    .sort_values('IS_SUCCESSFUL')
)
g2roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
bucket_0,0.0
C5000,0.051724
C8200,0.090909
C2100,0.226235
C2300,0.28125
C1300,0.293103
C2710,0.333333
C1200,0.462683
C3000,0.48488
C3200,0.5


In [70]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is exactly 0.5
# (taken from the original, unbinned averages in groupby_classification)
bucket_0_5 = [
    groupby_classification
    .loc[lambda rates: rates['IS_SUCCESSFUL'] == 0.5]
    .index
]
bucket_0_5

[Index(['C3200', 'C1267'], dtype='object', name='CLASSIFICATION')]

In [71]:
# Merge every CLASSIFICATION value with a mean IS_SUCCESSFUL of 0.5 into "bucket_0_5"
# Counts before binning, for comparison
print(a2pplication_df['CLASSIFICATION'].value_counts())
for labels in bucket_0_5:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(to_replace=labels, value="bucket_0_5")

# Number of distinct CLASSIFICATION values left after binning
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000       17326
C2000        6074
C1200        4837
C3000        1918
C2100        1883
C7000         777
C1700         287
C4000         194
C5000         116
C1270         114
C2700         104
C2800          95
C7100          75
bucket_1       61
C1300          58
C1280          50
C1230          36
C1400          34
C2300          32
C7200          32
C1240          30
C8000          20
C7120          18
C1500          16
C1800          15
C6000          15
C1250          14
C8200          11
C1238          10
C1237           9
bucket_0        8
C1720           6
C2400           6
C1600           5
C2710           3
C1260           3
C0              3
C1267           2
C3200           2
Name: CLASSIFICATION, dtype: int64


38

In [72]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is about 0.667
# (strictly between 0.66666 and 0.6667)
bucket_0_667 = [
    groupby_classification
    .loc[lambda rates: (rates['IS_SUCCESSFUL'] > 0.66666) & (rates['IS_SUCCESSFUL'] < 0.6667)]
    .index
]
bucket_0_667

[Index(['C0', 'C1260', 'C1720', 'C2400'], dtype='object', name='CLASSIFICATION')]

In [73]:
# Merge every CLASSIFICATION value with a mean IS_SUCCESSFUL of about 0.667 into "bucket_0_667"
# Counts before binning, for comparison
print(a2pplication_df['CLASSIFICATION'].value_counts())
for labels in bucket_0_667:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(to_replace=labels, value="bucket_0_667")

# Number of distinct CLASSIFICATION values left after binning
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000         17326
C2000          6074
C1200          4837
C3000          1918
C2100          1883
C7000           777
C1700           287
C4000           194
C5000           116
C1270           114
C2700           104
C2800            95
C7100            75
bucket_1         61
C1300            58
C1280            50
C1230            36
C1400            34
C2300            32
C7200            32
C1240            30
C8000            20
C7120            18
C1500            16
C1800            15
C6000            15
C1250            14
C8200            11
C1238            10
C1237             9
bucket_0          8
C1720             6
C2400             6
C1600             5
bucket_0_5        4
C2710             3
C1260             3
C0                3
Name: CLASSIFICATION, dtype: int64


35

In [74]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is about 0.833
# (strictly between 0.8333 and 0.8334)
bucket_0_83 = [
    groupby_classification
    .loc[lambda rates: (rates['IS_SUCCESSFUL'] > 0.8333) & (rates['IS_SUCCESSFUL'] < 0.8334)]
    .index
]
bucket_0_83

[Index(['C1230', 'C1240', 'C7120'], dtype='object', name='CLASSIFICATION')]

In [75]:
# Merge every CLASSIFICATION value with a mean IS_SUCCESSFUL of about 0.833 into "bucket_0_83"
# Counts before binning, for comparison
print(a2pplication_df['CLASSIFICATION'].value_counts())
for labels in bucket_0_83:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(to_replace=labels, value="bucket_0_83")

# Number of distinct CLASSIFICATION values left after binning
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000           17326
C2000            6074
C1200            4837
C3000            1918
C2100            1883
C7000             777
C1700             287
C4000             194
C5000             116
C1270             114
C2700             104
C2800              95
C7100              75
bucket_1           61
C1300              58
C1280              50
C1230              36
C1400              34
C7200              32
C2300              32
C1240              30
C8000              20
C7120              18
bucket_0_667       18
C1500              16
C1800              15
C6000              15
C1250              14
C8200              11
C1238              10
C1237               9
bucket_0            8
C1600               5
bucket_0_5          4
C2710               3
Name: CLASSIFICATION, dtype: int64


33

In [76]:
# Collect the CLASSIFICATION values whose mean IS_SUCCESSFUL is about 0.8667
# (strictly between 0.8666 and 0.8667)
bucket_0_8667 = [
    groupby_classification
    .loc[lambda rates: (rates['IS_SUCCESSFUL'] > 0.8666) & (rates['IS_SUCCESSFUL'] < 0.8667)]
    .index
]
bucket_0_8667

[Index(['C1800', 'C6000'], dtype='object', name='CLASSIFICATION')]

In [77]:
# Merge every CLASSIFICATION value with a mean IS_SUCCESSFUL of about 0.8667 into "bucket_0_8667"
# Counts before binning, for comparison
print(a2pplication_df['CLASSIFICATION'].value_counts())
for labels in bucket_0_8667:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(to_replace=labels, value="bucket_0_8667")

# Number of distinct CLASSIFICATION values left after binning
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000           17326
C2000            6074
C1200            4837
C3000            1918
C2100            1883
C7000             777
C1700             287
C4000             194
C5000             116
C1270             114
C2700             104
C2800              95
bucket_0_83        84
C7100              75
bucket_1           61
C1300              58
C1280              50
C1400              34
C7200              32
C2300              32
C8000              20
bucket_0_667       18
C1500              16
C1800              15
C6000              15
C1250              14
C8200              11
C1238              10
C1237               9
bucket_0            8
C1600               5
bucket_0_5          4
C2710               3
Name: CLASSIFICATION, dtype: int64


32

In [78]:
# Mean IS_SUCCESSFUL per CLASSIFICATION after all the binning above, sorted ascending
g2roupby_classification = (
    a2pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']]
    .groupby('CLASSIFICATION')
    .mean()
    .sort_values('IS_SUCCESSFUL')
)
g2roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
bucket_0,0.0
C5000,0.051724
C8200,0.090909
C2100,0.226235
C2300,0.28125
C1300,0.293103
C2710,0.333333
C1200,0.462683
C3000,0.48488
bucket_0_5,0.5


In [79]:
# Mean IS_SUCCESSFUL per APPLICATION_TYPE value, sorted ascending, to spot values sharing the same average
g3roupby_classification = (
    a2pplication_df[['APPLICATION_TYPE', 'IS_SUCCESSFUL']]
    .groupby('APPLICATION_TYPE')
    .mean()
    .sort_values('IS_SUCCESSFUL')
)
g3roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
APPLICATION_TYPE,Unnamed: 1_level_1
T19,0.188732
T4,0.305447
T8,0.421981
T3,0.53216
T2,0.5625
T7,0.565517
T25,0.666667
T9,0.717949
T6,0.740132
T5,0.767263


In [80]:
# Collect the APPLICATION_TYPE values whose mean IS_SUCCESSFUL is exactly 1
bucket_APP_TYP_1 = [
    g3roupby_classification
    .loc[lambda rates: rates['IS_SUCCESSFUL'] == 1]
    .index
]
bucket_APP_TYP_1

[Index(['T15', 'T29', 'T14', 'T17'], dtype='object', name='APPLICATION_TYPE')]

In [81]:
# Merge every APPLICATION_TYPE value with a mean IS_SUCCESSFUL of 1 into "bucket_APP_TYP_1"
# Counts before binning, for comparison
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
for labels in bucket_APP_TYP_1:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(to_replace=labels, value="bucket_APP_TYP_1")

# Number of distinct APPLICATION_TYPE values left after binning
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


14

In [82]:
# Collect the APPLICATION_TYPE values whose mean IS_SUCCESSFUL lies strictly between 0.87 and 0.89
# (the averages 0.87878 and 0.8888)
bucket_APP_TYP_0_88 = [
    g3roupby_classification
    .loc[lambda rates: (rates['IS_SUCCESSFUL'] > 0.87) & (rates['IS_SUCCESSFUL'] < 0.89)]
    .index
]
bucket_APP_TYP_0_88

[Index(['T10', 'T12'], dtype='object', name='APPLICATION_TYPE')]

In [83]:
# Merge the APPLICATION_TYPE values averaging 0.87878 and 0.8888 into "bucket_APP_TYP_0_88"
# Counts before binning, for comparison
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
for labels in bucket_APP_TYP_0_88:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(to_replace=labels, value="bucket_APP_TYP_0_88")

# Number of distinct APPLICATION_TYPE values left after binning
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3                  27037
T4                   1542
T6                   1216
T5                   1173
T19                  1065
T8                    737
T7                    725
T10                   528
T9                    156
T13                    66
T12                    27
T2                     16
bucket_APP_TYP_1        8
T25                     3
Name: APPLICATION_TYPE, dtype: int64


13

In [84]:
# Collect the APPLICATION_TYPE values whose mean IS_SUCCESSFUL lies strictly between 0.56 and 0.57
# (the averages 0.5625 and 0.5655)
bucket_APP_TYP_0_57 = [
    g3roupby_classification
    .loc[lambda rates: (rates['IS_SUCCESSFUL'] > 0.56) & (rates['IS_SUCCESSFUL'] < 0.57)]
    .index
]
bucket_APP_TYP_0_57

[Index(['T2', 'T7'], dtype='object', name='APPLICATION_TYPE')]

In [85]:
# Merge the APPLICATION_TYPE values averaging 0.5625 and 0.5655 into "bucket_APP_TYP_0_57"
# Counts before binning, for comparison
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
for labels in bucket_APP_TYP_0_57:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(to_replace=labels, value="bucket_APP_TYP_0_57")

# Number of distinct APPLICATION_TYPE values left after binning
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3                     27037
T4                      1542
T6                      1216
T5                      1173
T19                     1065
T8                       737
T7                       725
bucket_APP_TYP_0_88      555
T9                       156
T13                       66
T2                        16
bucket_APP_TYP_1           8
T25                        3
Name: APPLICATION_TYPE, dtype: int64


12

In [121]:
# Mean IS_SUCCESSFUL per APPLICATION_TYPE after binning, sorted ascending.
# NOTE: this rebinds g3roupby_classification, which the APPLICATION_TYPE bucket cells
# above read; re-running those cells after this one would select from the binned averages.
g3roupby_classification = (
    a2pplication_df[['APPLICATION_TYPE', 'IS_SUCCESSFUL']]
    .groupby('APPLICATION_TYPE')
    .mean()
    .sort_values('IS_SUCCESSFUL')
)
g3roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
APPLICATION_TYPE,Unnamed: 1_level_1
T19,0.188732
T4,0.305447
T8,0.421981
T3,0.533239
bucket_APP_TYP_0_57,0.565452
T25,0.666667
T9,0.719178
T6,0.740132
T5,0.767918
bucket_APP_TYP_0_88,0.879061


# Preprocessing: Remove Unbound Data

In [86]:
# List the distinct INCOME_AMT categories present before any rows are removed
a2pplication_df['INCOME_AMT'].unique()


array(['0', '1-9999', '10000-24999', '100000-499999', '10M-50M',
       '25000-99999', '50M+', '1M-5M', '5M-10M'], dtype=object)

In [87]:
# Drop the rows in the open-ended '50M+' income bracket.
# .copy() makes the filtered result an independent frame, so the column assignments
# in the cells below do not trigger pandas' SettingWithCopyWarning.
a2pplication_df = a2pplication_df[a2pplication_df['INCOME_AMT'] != '50M+'].copy()
a2pplication_df['INCOME_AMT'].unique()


array(['0', '1-9999', '10000-24999', '100000-499999', '10M-50M',
       '25000-99999', '1M-5M', '5M-10M'], dtype=object)

In [88]:
# Row and column count after dropping the '50M+' rows
a2pplication_df.shape

(34160, 10)

# Preprocessing: Scale INCOME_AMT

In [89]:
# Column dtypes before INCOME_AMT is converted to a number
a2pplication_df.dtypes


APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [90]:
# Replace each INCOME_AMT range label with the (approximate) midpoint of its range.
# The label '0' has no entry here and is converted as-is by the cast below.
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace({
    '1-9999': "5000",
    '10000-24999': "17500",
    '100000-499999': "300000",
    '10M-50M': "30000000",
    '1M-5M': "3000000",
    '25000-99999': "62500",
    '5M-10M': "7500000",
})

# Turn the numeric strings into integers so the column is on a numeric scale
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].astype('int64')
a2pplication_df.dtypes

APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                 int64
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [91]:
# Confirm that INCOME_AMT now holds only the integer midpoint values
a2pplication_df['INCOME_AMT'].unique()

array([       0,     5000,    17500,   300000, 30000000,    62500,
        3000000,  7500000], dtype=int64)

# Create Model and Process Data

In [101]:
# Search settings: z-score thresholds start_threshold..end_threshold-1 and the Hyperband epoch cap
start_threshold, end_threshold = 8, 9
max_epoch = 10

# One tuning run per column, removing outliers from that column only
columns = ['ASK_AMT']
for column in columns:
    df = a2pplication_df.copy()
    restrict_columns_to_interquartile = False
    columns_to_remove_outliers_from = [column]
    run_hyperparameters(
        f'"{columns_to_remove_outliers_from}" is the only column with outliers removed '
        f'with an increased number of hyperband iterations from 2 to 8; '
        f'try hyperparameters across thresholds {start_threshold} to {end_threshold} '
        f'excluding {end_threshold} with max_epoch {max_epoch}',
        df,
        columns_to_remove_outliers_from,
        start_threshold,
        end_threshold,
        '',
        max_epoch,
        restrict_columns_to_interquartile,
    )

Trial 240 Complete [00h 00m 13s]
val_accuracy: 0.7384777665138245

Best val_accuracy So Far: 0.7409405708312988
Total elapsed time: 00h 28m 45s
267/267 - 0s - loss: 0.5419 - accuracy: 0.7409 - 346ms/epoch - 1ms/step
[{'model_accuracy': 0.7402368783950806, 'z_threshold': 8, 'Optimization Title': '"[\'ASK_AMT\']" is the only column with outliers removed with an increased number of hyperband iterations from 2 to 8; try hyperparameters across thresholds 8 to 9 excluding 9 with max_epoch 10', 'number of columns': 65, 'param': {'num_layers': 4, 'activation_layer_input': 'relu', 'neurons_layer_input': 99, 'learning_rate': 0.0005880431087905998, 'activation_layer_1': 'leaky_relu', 'neurons_layer_1': 52, 'activation_layer_2': 'relu', 'neurons_layer_2': 67, 'activation_layer_3': 'relu', 'neurons_layer_3': 111, 'activation_layer_4': 'leaky_relu', 'neurons_layer_4': 40, 'activation_layer_5': 'leaky_relu', 'neurons_layer_5': 96, 'tuner/epochs': 10, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tun

# Reproduce Results of Model Found By Hyperband

## Here are the specs from the best fitting model

#### (These specifications, along with many other optimization tests, were captured in the file ./DataFiles/optimization_results.txt)<br><br>

{'model_accuracy': 0.7409405708312988, <br>
'z_threshold': 8, <br>
'Optimization Title': '"[\'ASK_AMT\']" is the only column with outliers removed with an increased number of hyperband iterations from 2 to 8; <br>
try hyperparameters across thresholds 8 to 9 excluding 9 with max_epoch 10', <br>
'number of columns': 65, <br>
'param': {<br>
    'num_layers': 6, <br>
    'activation_layer_input': 'tanh', <br>
    'neurons_layer_input': 74, <br>
    'learning_rate': 0.0005293391761557029, <br>
    'activation_layer_1': 'leaky_relu', <br>
    'neurons_layer_1': 50, <br>
    'activation_layer_2': 'leaky_relu', <br>
    'neurons_layer_2': 28, <br>
    'activation_layer_3': 'relu', <br>
    'neurons_layer_3': 17, <br>
    'activation_layer_4': 'relu', <br>
    'neurons_layer_4': 48, <br>
    'activation_layer_5': 'leaky_relu', <br>
    'neurons_layer_5': 41, <br>
    'tuner/epochs': 10, <br>
    'tuner/initial_epoch': 4, <br>
    'tuner/bracket': 1, <br>
    'tuner/round': 1, <br>
    'tuner/trial_id': '0048'<br>
    }, <br>
    'columns': Index(['STATUS', 'INCOME_AMT', 'ASK_AMT', 'IS_SUCCESSFUL',<br>
       'APPLICATION_TYPE_T13', 'APPLICATION_TYPE_T19', 'APPLICATION_TYPE_T25',<br>
       'APPLICATION_TYPE_T3', 'APPLICATION_TYPE_T4', 'APPLICATION_TYPE_T5',<br>
       'APPLICATION_TYPE_T6', 'APPLICATION_TYPE_T8', 'APPLICATION_TYPE_T9',<br>
       'APPLICATION_TYPE_bucket_APP_TYP_0_57',<br>
       'APPLICATION_TYPE_bucket_APP_TYP_0_88',<br>
       'APPLICATION_TYPE_bucket_APP_TYP_1', 'AFFILIATION_CompanySponsored',<br>
       'AFFILIATION_Family/Parent', 'AFFILIATION_Independent',<br>
       'AFFILIATION_National', 'AFFILIATION_Other', 'AFFILIATION_Regional',<br>
       'CLASSIFICATION_C1000', 'CLASSIFICATION_C1200', 'CLASSIFICATION_C1237',<br>
       'CLASSIFICATION_C1238', 'CLASSIFICATION_C1250', 'CLASSIFICATION_C1270',<br>
       'CLASSIFICATION_C1280', 'CLASSIFICATION_C1300', 'CLASSIFICATION_C1400',<br>
       'CLASSIFICATION_C1500', 'CLASSIFICATION_C1600', 'CLASSIFICATION_C1700',<br>
       'CLASSIFICATION_C2000', 'CLASSIFICATION_C2100', 'CLASSIFICATION_C2300',<br>
       'CLASSIFICATION_C2700', 'CLASSIFICATION_C2710', 'CLASSIFICATION_C2800',<br>
       'CLASSIFICATION_C3000', 'CLASSIFICATION_C4000', 'CLASSIFICATION_C5000',<br>
       'CLASSIFICATION_C7000', 'CLASSIFICATION_C7100', 'CLASSIFICATION_C7200',<br>
       'CLASSIFICATION_C8000', 'CLASSIFICATION_C8200',<br>
       'CLASSIFICATION_bucket_0', 'CLASSIFICATION_bucket_0_5',<br>
       'CLASSIFICATION_bucket_0_667', 'CLASSIFICATION_bucket_0_83',<br>
       'CLASSIFICATION_bucket_0_8667', 'CLASSIFICATION_bucket_1',<br>
       'USE_CASE_CommunityServ', 'USE_CASE_Heathcare', 'USE_CASE_Other',<br>
       'USE_CASE_Preservation', 'USE_CASE_ProductDev',<br>
       'ORGANIZATION_Association', 'ORGANIZATION_Co-operative',<br>
       'ORGANIZATION_Corporation', 'ORGANIZATION_Trust',<br>
       'SPECIAL_CONSIDERATIONS_N', 'SPECIAL_CONSIDERATIONS_Y'],<br>
      dtype='object'), <br>
      'removed column': '', <br>
      'number of outlier rows': 53, <br>
      'number of rows': 34107, <br>
      'max_epoch': 10, <br>
      'hyperband seed': 1, <br>
      'np.rand.seed and tf.random.set_seed': 42}<br>


In [106]:
# Rebuild X, y, X_train, X_test, y_train, y_test, X_train_scaled and X_test_scaled
# Fix the NumPy and TensorFlow seeds first
tf.random.set_seed(42)
np.random.seed(42)

# Settings handed to remove_outliers
z_threshold = 8
columns_to_remove_outliers_from = ['ASK_AMT']

# One-hot encode a copy of the cleaned frame and run it through remove_outliers
df_new_removed_column = pd.get_dummies(a2pplication_df.copy())
outlier_rows, no_outliers_df = remove_outliers(
    df_new_removed_column,
    z_threshold,
    columns_to_remove_outliers_from,
)

# Separate the preprocessed data into the feature and target arrays
X, y = extract_X_y(no_outliers_df, 'IS_SUCCESSFUL')

# Stratified train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=3)

# Standardize X_train and X_test
X_train_scaled, X_test_scaled = scale_it(X_train, X_test)

Outlier rows:
Int64Index([  676,   941,  1972,  2328,  3477,  6457,  6465,  8016,  9289,
            10005, 10291, 10806, 10979, 11903, 11943, 12049, 12606, 17266,
            17267, 20469, 20966, 21156, 21209, 21227, 21301, 21555, 21612,
            21634, 21640, 21697, 21937, 23887, 24128, 24844, 25168, 25514,
            25848, 28083, 29079, 29180, 29488, 29732, 30102, 30563, 31583,
            32542, 32552, 33023, 33064, 33277, 33450, 34034, 34187],
           dtype='int64')


In [118]:
# Rebuild the tensorflow.keras.models.Sequential model from the specs recorded in
# './DataFiles/optimization_results.txt', together with the weights stored in
# './DataFiles/model_weights_20231113-013859_74.09.h5'
filepath_to_model_weights = './DataFiles/model_weights_20231113-013859_74.09.h5'

# Per-layer settings in order: activation function and node count
list_of_activations = ['tanh', 'leaky_relu', 'leaky_relu', 'relu', 'relu', 'leaky_relu']
list_of_number_of_nodes_hidden_layer = [74, 50, 28, 17, 48, 41]
num_layers = 6
output_activation = 'sigmoid'

# The input width follows the number of feature columns in X
input_dim = X.shape[1]

replicated_model = create_specific_model(
    list_of_activations,
    num_layers,
    list_of_number_of_nodes_hidden_layer,
    input_dim,
    output_activation,
    filepath_to_model_weights,
)
replicated_model


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 74)                4810      
                                                                 
 dense_29 (Dense)            (None, 50)                3750      
                                                                 
 dense_30 (Dense)            (None, 28)                1428      
                                                                 
 dense_31 (Dense)            (None, 17)                493       
                                                                 
 dense_32 (Dense)            (None, 48)                864       
                                                                 
 dense_33 (Dense)            (None, 41)                2009      
                                                                 
 dense_34 (Dense)            (None, 1)                

<keras.src.engine.sequential.Sequential at 0x1de2bd96890>

In [119]:
# The learning rate, loss and optimizer also come from './DataFiles/optimization_results.txt';
# the one exception is epochs, which was raised to 100 in the hope of maximizing accuracy

lr = 0.0005293391761557029
replicated_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

fit_model = replicated_model.fit(x=X_train_scaled, y=y_train, epochs=100)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [120]:
# Score the replicated model on the scaled test split to get its loss and accuracy
model_loss, model_accuracy = replicated_model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)

267/267 - 0s - loss: 0.5512 - accuracy: 0.7409 - 296ms/epoch - 1ms/step
