In [76]:
# Import our dependencies
# !pip install keras_tuner
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import keras_tuner as kt
import datetime
import shutil


#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
final_results_all = []
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
a1pplication_df = application_df.drop(columns=['EIN', 'NAME'])
a1pplication_df

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1
...,...,...,...,...,...,...,...,...,...,...
34294,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1


In [28]:
def otherize_column(df, column, list_of_values_to_replace):
    df_copy = df.copy()


    # Replace in dataframe
    for app in list_of_values_to_replace:
        df_copy[column] = df_copy[column].replace(app,"Other")

    # Check to make sure binning was successful
    # print(df_copy[column].value_counts())

    return df_copy

In [29]:
def otherizations(df, list_of_columns_to_otherize, list_of_list_of_values_to_replace):
    for index in range(len(list_of_columns_to_otherize)):
        df = otherize_column(df, list_of_columns_to_otherize[index], list_of_list_of_values_to_replace[index])

    return df

In [40]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()


    # number of layers including input layer but excluding output layer
    num_layers = hp.Int('num_layers', 1, 6)

    activation = hp.Choice('activation_layer_input', ['relu', 'tanh', 'leaky_relu'])

    #  Allow kerastuner to decide which activation function to use in hidden layers
    if activation == 'relu':
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('neurons_layer_input', min_value=X.shape[1], max_value=round(X.shape[1] * 2), step=5), activation='relu', input_dim=X.shape[1]))
    elif activation == 'tanh':
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('neurons_layer_input', min_value=X.shape[1], max_value=round(X.shape[1] * 2), step=5), activation='tanh', input_dim=X.shape[1]))
    elif activation == 'leaky_relu':
        activation = tf.keras.layers.LeakyReLU(alpha=0.01)
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('neurons_layer_input', min_value=X.shape[1], max_value=round(X.shape[1] * 2), step=5), activation=tf.keras.layers.LeakyReLU(alpha=0.01), input_dim=X.shape[1]))


    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(1, num_layers):

        activation = hp.Choice(f'activation_layer_{i}', ['relu', 'leaky_relu'])

        #  Allow kerastuner to decide which activation function to use in hidden layers
        if activation == 'leaky_relu':
            activation = tf.keras.layers.LeakyReLU(alpha=0.01)
        nn_model.add(tf.keras.layers.Dense(units=hp.Int(f'neurons_layer_{i}',
            min_value=2,
            max_value=round(X.shape[1] * 2),
            step=1),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # hp.Choice('random_seed2', [str(np.random.get_state())])
    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [31]:
def remove_outliers(df, threshold):
    # Calculate Z-scores for each column using scipy.stats.zscore
    z_scores = np.abs(stats.zscore(df))

    # Identify outliers using boolean masks
    outliers = (z_scores > threshold)

    # Print the rows with outliers
    outlier_rows = df[outliers.any(axis=1)]

    # print(outlier_rows)
    print(outlier_rows.index)
    return (outlier_rows, df[~outliers.any(axis=1)])

In [32]:
def extract_X_y(df, y_column_name):
    # Split our preprocessed data into our features and target arrays
    # print(a2pplication_df.columns)
    X = df.drop(columns=[y_column_name])
    y = df[y_column_name]
    return (X, y)

In [33]:
def create_specific_model(activation, num_layers, num_nodes_input_layer, list_of_number_of_nodes_hidden_layer, input_dim, output_activation):

    nn = tf.keras.models.Sequential()

    if activation == 'leaky_relu':
        activation = tf.keras.layers.LeakyReLU(alpha=0.01)

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units = num_nodes_input_layer, activation = activation, input_dim = input_dim))

    for hidden_layer_num in range(0, num_layers):
        nn.add(tf.keras.layers.Dense(units = list_of_number_of_nodes_hidden_layer[hidden_layer_num], activation = activation))

    # Output layer
    nn.add(tf.keras.layers.Dense(units = 1, activation = output_activation))


    # Check the structure of the model
    print(nn.summary())
    return nn

In [34]:
def scale_it(X_train, X_test):
    # Create a StandardScaler instances
    scaler = StandardScaler()

    # Fit the StandardScaler
    X_scaler = scaler.fit(X_train)

    # Scale the data
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

In [35]:
def delete_directory_and_its_content(directory_name):

    # Use shutil.rmtree() to recursively delete directories and subdirectories
    try:
        shutil.rmtree(directory_name)
        print(f"Directory '{directory_name}' has been deleted successfully.")
    except OSError as e:
        print(f"Error: {e}")

In [36]:
class RandomSeedLogger(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        # Access the TensorFlow random seed and log it
        random_seed = tf.random.get_seed()
        print(f"Random Seed: {random_seed}")

# Create an instance of the custom callback
random_seed_logger = RandomSeedLogger()

In [37]:
def otherizer2(df, column, starting_from_least_column):
    to_replace = [df[column].value_counts().index[starting_from_least_column:]]
    # print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
    print(f'len(to_replace): {len(to_replace[0].tolist())}')
    # print(f'to_replace: {to_replace}')
    # Replace in dataframe
    for cls in to_replace:
        df[column] = df[column].replace(cls,"Other")

    # print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
    return df

In [42]:

def optimization2(optimization_title, df, columns, start, finish, step, reset_df):
    if reset_df:
        df = application_df.drop(columns=['EIN', 'NAME'])
    for column in columns:
        if len(df[column].value_counts()) > 5:
            print(f'start: {start}')
            for starting_from_least_column in range(start, finish, step):
                if reset_df:
                    df = application_df.drop(columns=['EIN', 'NAME'])

                print(f'start: {start}')
                print(f'starting_from_least_column: {starting_from_least_column}')
                print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
                df = otherizer2(df, column, starting_from_least_column)
                print(f'len(df[column].value_counts().index): {len(df[column].value_counts().index.tolist())}')
                run_hyperparameters(optimization_title + f': otherize columns from {starting_from_least_column}', df, 8, 9, '', 20)

In [82]:
def run_hyperparameters(optimization_title, df_new, begin_threshold, end_threshold, column_to_remove, max_epochs):
    global X, y, final_results_all, X_train, X_test, y_train, y_test, X_train_scaled, X_test_scaled

    if column_to_remove in df_new.columns:
        df_new_removed_column = df_new.drop(columns=[column_to_remove])
    else:
        df_new_removed_column = df_new


    df_new_removed_column = pd.get_dummies(df_new_removed_column)
    # return df_new_removed_column
    # print('number of columns', len(df_new_removed_column.columns))
    # return
    for z_threshold in range(begin_threshold, end_threshold):

        np.random.seed(42)
        tf.random.set_seed(42)
        outlier_rows, no_outliers_df = remove_outliers(df_new_removed_column, z_threshold)
        # print('number of columns', len(df_new_removed_column.columns))
        # return

        # Split our preprocessed data into our features and target array
        X, y = extract_X_y(no_outliers_df, 'IS_SUCCESSFUL')

        # Split the preprocessed data into a training and testing dataset
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=3,
                                                            stratify=y)

        # Standardize X_train and X_test
        X_train_scaled, X_test_scaled = scale_it(X_train, X_test)

        delete_directory_and_its_content('./untitled_project/')
        dict_results = {}
        dict_results['Optimization Title'] = optimization_title
        dict_results['columns'] = df_new_removed_column.columns
        dict_results['removed column'] = column_to_remove
        dict_results['z_threshold'] = z_threshold
        dict_results['number of outlier rows'] = outlier_rows.shape[0]
        dict_results['number of rows'] = no_outliers_df.shape[0]
        dict_results['max_epoch'] = max_epochs

        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath = 'model_weights_epoch_{epoch:02d}.h5',
            save_weights_only = True,
            period = 1
        )

        tuner = kt.Hyperband(
            create_model,
            objective="val_accuracy",
            max_epochs=max_epochs,
            hyperband_iterations=2,
            seed=1)#,
            #callbacks=[checkpoint_callback])

        # Run the kerastuner search for best hyperparameters
        tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))
        dict_results['param'] = tuner.get_best_hyperparameters(1)[0].values
        first_model = tuner.get_best_models(num_models=1)[0]
        timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        model_loss, model_accuracy = first_model.evaluate(X_test_scaled,y_test,verbose=2)
        first_model.save_weights(f'model_weights_{timestamp}_{int(model_accuracy*100)}.h5')
        dict_results['number of columns'] = len(df_new_removed_column.columns)
        dict_results['model_accuracy'] = model_accuracy
        dict_results['hyperband seed'] = 1
        dict_results['np.rand.seed and tf.random.set_seed'] = 42
        final_results_all.append(dict_results)
        file_path = "optimization_results.txt"

        # Open the file in append mode
        with open(file_path, "a") as file:
            # Append a line to the file
            # line_to_append = "This is a new line to append to the file."
            file.write(str(dict_results) + "\n")
    print(final_results_all)

In [17]:

# a1pplication_df
start = 21
end = 22
step = 1
print(len(a1pplication_df.columns))
optimization2('increase number of epochs from 20 to 100, increase maximum number of layers from 6 to 11 (includes input layer but not output)', a1pplication_df, ['CLASSIFICATION'], start, end, step, True)

10
start: 21
start: 21
starting_from_least_column: 21
len(df[column].value_counts().index): 71
len(to_replace): 50
len(df[column].value_counts().index): 22


NameError: ignored

In [None]:
start_threshold = 7
end_threshold = 11
max_epoch = 30
df = a1pplication_df.copy()
run_hyperparameters('try hyperparameters across thresholds 7 to 11 with max_epoch 30 and 11 possible layers', df, start_threshold, end_threshold, '', 30)

Trial 51 Complete [00h 01m 23s]
val_accuracy: 0.7227670550346375

Best val_accuracy So Far: 0.7228995561599731
Total elapsed time: 00h 10m 32s

Search: Running Trial #52

Value             |Best Value So Far |Hyperparameter
2                 |2                 |num_layers
leaky_relu        |leaky_relu        |activation_layer_input
191               |141               |neurons_layer_input
leaky_relu        |relu              |activation_layer_1
155               |2                 |neurons_layer_1
tanh              |relu              |activation_layer_2
37                |92                |neurons_layer_2
leaky_relu        |relu              |activation_layer_3
5                 |73                |neurons_layer_3
leaky_relu        |relu              |activation_layer_4
148               |203               |neurons_layer_4
leaky_relu        |tanh              |activation_layer_5
183               |125               |neurons_layer_5
tanh              |relu              |activation_laye

In [50]:
groupby_classification = a1pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']].groupby('CLASSIFICATION').mean().sort_values('IS_SUCCESSFUL')
groupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
C5200,0.0
C6100,0.0
C2600,0.0
C1732,0.0
C1820,0.0
...,...
C2150,1.0
C2170,1.0
C2500,1.0
C1283,1.0


In [51]:
bucket_0 = [groupby_classification[groupby_classification['IS_SUCCESSFUL'] == 0].index]
bucket_0

[Index(['C5200', 'C6100', 'C2600', 'C1732', 'C1820', 'C1236', 'C2380', 'C2190'], dtype='object', name='CLASSIFICATION')]

In [52]:
print(a1pplication_df['CLASSIFICATION'].value_counts())
a2pplication_df = a1pplication_df.copy()
# Replace in dataframe
for cls in bucket_0:
    a2pplication_df['CLASSIFICATION'] = a1pplication_df['CLASSIFICATION'].replace(cls,"bucket_0")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C2561        1
C4500        1
C2150        1
Name: CLASSIFICATION, Length: 71, dtype: int64


C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C4500        1
C2561        1
C2150        1
Name: CLASSIFICATION, Length: 64, dtype: int64

In [53]:
bucket_1 = [groupby_classification[groupby_classification['IS_SUCCESSFUL'] == 1].index]
bucket_1

[Index(['C3700', 'C7210', 'C4120', 'C4200', 'C4500', 'C4100', 'C1900', 'C2561',
        'C1234', 'C1235', 'C1245', 'C1246', 'C1248', 'C1256', 'C1257', 'C2570',
        'C1278', 'C1370', 'C1570', 'C1580', 'C1728', 'C2150', 'C2170', 'C2500',
        'C1283', 'C8210'],
       dtype='object', name='CLASSIFICATION')]

In [54]:
print(a2pplication_df['CLASSIFICATION'].value_counts())
# Replace in dataframe
for cls in bucket_1:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_1")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C4120        1
C8210        1
C4500        1
C2561        1
C2150        1
Name: CLASSIFICATION, Length: 64, dtype: int64


39

In [55]:
g2roupby_classification = a2pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']].groupby('CLASSIFICATION').mean().sort_values('IS_SUCCESSFUL')
g2roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
bucket_0,0.0
C5000,0.051724
C8200,0.090909
C2100,0.226235
C2300,0.28125
C1300,0.293103
C2710,0.333333
C1200,0.462683
C3000,0.48488
C3200,0.5


In [56]:
bucket_0_5 = [groupby_classification[groupby_classification['IS_SUCCESSFUL'] == 0.5].index]
bucket_0_5

[Index(['C3200', 'C1267'], dtype='object', name='CLASSIFICATION')]

In [57]:
print(a2pplication_df['CLASSIFICATION'].value_counts())
# Replace in dataframe
for cls in bucket_0_5:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_0_5")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000       17326
C2000        6074
C1200        4837
C3000        1918
C2100        1883
C7000         777
C1700         287
C4000         194
C5000         116
C1270         114
C2700         104
C2800          95
C7100          75
bucket_1       61
C1300          58
C1280          50
C1230          36
C1400          34
C2300          32
C7200          32
C1240          30
C8000          20
C7120          18
C1500          16
C1800          15
C6000          15
C1250          14
C8200          11
C1238          10
C1237           9
bucket_0        8
C1720           6
C2400           6
C1600           5
C2710           3
C1260           3
C0              3
C1267           2
C3200           2
Name: CLASSIFICATION, dtype: int64


38

In [58]:
bucket_0_667 = [groupby_classification[(groupby_classification['IS_SUCCESSFUL'] > 0.66666) & (groupby_classification['IS_SUCCESSFUL'] < 0.6667)].index]
bucket_0_667

[Index(['C0', 'C1260', 'C1720', 'C2400'], dtype='object', name='CLASSIFICATION')]

In [59]:
print(a2pplication_df['CLASSIFICATION'].value_counts())
# Replace in dataframe
for cls in bucket_0_667:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_0_667")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000         17326
C2000          6074
C1200          4837
C3000          1918
C2100          1883
C7000           777
C1700           287
C4000           194
C5000           116
C1270           114
C2700           104
C2800            95
C7100            75
bucket_1         61
C1300            58
C1280            50
C1230            36
C1400            34
C2300            32
C7200            32
C1240            30
C8000            20
C7120            18
C1500            16
C1800            15
C6000            15
C1250            14
C8200            11
C1238            10
C1237             9
bucket_0          8
C1720             6
C2400             6
C1600             5
bucket_0_5        4
C2710             3
C1260             3
C0                3
Name: CLASSIFICATION, dtype: int64


35

In [60]:
bucket_0_83 = [groupby_classification[(groupby_classification['IS_SUCCESSFUL'] > 0.8333) & (groupby_classification['IS_SUCCESSFUL'] < 0.8334)].index]
bucket_0_83

[Index(['C1230', 'C1240', 'C7120'], dtype='object', name='CLASSIFICATION')]

In [61]:
print(a2pplication_df['CLASSIFICATION'].value_counts())
# Replace in dataframe
for cls in bucket_0_83:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_0_83")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000           17326
C2000            6074
C1200            4837
C3000            1918
C2100            1883
C7000             777
C1700             287
C4000             194
C5000             116
C1270             114
C2700             104
C2800              95
C7100              75
bucket_1           61
C1300              58
C1280              50
C1230              36
C1400              34
C7200              32
C2300              32
C1240              30
C8000              20
C7120              18
bucket_0_667       18
C1500              16
C1800              15
C6000              15
C1250              14
C8200              11
C1238              10
C1237               9
bucket_0            8
C1600               5
bucket_0_5          4
C2710               3
Name: CLASSIFICATION, dtype: int64


33

In [62]:
bucket_0_8667 = [groupby_classification[(groupby_classification['IS_SUCCESSFUL'] > 0.8666) & (groupby_classification['IS_SUCCESSFUL'] < 0.8667)].index]
bucket_0_8667

[Index(['C1800', 'C6000'], dtype='object', name='CLASSIFICATION')]

In [63]:
print(a2pplication_df['CLASSIFICATION'].value_counts())
# Replace in dataframe
for cls in bucket_0_8667:
    a2pplication_df['CLASSIFICATION'] = a2pplication_df['CLASSIFICATION'].replace(cls,"bucket_0_8667")

# Check to make sure binning was successful
a2pplication_df['CLASSIFICATION'].value_counts().count()

C1000           17326
C2000            6074
C1200            4837
C3000            1918
C2100            1883
C7000             777
C1700             287
C4000             194
C5000             116
C1270             114
C2700             104
C2800              95
bucket_0_83        84
C7100              75
bucket_1           61
C1300              58
C1280              50
C1400              34
C7200              32
C2300              32
C8000              20
bucket_0_667       18
C1500              16
C1800              15
C6000              15
C1250              14
C8200              11
C1238              10
C1237               9
bucket_0            8
C1600               5
bucket_0_5          4
C2710               3
Name: CLASSIFICATION, dtype: int64


32

In [64]:
g2roupby_classification = a2pplication_df[['CLASSIFICATION', 'IS_SUCCESSFUL']].groupby('CLASSIFICATION').mean().sort_values('IS_SUCCESSFUL')
g2roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
CLASSIFICATION,Unnamed: 1_level_1
bucket_0,0.0
C5000,0.051724
C8200,0.090909
C2100,0.226235
C2300,0.28125
C1300,0.293103
C2710,0.333333
C1200,0.462683
C3000,0.48488
bucket_0_5,0.5


In [65]:
g3roupby_classification = a2pplication_df[['APPLICATION_TYPE', 'IS_SUCCESSFUL']].groupby('APPLICATION_TYPE').mean().sort_values('IS_SUCCESSFUL')
g3roupby_classification

Unnamed: 0_level_0,IS_SUCCESSFUL
APPLICATION_TYPE,Unnamed: 1_level_1
T19,0.188732
T4,0.305447
T8,0.421981
T3,0.53216
T2,0.5625
T7,0.565517
T25,0.666667
T9,0.717949
T6,0.740132
T5,0.767263


In [66]:
bucket_APP_TYP_1 = [g3roupby_classification[g3roupby_classification['IS_SUCCESSFUL'] == 1 ].index]
bucket_APP_TYP_1

[Index(['T15', 'T29', 'T14', 'T17'], dtype='object', name='APPLICATION_TYPE')]

In [67]:
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
# Replace in dataframe
for cls in bucket_APP_TYP_1:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(cls,"bucket_APP_TYP_1")

# Check to make sure binning was successful
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


14

In [68]:
bucket_APP_TYP_0_88 = [g3roupby_classification[(g3roupby_classification['IS_SUCCESSFUL'] > 0.87) & (g3roupby_classification['IS_SUCCESSFUL'] < 0.89)].index]
bucket_APP_TYP_0_88

[Index(['T10', 'T12'], dtype='object', name='APPLICATION_TYPE')]

In [69]:
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
# Replace in dataframe
for cls in bucket_APP_TYP_0_88:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(cls,"bucket_APP_TYP_0_88")

# Check to make sure binning was successful
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3                  27037
T4                   1542
T6                   1216
T5                   1173
T19                  1065
T8                    737
T7                    725
T10                   528
T9                    156
T13                    66
T12                    27
T2                     16
bucket_APP_TYP_1        8
T25                     3
Name: APPLICATION_TYPE, dtype: int64


13

In [70]:
bucket_APP_TYP_0_57 = [g3roupby_classification[(g3roupby_classification['IS_SUCCESSFUL'] > 0.56) & (g3roupby_classification['IS_SUCCESSFUL'] < 0.57)].index]
bucket_APP_TYP_0_57

[Index(['T2', 'T7'], dtype='object', name='APPLICATION_TYPE')]

In [71]:
print(a2pplication_df['APPLICATION_TYPE'].value_counts())
# Replace in dataframe
for cls in bucket_APP_TYP_0_57:
    a2pplication_df['APPLICATION_TYPE'] = a2pplication_df['APPLICATION_TYPE'].replace(cls,"bucket_APP_TYP_0_57")

# Check to make sure binning was successful
a2pplication_df['APPLICATION_TYPE'].value_counts().count()

T3                     27037
T4                      1542
T6                      1216
T5                      1173
T19                     1065
T8                       737
T7                       725
bucket_APP_TYP_0_88      555
T9                       156
T13                       66
T2                        16
bucket_APP_TYP_1           8
T25                        3
Name: APPLICATION_TYPE, dtype: int64


12

In [43]:

# a1pplication_df
start = 21
end = 22
step = 1
print(len(a1pplication_df.columns))
optimization2('add buckets for classification', a2pplication_df, ['CLASSIFICATION'], start, end, step, True)

Trial 24 Complete [00h 00m 07s]
val_accuracy: 0.7351545691490173

Best val_accuracy So Far: 0.7381044030189514
Total elapsed time: 00h 02m 35s

Search: Running Trial #25

Value             |Best Value So Far |Hyperparameter
4                 |4                 |num_layers
tanh              |tanh              |activation_layer_input
67                |102               |neurons_layer_input
leaky_relu        |relu              |activation_layer_1
95                |92                |neurons_layer_1
leaky_relu        |leaky_relu        |activation_layer_2
134               |29                |neurons_layer_2
relu              |relu              |activation_layer_3
79                |52                |neurons_layer_3
relu              |leaky_relu        |activation_layer_4
83                |7                 |neurons_layer_4
leaky_relu        |None              |activation_layer_5
54                |None              |neurons_layer_5
20                |7                 |tuner/epochs
7 

KeyboardInterrupt: 

In [None]:

# a1pplication_df
start = 21
end = 22
step = 1
print(len(a1pplication_df.columns))
optimization2('increase number of epochs from 20 to 100, increase maximum number of layers from 6 to 11 (includes input layer but not output)', a2pplication_df, ['CLASSIFICATION'], start, end, step, True)

In [72]:
start_threshold = 150
end_threshold = 151
max_epoch = 30
df = a1pplication_df.copy()
run_hyperparameters('try hyperparameters across thresholds 7 to 11 with max_epoch 30 and 11 possible layers', df, start_threshold, end_threshold, '', max_epoch)

Trial 180 Complete [00h 00m 33s]
val_accuracy: 0.7326096892356873

Best val_accuracy So Far: 0.7345938086509705
Total elapsed time: 00h 29m 08s
268/268 - 0s - loss: 0.5504 - accuracy: 0.7346 - 343ms/epoch - 1ms/step
[{'Optimization Title': 'try hyperparameters across thresholds 7 to 11 with max_epoch 30 and 11 possible layers', 'columns': Index(['STATUS', 'ASK_AMT', 'IS_SUCCESSFUL', 'APPLICATION_TYPE_T10',
       'APPLICATION_TYPE_T12', 'APPLICATION_TYPE_T13', 'APPLICATION_TYPE_T14',
       'APPLICATION_TYPE_T15', 'APPLICATION_TYPE_T17', 'APPLICATION_TYPE_T19',
       ...
       'INCOME_AMT_1-9999', 'INCOME_AMT_10000-24999',
       'INCOME_AMT_100000-499999', 'INCOME_AMT_10M-50M', 'INCOME_AMT_1M-5M',
       'INCOME_AMT_25000-99999', 'INCOME_AMT_50M+', 'INCOME_AMT_5M-10M',
       'SPECIAL_CONSIDERATIONS_N', 'SPECIAL_CONSIDERATIONS_Y'],
      dtype='object', length=117), 'removed column': '', 'z_threshold': 150, 'number of outlier rows': 27, 'number of rows': 34272, 'max_epoch': 30, 'par

In [84]:
start_threshold = 8
end_threshold = 10
max_epoch = 20
df = a1pplication_df.copy()
run_hyperparameters('try hyperparameters across thresholds 7 to 11 with max_epoch 30 and 11 possible layers', df, start_threshold, end_threshold, '', max_epoch)

Trial 60 Complete [00h 00m 26s]
val_accuracy: 0.7238664031028748

Best val_accuracy So Far: 0.7260256409645081
Total elapsed time: 00h 11m 07s
247/247 - 0s - loss: 0.5600 - accuracy: 0.7260 - 322ms/epoch - 1ms/step
[{'Optimization Title': 'try hyperparameters across thresholds 7 to 11 with max_epoch 30 and 11 possible layers', 'columns': Index(['STATUS', 'ASK_AMT', 'IS_SUCCESSFUL', 'APPLICATION_TYPE_T10',
       'APPLICATION_TYPE_T12', 'APPLICATION_TYPE_T13', 'APPLICATION_TYPE_T14',
       'APPLICATION_TYPE_T15', 'APPLICATION_TYPE_T17', 'APPLICATION_TYPE_T19',
       ...
       'INCOME_AMT_1-9999', 'INCOME_AMT_10000-24999',
       'INCOME_AMT_100000-499999', 'INCOME_AMT_10M-50M', 'INCOME_AMT_1M-5M',
       'INCOME_AMT_25000-99999', 'INCOME_AMT_50M+', 'INCOME_AMT_5M-10M',
       'SPECIAL_CONSIDERATIONS_N', 'SPECIAL_CONSIDERATIONS_Y'],
      dtype='object', length=117), 'removed column': '', 'z_threshold': 8, 'number of outlier rows': 3114, 'number of rows': 31185, 'max_epoch': 20, 'para

# Reproduce Results of Model

In [None]:
a1pplication_df.dtypes

APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                object
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [None]:
a2pplication_df = a1pplication_df.copy()
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('1-9999',"5000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('10000-24999',"17500")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('100000-499999',"300000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('10M-50M',"30000000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('1M-5M',"3000000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('25000-99999',"62500")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('50M+',"50000000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('5M-10M',"7500000")
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].replace('50M+',"50000000")
# a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].
a2pplication_df['INCOME_AMT'] = a2pplication_df['INCOME_AMT'].astype('int64')
a2pplication_df.dtypes

APPLICATION_TYPE          object
AFFILIATION               object
CLASSIFICATION            object
USE_CASE                  object
ORGANIZATION              object
STATUS                     int64
INCOME_AMT                 int64
SPECIAL_CONSIDERATIONS    object
ASK_AMT                    int64
IS_SUCCESSFUL              int64
dtype: object

In [None]:
len(df[column].value_counts())-1

In [None]:
optimization2('check if otherizing APPLICATION_TYPE increases accuracy', a1pplication_df, ['APPLICATION_TYPE', -5])

TypeError: optimization2() missing 1 required positional argument: 'step'

In [None]:
print(str(tf.random.get_seed()))

AttributeError: module 'tensorflow._api.v2.random' has no attribute 'get_seed'

In [None]:
import tensorflow as tf

# Custom TensorFlow operation to capture random seed
@tf.function
def get_random_seed():
    seed = tf.random.uniform(shape=(), maxval=2**31 - 1, dtype=tf.int32)
    return seed

# Define a custom callback to log the random seed during training
class RandomSeedLogger(tf.keras.callbacks.Callback):
    def on_train_begin(self, logs=None):
        random_seed = self.model.get_layer(index=0).call(get_random_seed)
        print(f"Random Seed: {random_seed.numpy()}")

# Create an instance of the custom callback
random_seed_logger = RandomSeedLogger()
get_random_seed()

<tf.Tensor: shape=(), dtype=int32, numpy=454295297>

In [None]:
def print_all(datalist, list_of_keys):
    for row in datalist:
        for key in list_of_keys:
            print(f'{key}: {row[key]}')

In [None]:
print(len(final_results_all))
print(final_results_all[0].keys())
print_all(final_results_all, ['z_threshold', 'model_accuracy', 'param'])
# final_results_all


3
dict_keys(['columns', 'removed column', 'z_threshold', 'outlier rows', 'number of rows', 'param', 'model_accuracy'])
z_threshold: 10
model_accuracy: 0.7370668053627014
param: {'activation': 'tanh', 'first_units': 186, 'num_layers': 1, 'units_0': 172, 'units_1': 208, 'units_2': 169, 'units_3': 29, 'units_4': 7, 'tuner/epochs': 20, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
z_threshold: 10
model_accuracy: 0.7361878156661987
param: {'activation': 'relu', 'first_units': 166, 'num_layers': 4, 'units_0': 139, 'units_1': 216, 'units_2': 203, 'units_3': 214, 'units_4': 131, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0023'}
z_threshold: 10
model_accuracy: 0.7296745777130127
param: {'activation': 'relu', 'first_units': 83, 'num_layers': 4, 'units_0': 86, 'units_1': 49, 'units_2': 72, 'units_3': 33, 'units_4': 18, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': 

In [None]:
print(len(final_results_all))
print(final_results_all[0].keys())
print_all(final_results_all, ['z_threshold', 'model_accuracy', 'param'])
final_results_all


2
dict_keys(['columns', 'removed column', 'z_threshold', 'outlier rows', 'number of rows', 'param', 'model_accuracy'])
z_threshold: 9
model_accuracy: 0.7260256409645081
param: {'activation': 'leaky_relu', 'first_units': 126, 'num_layers': 3, 'units_0': 185, 'units_1': 129, 'units_2': 220, 'units_3': 58, 'units_4': 189, 'tuner/epochs': 3, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}
z_threshold: 10
model_accuracy: 0.7383224368095398
param: {'activation': 'relu', 'first_units': 126, 'num_layers': 4, 'units_0': 62, 'units_1': 134, 'units_2': 89, 'units_3': 7, 'units_4': 135, 'tuner/epochs': 20, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


[{'columns': Index(['STATUS', 'ASK_AMT', 'IS_SUCCESSFUL', 'APPLICATION_TYPE_T10',
         'APPLICATION_TYPE_T12', 'APPLICATION_TYPE_T13', 'APPLICATION_TYPE_T14',
         'APPLICATION_TYPE_T15', 'APPLICATION_TYPE_T17', 'APPLICATION_TYPE_T19',
         ...
         'INCOME_AMT_1-9999', 'INCOME_AMT_10000-24999',
         'INCOME_AMT_100000-499999', 'INCOME_AMT_10M-50M', 'INCOME_AMT_1M-5M',
         'INCOME_AMT_25000-99999', 'INCOME_AMT_50M+', 'INCOME_AMT_5M-10M',
         'SPECIAL_CONSIDERATIONS_N', 'SPECIAL_CONSIDERATIONS_Y'],
        dtype='object', length=117),
  'removed column': '',
  'z_threshold': 9,
  'outlier rows':        STATUS    ASK_AMT  IS_SUCCESSFUL  APPLICATION_TYPE_T10  \
  4           1     142590              1                     0   
  7           1    7508025              1                     0   
  13          1       5301              1                     0   
  17          1   86380556              0                     0   
  41          1    1329410         

In [None]:
X.shape[1]

116

In [None]:
replicated_model = create_specific_model('leaky_relu', 1, 58, [5], X.shape[1], 'sigmoid')#create_specific_model(activation, num_layers, num_nodes_input_layer, list_of_number_of_nodes_hidden_layer, input_dim, output_activation):
replicated_model

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 58)                2552      
                                                                 
 dense_6 (Dense)             (None, 5)                 295       
                                                                 
 dense_7 (Dense)             (None, 1)                 6         
                                                                 
Total params: 2853 (11.14 KB)
Trainable params: 2853 (11.14 KB)
Non-trainable params: 0 (0.00 Byte)


_________________________________________________________________
None


<keras.src.engine.sequential.Sequential at 0x20355a06cb0>

In [None]:
# df_new_removed_column = a1pplication_df

# df_new_removed_column = pd.get_dummies(df_new_removed_column)

# outlier_rows, no_outliers_df = remove_outliers(df_new_removed_column, 10)

# # Split our preprocessed data into our features and target array
# X, y = extract_X_y(no_outliers_df, 'IS_SUCCESSFUL')

# print(X.shape)

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X,
#                                                     y,
#                                                     random_state=3,
#                                                     stratify=y)

# # Standardize X_train and X_test
# X_train_scaled, X_test_scaled = scale_it(X_train, X_test)

# Compile the model
replicated_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

fit_model = replicated_model.fit(X_train_scaled, y_train, epochs=7)#, callbacks = [checkpoint_callback])

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
