In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
import os
from tensorflow.keras.callbacks import ModelCheckpoint

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("./Resources/charity_data.csv")
application_df.tail()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1
34298,996086871,WATERHOUSE CHARITABLE TR,T3,Independent,C1000,Preservation,Co-operative,1,1M-5M,N,36500179,0


In [2]:
# Drop EIN and NAME
application_df = application_df.drop(columns=["EIN", "NAME"])


In [3]:
# Determine the number of unique values in each column
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()
application_df[application_cat].nunique()

APPLICATION_TYPE          17
AFFILIATION                6
CLASSIFICATION            71
USE_CASE                   5
ORGANIZATION               4
INCOME_AMT                 9
SPECIAL_CONSIDERATIONS     2
dtype: int64

In [4]:
# Bin application types
application_counts = application_df.APPLICATION_TYPE.value_counts()
replace_application = list(application_counts[application_counts < 200].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app, "Other")

# Ensure binning successful
application_df.APPLICATION_TYPE.value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [5]:
# Bin classification types
classification_counts = application_df.CLASSIFICATION.value_counts()
replace_class = list(classification_counts[classification_counts < 1000].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls, "Other")
    
# Ensure binning successful
application_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [6]:
# Generate a categorical variable list
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()


In [7]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names tot he dataframe
encode_df.columns = enc.get_feature_names(application_cat)


In [8]:
# Merge OneHotEncoder features and drop the originals
application_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df = application_df.drop(application_cat, 1)

In [9]:
# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(["IS_SUCCESSFUL"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [10]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### First Attempt - changed layer 1 and 2 activation, increased the nodes in layer 2 - decrease accuracy

In [11]:
# Define the model 
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 40

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

                             

In [12]:
# Compile the nodel
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics="accuracy")

In [13]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints_opt/", exist_ok=True)
checkpoint_path = "checkpoints_opt/weights.{epoch:02d}.hdf5"

# Create a callback that saves the models weights every 5 epocks
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_frequ=100)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

Epoch 1/100

Epoch 00001: saving model to checkpoints_opt\weights.01.hdf5
Epoch 2/100

Epoch 00002: saving model to checkpoints_opt\weights.02.hdf5
Epoch 3/100

Epoch 00003: saving model to checkpoints_opt\weights.03.hdf5
Epoch 4/100

Epoch 00004: saving model to checkpoints_opt\weights.04.hdf5
Epoch 5/100

Epoch 00005: saving model to checkpoints_opt\weights.05.hdf5
Epoch 6/100

Epoch 00006: saving model to checkpoints_opt\weights.06.hdf5
Epoch 7/100

Epoch 00007: saving model to checkpoints_opt\weights.07.hdf5
Epoch 8/100

Epoch 00008: saving model to checkpoints_opt\weights.08.hdf5
Epoch 9/100

Epoch 00009: saving model to checkpoints_opt\weights.09.hdf5
Epoch 10/100

Epoch 00010: saving model to checkpoints_opt\weights.10.hdf5
Epoch 11/100

Epoch 00011: saving model to checkpoints_opt\weights.11.hdf5
Epoch 12/100

Epoch 00012: saving model to checkpoints_opt\weights.12.hdf5
Epoch 13/100

Epoch 00013: saving model to checkpoints_opt\weights.13.hdf5
Epoch 14/100

Epoch 00014: saving 


Epoch 00050: saving model to checkpoints_opt\weights.50.hdf5
Epoch 51/100

Epoch 00051: saving model to checkpoints_opt\weights.51.hdf5
Epoch 52/100

Epoch 00052: saving model to checkpoints_opt\weights.52.hdf5
Epoch 53/100

Epoch 00053: saving model to checkpoints_opt\weights.53.hdf5
Epoch 54/100

Epoch 00054: saving model to checkpoints_opt\weights.54.hdf5
Epoch 55/100

Epoch 00055: saving model to checkpoints_opt\weights.55.hdf5
Epoch 56/100

Epoch 00056: saving model to checkpoints_opt\weights.56.hdf5
Epoch 57/100

Epoch 00057: saving model to checkpoints_opt\weights.57.hdf5
Epoch 58/100

Epoch 00058: saving model to checkpoints_opt\weights.58.hdf5
Epoch 59/100

Epoch 00059: saving model to checkpoints_opt\weights.59.hdf5
Epoch 60/100

Epoch 00060: saving model to checkpoints_opt\weights.60.hdf5
Epoch 61/100

Epoch 00061: saving model to checkpoints_opt\weights.61.hdf5
Epoch 62/100

Epoch 00062: saving model to checkpoints_opt\weights.62.hdf5
Epoch 63/100

Epoch 00063: saving mode


Epoch 00099: saving model to checkpoints_opt\weights.99.hdf5
Epoch 100/100

Epoch 00100: saving model to checkpoints_opt\weights.100.hdf5


In [14]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5551 - accuracy: 0.7258
Loss: 0.5551105737686157, Accuracy: 0.7258309125900269


In [15]:
# Export the model to HDF5 file
nn.save("AlphabetSoupCharityOpt1.h5")

### Second Attempt - added additional layer, increased nodes in layer 2, and changed activation to sigmoid for all layers - Decreased accuracy

In [16]:
# Define the model 
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 40
hidden_nodes_layer3 = 20


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


In [17]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics="accuracy")

In [18]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints_opt/", exist_ok=True)
checkpoint_path = "checkpoints_opt/weights.{epoch:02d}.hdf5"

# Create a callback that saves the models weights every 5 epocks
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_frequ=100)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100, callbacks=[cp_callback])

Epoch 1/100

Epoch 00001: saving model to checkpoints_opt\weights.01.hdf5
Epoch 2/100

Epoch 00002: saving model to checkpoints_opt\weights.02.hdf5
Epoch 3/100

Epoch 00003: saving model to checkpoints_opt\weights.03.hdf5
Epoch 4/100

Epoch 00004: saving model to checkpoints_opt\weights.04.hdf5
Epoch 5/100

Epoch 00005: saving model to checkpoints_opt\weights.05.hdf5
Epoch 6/100

Epoch 00006: saving model to checkpoints_opt\weights.06.hdf5
Epoch 7/100

Epoch 00007: saving model to checkpoints_opt\weights.07.hdf5
Epoch 8/100

Epoch 00008: saving model to checkpoints_opt\weights.08.hdf5
Epoch 9/100

Epoch 00009: saving model to checkpoints_opt\weights.09.hdf5
Epoch 10/100

Epoch 00010: saving model to checkpoints_opt\weights.10.hdf5
Epoch 11/100

Epoch 00011: saving model to checkpoints_opt\weights.11.hdf5
Epoch 12/100

Epoch 00012: saving model to checkpoints_opt\weights.12.hdf5
Epoch 13/100

Epoch 00013: saving model to checkpoints_opt\weights.13.hdf5
Epoch 14/100

Epoch 00014: saving 


Epoch 00051: saving model to checkpoints_opt\weights.51.hdf5
Epoch 52/100

Epoch 00052: saving model to checkpoints_opt\weights.52.hdf5
Epoch 53/100

Epoch 00053: saving model to checkpoints_opt\weights.53.hdf5
Epoch 54/100

Epoch 00054: saving model to checkpoints_opt\weights.54.hdf5
Epoch 55/100

Epoch 00055: saving model to checkpoints_opt\weights.55.hdf5
Epoch 56/100

Epoch 00056: saving model to checkpoints_opt\weights.56.hdf5
Epoch 57/100

Epoch 00057: saving model to checkpoints_opt\weights.57.hdf5
Epoch 58/100

Epoch 00058: saving model to checkpoints_opt\weights.58.hdf5
Epoch 59/100

Epoch 00059: saving model to checkpoints_opt\weights.59.hdf5
Epoch 60/100

Epoch 00060: saving model to checkpoints_opt\weights.60.hdf5
Epoch 61/100

Epoch 00061: saving model to checkpoints_opt\weights.61.hdf5
Epoch 62/100

Epoch 00062: saving model to checkpoints_opt\weights.62.hdf5
Epoch 63/100

Epoch 00063: saving model to checkpoints_opt\weights.63.hdf5
Epoch 64/100

Epoch 00064: saving mode

In [19]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5530 - accuracy: 0.7282
Loss: 0.5530101656913757, Accuracy: 0.7281632423400879


In [20]:
# Export the model to HDF5 file
nn.save("AlphabetSoupCharityOpt2.h5")

### Third Attempt - add additional layer, changed optimizer, removed Status column - Decreased accuracy

In [21]:
#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("./Resources/charity_data.csv")
application_df.tail()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1
34298,996086871,WATERHOUSE CHARITABLE TR,T3,Independent,C1000,Preservation,Co-operative,1,1M-5M,N,36500179,0


In [22]:
# Drop EIN and NAME and STATUS
application_df = application_df.drop(columns=["EIN", "NAME", "STATUS"])

In [23]:
# Determine the number of unique values in each column
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()
application_df[application_cat].nunique()

APPLICATION_TYPE          17
AFFILIATION                6
CLASSIFICATION            71
USE_CASE                   5
ORGANIZATION               4
INCOME_AMT                 9
SPECIAL_CONSIDERATIONS     2
dtype: int64

In [24]:
# Bin application types
application_counts = application_df.APPLICATION_TYPE.value_counts()
replace_application = list(application_counts[application_counts < 200].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app, "Other")

# Ensure binning successful
application_df.APPLICATION_TYPE.value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [25]:
# Bin classification types
classification_counts = application_df.CLASSIFICATION.value_counts()
replace_class = list(classification_counts[classification_counts < 1000].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls, "Other")
    
# Ensure binning successful
application_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [26]:
# Generate a categorical variable list
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()


In [27]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names tot he dataframe
encode_df.columns = enc.get_feature_names(application_cat)


In [28]:
# Merge OneHotEncoder features and drop the originals
application_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df = application_df.drop(application_cat, 1)

In [29]:
# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(["IS_SUCCESSFUL"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [30]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [31]:
# Define the model 
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 90
hidden_nodes_layer2 = 70
hidden_nodes_layer3 = 30


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


In [32]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics="accuracy")

In [33]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints_opt/", exist_ok=True)
checkpoint_path = "checkpoints_opt/weights.{epoch:02d}.hdf5"

# Create a callback that saves the models weights every 5 epocks
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_frequ=100)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=70, callbacks=[cp_callback])

Epoch 1/70

Epoch 00001: saving model to checkpoints_opt\weights.01.hdf5
Epoch 2/70

Epoch 00002: saving model to checkpoints_opt\weights.02.hdf5
Epoch 3/70

Epoch 00003: saving model to checkpoints_opt\weights.03.hdf5
Epoch 4/70

Epoch 00004: saving model to checkpoints_opt\weights.04.hdf5
Epoch 5/70

Epoch 00005: saving model to checkpoints_opt\weights.05.hdf5
Epoch 6/70

Epoch 00006: saving model to checkpoints_opt\weights.06.hdf5
Epoch 7/70

Epoch 00007: saving model to checkpoints_opt\weights.07.hdf5
Epoch 8/70

Epoch 00008: saving model to checkpoints_opt\weights.08.hdf5
Epoch 9/70

Epoch 00009: saving model to checkpoints_opt\weights.09.hdf5
Epoch 10/70

Epoch 00010: saving model to checkpoints_opt\weights.10.hdf5
Epoch 11/70

Epoch 00011: saving model to checkpoints_opt\weights.11.hdf5
Epoch 12/70

Epoch 00012: saving model to checkpoints_opt\weights.12.hdf5
Epoch 13/70

Epoch 00013: saving model to checkpoints_opt\weights.13.hdf5
Epoch 14/70

Epoch 00014: saving model to check


Epoch 00051: saving model to checkpoints_opt\weights.51.hdf5
Epoch 52/70

Epoch 00052: saving model to checkpoints_opt\weights.52.hdf5
Epoch 53/70

Epoch 00053: saving model to checkpoints_opt\weights.53.hdf5
Epoch 54/70

Epoch 00054: saving model to checkpoints_opt\weights.54.hdf5
Epoch 55/70

Epoch 00055: saving model to checkpoints_opt\weights.55.hdf5
Epoch 56/70

Epoch 00056: saving model to checkpoints_opt\weights.56.hdf5
Epoch 57/70

Epoch 00057: saving model to checkpoints_opt\weights.57.hdf5
Epoch 58/70

Epoch 00058: saving model to checkpoints_opt\weights.58.hdf5
Epoch 59/70

Epoch 00059: saving model to checkpoints_opt\weights.59.hdf5
Epoch 60/70

Epoch 00060: saving model to checkpoints_opt\weights.60.hdf5
Epoch 61/70

Epoch 00061: saving model to checkpoints_opt\weights.61.hdf5
Epoch 62/70

Epoch 00062: saving model to checkpoints_opt\weights.62.hdf5
Epoch 63/70

Epoch 00063: saving model to checkpoints_opt\weights.63.hdf5
Epoch 64/70

Epoch 00064: saving model to checkpoi

In [34]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5550 - accuracy: 0.7264
Loss: 0.5549871921539307, Accuracy: 0.7264139652252197


In [35]:
# Export the model to HDF5 file
nn.save("AlphabetSoupCharityOpt3.h5")

### Fourth Attempt - removed status, binned name, changed activation - achieved greater than 75%

In [36]:
#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("./Resources/charity_data.csv")
application_df.tail()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
34294,996009318,THE LIONS CLUB OF HONOLULU KAMEHAMEHA,T4,Independent,C1000,ProductDev,Association,1,0,N,5000,0
34295,996010315,INTERNATIONAL ASSOCIATION OF LIONS CLUBS,T4,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
34296,996012607,PTA HAWAII CONGRESS,T3,CompanySponsored,C2000,Preservation,Association,1,0,N,5000,0
34297,996015768,AMERICAN FEDERATION OF GOVERNMENT EMPLOYEES LO...,T5,Independent,C3000,ProductDev,Association,1,0,N,5000,1
34298,996086871,WATERHOUSE CHARITABLE TR,T3,Independent,C1000,Preservation,Co-operative,1,1M-5M,N,36500179,0


In [37]:
# Drop EIN, NAME, STATUS and ASK AMT 
application_df = application_df.drop(columns=["EIN", "STATUS"])

In [38]:
# Determine the number of unique values in each column
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()
application_df[application_cat].nunique()

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
dtype: int64

In [39]:
# Bin application types
application_counts = application_df.APPLICATION_TYPE.value_counts()
replace_application = list(application_counts[application_counts < 200].index)
for app in replace_application:
    application_df.APPLICATION_TYPE = application_df.APPLICATION_TYPE.replace(app, "Other")

# Ensure binning successful
application_df.APPLICATION_TYPE.value_counts()

T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
Other      276
Name: APPLICATION_TYPE, dtype: int64

In [40]:
# Bin classification types
classification_counts = application_df.CLASSIFICATION.value_counts()
replace_class = list(classification_counts[classification_counts < 1000].index)
for cls in replace_class:
    application_df.CLASSIFICATION = application_df.CLASSIFICATION.replace(cls, "Other")
    
# Ensure binning successful
application_df.CLASSIFICATION.value_counts()

C1000    17326
C2000     6074
C1200     4837
Other     2261
C3000     1918
C2100     1883
Name: CLASSIFICATION, dtype: int64

In [41]:
# Bin names
name_counts = application_df.NAME.value_counts()
replace_name = list(name_counts[name_counts < 5].index)
for name in replace_name:
    application_df.NAME = application_df.NAME.replace(name, "Other")

# Ensure binning successful
application_df.NAME.value_counts()

Other                                                          19803
PARENT BOOSTER USA INC                                          1260
TOPS CLUB INC                                                    765
UNITED STATES BOWLING CONGRESS INC                               700
WASHINGTON STATE UNIVERSITY                                      492
                                                               ...  
DEPARTMENT OF NEVADA VETERANS OF FOREIGN WARS OF THE US INC        5
NATIONAL ORGANIZATION FOR WOMEN INC                                5
VETERANS OF FOREIGN WARS OF THE US DEPT OF TEXAS AUXILIARY         5
HIGH TWELVE INTERNATIONAL                                          5
INTERNATIONAL ASSOCIATION OF FORENSIC NURSES                       5
Name: NAME, Length: 403, dtype: int64

In [42]:
# Generate a categorical variable list
application_cat = application_df.dtypes[application_df.dtypes == "object"].index.tolist()


In [43]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(application_df[application_cat]))

# Add the encoded variable names tot he dataframe
encode_df.columns = enc.get_feature_names(application_cat)


In [44]:
# Merge OneHotEncoder features and drop the originals
application_df = application_df.merge(encode_df, left_index=True, right_index=True)
application_df = application_df.drop(application_cat, 1)
application_df

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_ACTS MINISTRY,NAME_ACTS MISSIONS,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,NAME_ALABAMA FEDERATION OF WOMENS CLUBS,NAME_ALABAMA TREASURE FOREST ASSOCIATION,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,5000,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,108590,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,6692,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,142590,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34294,5000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34295,5000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34296,5000,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
34297,5000,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [45]:
# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
X = application_df.drop(["IS_SUCCESSFUL"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [46]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [47]:
# Define the model 
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 90
hidden_nodes_layer2 = 40



nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))


In [48]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics="accuracy")

In [49]:
# Define the checkpoint path and filenames
os.makedirs("checkpoints_opt/", exist_ok=True)
checkpoint_path = "checkpoints_opt/weights.{epoch:02d}.hdf5"

# Create a callback that saves the models weights every 5 epocks
cp_callback = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_weights_only=True, save_frequ=100)

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=70, callbacks=[cp_callback])

Epoch 1/70

Epoch 00001: saving model to checkpoints_opt\weights.01.hdf5
Epoch 2/70

Epoch 00002: saving model to checkpoints_opt\weights.02.hdf5
Epoch 3/70

Epoch 00003: saving model to checkpoints_opt\weights.03.hdf5
Epoch 4/70

Epoch 00004: saving model to checkpoints_opt\weights.04.hdf5
Epoch 5/70

Epoch 00005: saving model to checkpoints_opt\weights.05.hdf5
Epoch 6/70

Epoch 00006: saving model to checkpoints_opt\weights.06.hdf5
Epoch 7/70

Epoch 00007: saving model to checkpoints_opt\weights.07.hdf5
Epoch 8/70

Epoch 00008: saving model to checkpoints_opt\weights.08.hdf5
Epoch 9/70

Epoch 00009: saving model to checkpoints_opt\weights.09.hdf5
Epoch 10/70

Epoch 00010: saving model to checkpoints_opt\weights.10.hdf5
Epoch 11/70

Epoch 00011: saving model to checkpoints_opt\weights.11.hdf5
Epoch 12/70

Epoch 00012: saving model to checkpoints_opt\weights.12.hdf5
Epoch 13/70

Epoch 00013: saving model to checkpoints_opt\weights.13.hdf5
Epoch 14/70

Epoch 00014: saving model to check


Epoch 00051: saving model to checkpoints_opt\weights.51.hdf5
Epoch 52/70

Epoch 00052: saving model to checkpoints_opt\weights.52.hdf5
Epoch 53/70

Epoch 00053: saving model to checkpoints_opt\weights.53.hdf5
Epoch 54/70

Epoch 00054: saving model to checkpoints_opt\weights.54.hdf5
Epoch 55/70

Epoch 00055: saving model to checkpoints_opt\weights.55.hdf5
Epoch 56/70

Epoch 00056: saving model to checkpoints_opt\weights.56.hdf5
Epoch 57/70

Epoch 00057: saving model to checkpoints_opt\weights.57.hdf5
Epoch 58/70

Epoch 00058: saving model to checkpoints_opt\weights.58.hdf5
Epoch 59/70

Epoch 00059: saving model to checkpoints_opt\weights.59.hdf5
Epoch 60/70

Epoch 00060: saving model to checkpoints_opt\weights.60.hdf5
Epoch 61/70

Epoch 00061: saving model to checkpoints_opt\weights.61.hdf5
Epoch 62/70

Epoch 00062: saving model to checkpoints_opt\weights.62.hdf5
Epoch 63/70

Epoch 00063: saving model to checkpoints_opt\weights.63.hdf5
Epoch 64/70

Epoch 00064: saving model to checkpoi

In [50]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.4475 - accuracy: 0.7916
Loss: 0.4474904537200928, Accuracy: 0.7916035056114197


In [51]:
# Export the model to HDF5 file
nn.save("AlphabetSoupCharityOpt4.h5")