In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import and read the charity_data.csv.
from google.colab import files
uploaded = files.upload()

Saving charity_data.csv to charity_data.csv


In [97]:
application_df = pd.read_csv('charity_data.csv')
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN','NAME','SPECIAL_CONSIDERATIONS'])

In [98]:
# Determine the number of unique values in each column.
application_df.nunique()

APPLICATION_TYPE      17
AFFILIATION            6
CLASSIFICATION        71
USE_CASE               5
ORGANIZATION           4
STATUS                 2
INCOME_AMT             9
ASK_AMT             8747
IS_SUCCESSFUL          2
dtype: int64

In [99]:
print(application_df['ASK_AMT'].loc[(application_df['ASK_AMT'] <= 5000)].count())
print(application_df['ASK_AMT'].loc[(application_df['ASK_AMT'] < 1000000) & (application_df['ASK_AMT'] > 5000)].count())
print(application_df['ASK_AMT'].loc[(application_df['ASK_AMT'] >= 1000000)].count())

25398
7324
1577


In [100]:
# BINNING

# Look at APPLICATION_TYPE value counts for binning (changed from 500 to 1000 for optimization run 5)
app_types = application_df['APPLICATION_TYPE'].value_counts()
application_types_to_replace = []
for index, value in app_types.items():
  if value < 1000:
    application_types_to_replace.append(index)
    print(f"Index : {index}, Value : {value}")
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Look at CLASSIFICATION value counts for binning
# binning everything with less than 100 rows (changed to <1000 for optimization run 5)
class_types = application_df['CLASSIFICATION'].value_counts()
classifications_to_replace = []
for index, value in class_types.items():
  if value < 1000:
    classifications_to_replace.append(index)
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Additional bins beyond what was done in first attempt
# ASK_AMT min = 5000 max = 8,597,806,340 median = 5000
ask_amts = application_df['ASK_AMT'].value_counts()
amts_to_replace_5000 = []
amts_to_replace_5000_1000000 = []
amts_to_replace_1000000 = []
for index, value in ask_amts.items():
  if value <= 5000:
    amts_to_replace_5000.append(index)
  elif value < 1000000:
    amts_to_replace_5000_1000000.append(index)
  else:
    amts_to_replace_1000000.append(index)
for amt in amts_to_replace_5000:
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(amt,"<=5000")
for amt in amts_to_replace_5000_1000000:
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(amt,"5000-1000000")
for amt in amts_to_replace_1000000:
    application_df['ASK_AMT'] = application_df['ASK_AMT'].replace(amt,">1000000")

Index : T8, Value : 737
Index : T7, Value : 725
Index : T10, Value : 528
Index : T9, Value : 156
Index : T13, Value : 66
Index : T12, Value : 27
Index : T2, Value : 16
Index : T25, Value : 3
Index : T14, Value : 3
Index : T29, Value : 2
Index : T15, Value : 2
Index : T17, Value : 1


In [101]:
# Convert categorical data to numeric with `pd.get_dummies`
types = application_df.dtypes
for index, value in types.items():
  if value == 'object':
    print(index)
    ohe = pd.get_dummies(application_df[index])
    application_df = pd.merge(application_df, ohe, left_index=True, right_index=True)
    application_df = application_df.drop(columns=[index])

APPLICATION_TYPE
AFFILIATION
CLASSIFICATION
USE_CASE
ORGANIZATION
INCOME_AMT
ASK_AMT


  import sys


In [102]:
application_df.head()

Unnamed: 0,STATUS,IS_SUCCESSFUL,Other_x,T19,T3,T4,T5,T6,CompanySponsored,Family/Parent,...,1-9999,10000-24999,100000-499999,10M-50M,1M-5M,25000-99999,50M+,5M-10M,5000-1000000,<=5000
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,1,1,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,1,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [103]:
# Split our preprocessed data into our features and target arrays
target = application_df['IS_SUCCESSFUL']
data = application_df.drop(columns=['IS_SUCCESSFUL'])
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(data, target)

In [104]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [105]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
inputs = X_train_scaled.shape[1]
inputs

39

In [106]:
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=200, activation="relu",input_dim=inputs))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation="relu"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=150, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_13 (Dense)            (None, 200)               8000      
                                                                 
 dense_14 (Dense)            (None, 150)               30150     
                                                                 
 dense_15 (Dense)            (None, 150)               22650     
                                                                 
 dense_16 (Dense)            (None, 1)                 151       
                                                                 
Total params: 60,951
Trainable params: 60,951
Non-trainable params: 0
_________________________________________________________________


In [107]:
# Compile the model
nn.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])

In [108]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [109]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - loss: 0.9761 - accuracy: 0.7290 - 584ms/epoch - 2ms/step
Loss: 0.9760890603065491, Accuracy: 0.7289795875549316


In [110]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity_Optimization.h5")

# Optimization Summary

Notes and accuracy measurements for each attempt at optimizing the model. Note that these are additive unless otherwise stated.  

---

First attempt after adding binning for ASK_AMT:  
268/268 - 1s - loss: 1.2309 - accuracy: 0.7269 - 565ms/epoch - 2ms/step  
Loss: 1.2309399843215942, Accuracy: 0.7268804907798767  


---


Second attempt, removing "SPECIAL_CONSIDERATIONS" column:  
268/268 - 0s - loss: 0.5703 - accuracy: 0.7320 - 493ms/epoch - 2ms/step  
Loss: 0.5703492760658264, Accuracy: 0.7320116758346558  


---


Third attempt - since my train and test scores are similar at 100 epochs (indicating the model is not overtrained), bumped epoches to 200:  
268/268 - 0s - loss: 0.6099 - accuracy: 0.7300 - 401ms/epoch - 1ms/step  
Loss: 0.6098824739456177, Accuracy: 0.7300291657447815  


---


Fourth attempt - adding a third hidden layer (set epoch back to 100):  
268/268 - 1s - loss: 0.5928 - accuracy: 0.7327 - 561ms/epoch - 2ms/step  
**Loss: 0.592848002910614, Accuracy: 0.7327113747596741**


---


Fifth attempt - reducing number of bins, and thus inputs  
268/268 - 1s - loss: 0.9761 - accuracy: 0.7290 - 584ms/epoch - 2ms/step  
Loss: 0.9760890603065491, Accuracy: 0.7289795875549316    



---

# Conclusion

The forth attempt resulted in the highest accuracy of 73.27%, though this is only a minor increase from the model prior to optimization attempts. 
