In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import numpy as np
import pandas as pd 
from sklearn.decomposition import PCA

# Read the raw charity applications into a DataFrame and preview the first rows.
charity_csv_path = "Resources/charity_data.csv"
application_df = pd.read_csv(charity_csv_path)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Set Up Variables
# ---------------
# Work on a copy of the raw frame without the EIN and NAME columns.
new_df = application_df.drop(['EIN','NAME'], axis = 1)
unique = new_df.nunique()
# First two columns (in frame order) that have more than 10 distinct values.
selected_cols = [col for col, n_distinct in unique.items() if n_distinct > 10][:2]
application_counts = new_df[selected_cols[0]].value_counts()
classification_counts = new_df[selected_cols[1]].value_counts()

# Bin Application Column and Classifications Column
# -------------------------------------------------
# Application Set Up
# ------------------
cut_off = 27037
# NOTE(review): this selects only the application types whose row count is
# *exactly* `cut_off`, so a single label is renamed to "Other" and every rarer
# type keeps its own label. A conventional rare-category bin would test
# `< cut_off`. The exact-match form is kept because the notes at the end of the
# notebook report that leaving this column unbinned scored better -- confirm.
application_types_to_replace = [
    app_type for app_type, row_count in application_counts.items() if row_count == cut_off
]

# One vectorised pass instead of one .replace() call per label.
new_df['APPLICATION_TYPE'] = new_df['APPLICATION_TYPE'].mask(
    new_df['APPLICATION_TYPE'].isin(application_types_to_replace), "Other"
)

# Classifications Set Up
# ----------------------
min_cut_off = 1
cut_off = 1883

# Collect every classification seen fewer than `cut_off` times. The
# `== min_cut_off` test only adds anything if min_cut_off is raised to
# cut_off or above; with the values above it is already covered by `<`.
# The loop variable is deliberately not called `type`: a top-level loop would
# rebind the builtin for the rest of the kernel session.
classifications_to_replace = [
    label
    for label, row_count in classification_counts.items()
    if row_count < cut_off or row_count == min_cut_off
]

new_df['CLASSIFICATION'] = new_df['CLASSIFICATION'].mask(
    new_df['CLASSIFICATION'].isin(classifications_to_replace), "Other"
)

# Show the resulting APPLICATION_TYPE distribution as the cell output.
new_df['APPLICATION_TYPE'].value_counts()

Other    27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
T13         66
T12         27
T2          16
T25          3
T14          3
T15          2
T29          2
T17          1
Name: APPLICATION_TYPE, dtype: int64

## Added PCA Attempt

In [5]:
# Set up final Dataframe that will be used for model training and evaluation
# --------------------------------------------------------------------------
# One-hot encode the categorical columns, then separate target from features.
dum_df = pd.get_dummies(new_df)
target = dum_df['IS_SUCCESSFUL']
features = dum_df.drop(['IS_SUCCESSFUL'], axis = 1)

# A fixed random_state makes the shuffle deterministic, so re-running this cell
# returns the same train/test rows and the reported metrics are comparable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Scale: fit on the training rows only, then apply the same transform to both sets.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Convert to Numpy Arrays to avoid data type issue with TensorFlow
scale_train_x = np.asarray(X_train_scaled)
train_y = np.asarray(y_train)
scale_test_x = np.asarray(X_test_scaled)
test_y = np.asarray(y_test)

# PCA attempt to reduce number of dimensions.
# A float n_components keeps as many components as needed to explain that
# fraction of the variance. The projection is fitted on the training rows only.
pca = PCA(n_components = 0.80)
pca_data_train = pca.fit_transform(scale_train_x)
pca_data_test = pca.transform(scale_test_x)


In [None]:
# Define and fit the deep neural net model
# ---------------------------------
# The network's input width is the number of PCA components that were kept.
number_input_features = pca_data_train.shape[1]

# Dense stack 32 -> 16 -> 16 -> 5 (relu, relu, relu, tanh) feeding one sigmoid
# output unit; only the first layer declares the input width.
nn_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=32, activation="relu", input_dim=number_input_features),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=5, activation="tanh"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn_model.summary()

# Compile, then train on the PCA-reduced training data.
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(pca_data_train, train_y, epochs=5)

# Evaluate the Model and Save to h5 file
# --------------------------------------
model_loss, model_accuracy = nn_model.evaluate(pca_data_test, test_y, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Save and export to h5 file.
# NOTE(review): the non-PCA model cell further down saves to this same path.
nn_model.save("AlphabetSoupCharity_Optimization.h5")

## Without PCA Attempt

In [15]:
# Set up final Dataframe that will be used for model training and evaluation
# --------------------------------------------------------------------------
# One-hot encode the categorical columns, then separate target from features.
dum_df = pd.get_dummies(new_df)
target = dum_df['IS_SUCCESSFUL']
features = dum_df.drop(['IS_SUCCESSFUL'], axis = 1)
# A fixed random_state makes the shuffle deterministic, so re-running this cell
# returns the same train/test rows and the reported metrics are comparable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=42)

# Scale: fit on the training rows only, then apply the same transform to both sets.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Convert to Numpy Arrays to avoid data type issue with TensorFlow
scale_train_x = np.asarray(X_train_scaled)
train_y = np.asarray(y_train)
scale_test_x = np.asarray(X_test_scaled)
test_y = np.asarray(y_test)

In [21]:
# Define and fit the deep neural net model
# ---------------------------------
# Input width is the column count of the scaled (non-PCA) training matrix.
number_input_features = scale_train_x.shape[1]

# Layers in order: relu 32, relu 16, relu 16, tanh 5, then one sigmoid output
# unit. Only the first layer declares the input width.
hidden_and_output_layers = [
    tf.keras.layers.Dense(units=32, activation="relu", input_dim=number_input_features),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=5, activation="tanh"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
]
nn_model = tf.keras.models.Sequential(hidden_and_output_layers)

# Check the structure of the model
nn_model.summary()

# Compile, then train on the scaled training data.
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
fit_model = nn_model.fit(scale_train_x, train_y, epochs=25)

# Evaluate the Model and Save to h5 file
# --------------------------------------
model_loss, model_accuracy = nn_model.evaluate(scale_test_x, test_y, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Save and export to h5 file.
# NOTE(review): the PCA model cell above saves to this same path.
nn_model.save("AlphabetSoupCharity_Optimization.h5")

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_53 (Dense)             (None, 32)                1664      
_________________________________________________________________
dense_54 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_55 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_56 (Dense)             (None, 5)                 85        
_________________________________________________________________
dense_57 (Dense)             (None, 1)                 6         
Total params: 2,555
Trainable params: 2,555
Non-trainable params: 0
_________________________________________________________________
Train on 25724 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 

## Using Tuner Attempt

In [7]:
import keras_tuner as kt
import random

# Width of the PCA-reduced training matrix; create_model reads this global.
number_input_features = pca_data_train.shape[1]

def create_model(hp):
    """Build and compile a binary classifier from tuner-chosen hyperparameters.

    Hyperparameters registered on ``hp``, in this order:
      * ``activation``  - one of relu / tanh / softmax, shared by every hidden layer
      * ``first_units`` - width of the first layer (6..32, step 2)
      * ``num_layers``  - number of additional hidden layers (1..3)
      * ``units_<i>``   - width of additional hidden layer ``i`` (6..32, step 2)

    The first layer's input width comes from the module-level
    ``number_input_features``. Returns the compiled Sequential model
    (binary cross-entropy loss, adam optimizer, accuracy metric).
    """
    # Allow kerastuner to decide which activation function to use in hidden layers
    hidden_activation = hp.Choice('activation',['relu','tanh',"softmax"])

    # Allow kerastuner to decide number of neurons in first layer
    first_width = hp.Int('first_units', min_value=6, max_value=32, step=2)
    layer_stack = [
        tf.keras.layers.Dense(
            units=first_width,
            activation=hidden_activation,
            input_dim=number_input_features,
        )
    ]

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    extra_layer_count = hp.Int('num_layers', 1, 3)
    for layer_idx in range(extra_layer_count):
        layer_width = hp.Int(f"units_{layer_idx}", min_value=6, max_value=32, step=2)
        layer_stack.append(
            tf.keras.layers.Dense(units=layer_width, activation=hidden_activation)
        )

    # Single sigmoid unit for the binary target.
    layer_stack.append(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    tuned_model = tf.keras.models.Sequential(layer_stack)

    # Compile the model
    tuned_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    return tuned_model
# Project the *current* scaled split through PCA right before tuning.
# train_y / test_y are rebound by every data-prep cell that calls
# train_test_split, whereas pca_data_train / pca_data_test are produced only by
# the PCA cell. When a later split has run in between (as happens in
# top-to-bottom order), those arrays have the same length as the labels but are
# no longer row-aligned with them, and the tuner would silently fit on
# mismatched rows and labels.
tuner_pca = PCA(n_components=pca.n_components)  # same setting as the PCA cell
tuner_train_x = tuner_pca.fit_transform(scale_train_x)
tuner_test_x = tuner_pca.transform(scale_test_x)
# create_model reads this global, so it must match the arrays given to search().
number_input_features = tuner_train_x.shape[1]

# The random suffix gives each run its own tuner directory.
directory_name  = "Tuner" + str(random.randint(1,1234456))
tuner = kt.Hyperband(create_model,objective="val_accuracy",max_epochs=5,hyperband_iterations=2,directory = directory_name)
# NOTE(review): the test split doubles as the tuner's validation data, so the
# val_accuracy used for model selection is measured on the same rows that are
# used for the final evaluation -- consider a separate validation split.
tuner.search(tuner_train_x,train_y,epochs=5,validation_data=(tuner_test_x,test_y))

Trial 20 Complete [00h 00m 41s]
val_accuracy: 0.7329446077346802

Best val_accuracy So Far: 0.7351603507995605
Total elapsed time: 00h 09m 50s
INFO:tensorflow:Oracle triggered exit


In [None]:
# Attempted PCA to lower the number of input dimensions.
# Unbinning the application type data column actually increased the accuracy of the NN.
# The tuner attempt achieved its best accuracy with PCA n_components = 0.90; lowering the PCA ratio slightly increased accuracy.