## Model Optimization for Alphabet Soup Charity

In [20]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [21]:
# Drop any non-beneficial ID columns
# This time, keep NAME and encode

df = df.drop(["EIN"], axis=1)

In [22]:
# Find the number of unique values in each feature

df.nunique()

Unnamed: 0,0
NAME,19568
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747


In [23]:
# We need to group and possibly eliminate applicants with low number of applications
# This helps the model train on a more evenly distributed range of data
# Group any applicants with applications fewer than 5 with "Other"

application_count = df['NAME'].value_counts()

#  Application counts greater than 5

application_count[application_count>5]

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
OLD OAK CLIFF CONSERVATION LEAGUE INC,6
AMERICAN NEPHROLOGY NURSES ASSOCIATION,6
HUMBLE ISD EDUCATIONAL SUPPORT GROUPS INC,6
PROFESSIONAL LOADMASTER ASSOCIATION,6


In [24]:
# Create list of applicants with application counts <= 5
app_count_fiveandbelow = list(application_count[application_count <= 5].index)

# Iterate through list, group into "Other", replace in df
for application in app_count_fiveandbelow:
    df['NAME'] = df['NAME'].replace(application,"Other")

# Did it work?
df['NAME'].value_counts()

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
Other,20043
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
...,...
HABITAT FOR HUMANITY INTERNATIONAL,6
DAMAGE PREVENTION COUNCIL OF TEXAS,6
FLEET RESERVE ASSOCIATION,6
HUGH OBRIAN YOUTH LEADERSHIP,6


In [25]:
# Do the same with APPLICATION_TYPE - we will need to group application types with low counts into a new "Other" value
application_type_counts = df['APPLICATION_TYPE'].value_counts()
application_type_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [26]:
# Create a list of application types that have counts less than 500
application_types_under500 = list(application_type_counts[application_type_counts < 500].index)

# Iterate through list, group into "Other", replace in df
for application in application_types_under500:
    df['APPLICATION_TYPE'] = df['APPLICATION_TYPE'].replace(application,"Other")

# Did it work?
df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [27]:
# Do the same with CLASSIFICATION - we will need to group application types with low counts into a new "Other" value
classification_counts = df['CLASSIFICATION'].value_counts()
classification_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [28]:
# Create a list of classifications that have counts less than 1000
classifications_under1000 = list(classification_counts[classification_counts < 1000].index)

# Iterate through list, group into "Other", replace in df
for classification in classifications_under1000:
    df['CLASSIFICATION'] = df['CLASSIFICATION'].replace(classification,"Other")

# Did it work?
df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [29]:
# Do the same with ASK_AMT
ask_counts = df['ASK_AMT'].value_counts()
ask_counts

Unnamed: 0_level_0,count
ASK_AMT,Unnamed: 1_level_1
5000,25398
10478,3
15583,3
63981,3
6725,3
...,...
5371754,1
30060,1
43091152,1
18683,1


In [30]:
# Create a list of ask amounts that have counts outside of 5000
ask_amount_not_5000 = list(ask_counts[ask_counts < 5].index)

# Iterate through list, group into "Other", replace in df
for amount in ask_amount_not_5000:
    df['ASK_AMT'] = df['ASK_AMT'].replace(amount,"Other")

# Did it work?
df['ASK_AMT'].value_counts()

Unnamed: 0_level_0,count
ASK_AMT,Unnamed: 1_level_1
5000,25398
Other,8901


In [31]:
# Create new index list based on all object features in DataFrame
categories = df.dtypes[df.dtypes == "object"].index.tolist()

In [32]:
# Like before, convert categorical data to numeric with 'pd.get_dummies'
dummies_df = pd.get_dummies(df)

In [33]:
# Split our preprocessed data into our features and target arrays
X = dummies_df.drop(["IS_SUCCESSFUL"], axis='columns').values
y = dummies_df["IS_SUCCESSFUL"].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=99)

In [34]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn = tf.keras.models.Sequential()

input_features_len = len(X_train[0])


# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=200, input_dim=input_features_len, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=60, activation="sigmoid"))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=20, activation="sigmoid"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [36]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [40]:
# Train the model
train = nn.fit(X_train_scaled,y_train,epochs=100)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.7982 - loss: 0.4219
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7992 - loss: 0.4176
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8071 - loss: 0.4093
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8000 - loss: 0.4145
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7986 - loss: 0.4185
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7992 - loss: 0.4173
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8002 - loss: 0.4144
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7954 - loss: 0.4180
Epoch 9/100
[1m804/804[0m [32

In [41]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - 1ms/step - accuracy: 0.7991 - loss: 0.4435
Loss: 0.44354236125946045, Accuracy: 0.7990670800209045


In [42]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity_Optimization.h5")

