##Optimize the model

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd
application_df = pd.read_csv("https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
#drop the nonbeneficial ID columns
application_df=application_df.drop(['EIN'],axis=1)

In [2]:
application_df.nunique()

Unnamed: 0,0
EIN,34299
NAME,19568
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2


In [3]:
#look at name value counts to identify and replace with "other"
name_counts = application_df["NAME"].value_counts()

#how many counts are greater than 5
name_counts[name_counts>5]

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
OLD OAK CLIFF CONSERVATION LEAGUE INC,6
AMERICAN NEPHROLOGY NURSES ASSOCIATION,6
HUMBLE ISD EDUCATIONAL SUPPORT GROUPS INC,6
PROFESSIONAL LOADMASTER ASSOCIATION,6


In [4]:
#determine which values to replace if counts are less than or equal to 5
names_to_replace = list(name_counts[name_counts<=5].index)

#replace in dataframe
for app in names_to_replace:
    application_df['NAME']=application_df['NAME'].replace(app, "Other")

#check to make sure replacement was successful
application_df['NAME'].value_counts()

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
Other,20043
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
...,...
HABITAT FOR HUMANITY INTERNATIONAL,6
DAMAGE PREVENTION COUNCIL OF TEXAS,6
FLEET RESERVE ASSOCIATION,6
HUGH OBRIAN YOUTH LEADERSHIP,6


In [5]:
#look at APPLICATION_TYPE value counts to identify and replace with other
application_counts = application_df['APPLICATION_TYPE'].value_counts()
application_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [6]:
application_types_to_replace = list(application_counts[application_counts<500].index)

#replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE']=application_df['APPLICATION_TYPE'].replace(app, "Other")

#check to make sure replacement was successful
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [7]:
#look at classification value counts to identify and replace with other
classification_counts = application_df["CLASSIFICATION"].value_counts()
classification_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [8]:
#determine which values to replace if counts are less than 1000
classes_to_replace = list(classification_counts[classification_counts<1000].index)

for cls in classes_to_replace:
    application_df['CLASSIFICATION']=application_df['CLASSIFICATION'].replace(cls, "Other")

#check to make sure replacement was successful
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [9]:
#generate our categorical variable lists
application_cat = application_df.dtypes[application_df.dtypes=="object"].index.tolist()


In [10]:
application_with_dummies_df = pd.get_dummies(application_df)

In [11]:
#split our preprocessed data into our features and target arrays
X = application_with_dummies_df.drop(["IS_SUCCESSFUL"], axis="columns").values
y = application_with_dummies_df["IS_SUCCESSFUL"].values

#split the preprocessed data into a training and testing dataset
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=78)

In [12]:
#create a standardscaler instance
scaler = StandardScaler()

#fit the standardscaler
X_scaler = scaler.fit(X_train)

#scale the data
x_train_scaled = X_scaler.transform(X_train)
x_test_scaled = X_scaler.transform(X_test)

In [19]:
#define the model
from tensorflow import keras
from tensorflow.keras import layers
model = keras.Sequential()
#add hidden layers with more neurons and different activation functions
model.add(layers.Dense(units=128, activation='relu', input_dim=x_train_scaled.shape[1]))
model.add(layers.Dense(units=64, activation='relu'))
model.add(layers.Dense(units=32, activation='relu'))
model.add(layers.Dense(units=1, activation='sigmoid'))  # Output layer

#compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
#train the model with a higher number of epochs
history = model.fit(x_train_scaled, y_train, epochs=150, batch_size=32, validation_split=0.2)

Epoch 1/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7501 - loss: 0.5233 - val_accuracy: 0.7858 - val_loss: 0.4496
Epoch 2/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7952 - loss: 0.4317 - val_accuracy: 0.7858 - val_loss: 0.4512
Epoch 3/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7978 - loss: 0.4232 - val_accuracy: 0.7872 - val_loss: 0.4457
Epoch 4/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8037 - loss: 0.4133 - val_accuracy: 0.7879 - val_loss: 0.4525
Epoch 5/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8016 - loss: 0.4134 - val_accuracy: 0.7870 - val_loss: 0.4503
Epoch 6/150
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8002 - loss: 0.4174 - val_accuracy: 0.7891 - val_loss: 0.4557
Epoch 7/150
[1m644/64

In [22]:
#evaluate the model on the test dataset
test_loss, test_accuracy = model.evaluate(x_test_scaled, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')

[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7894 - loss: 0.5979
Test Accuracy: 0.79


In [23]:
model.save('AlphabetSoupCharity_Optimization.h5')

