In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import and read the charity_data.csv.
# pandas is already imported as `pd` with the other dependencies at the top of
# this cell, so it is not imported a second time here.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [2]:
# Count the rows per 'STATUS' value; if one value dominates, the column
# carries little information and is a candidate for dropping
status_counts = application_df['STATUS'].value_counts()
status_counts

Unnamed: 0_level_0,count
STATUS,Unnamed: 1_level_1
1,34294
0,5


In [3]:
# Count the rows per 'SPECIAL_CONSIDERATIONS' value; if one value dominates,
# the column carries little information and is a candidate for dropping
special_considerations_counts = application_df['SPECIAL_CONSIDERATIONS'].value_counts()
special_considerations_counts

Unnamed: 0_level_0,count
SPECIAL_CONSIDERATIONS,Unnamed: 1_level_1
N,34272
Y,27


In [4]:
# Drop whole columns (no rows are filtered out here): 'STATUS' and
# 'SPECIAL_CONSIDERATIONS' are almost constant according to the value counts
# above, and the 'EIN' column is removed along with them
columns_to_drop = ['STATUS', 'SPECIAL_CONSIDERATIONS', 'EIN']
application_df = application_df.drop(columns=columns_to_drop)
application_df.head()

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,INCOME_AMT,ASK_AMT,IS_SUCCESSFUL
0,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,0,5000,1
1,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1-9999,108590,1
2,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,0,5000,0
3,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,10000-24999,6692,1
4,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,100000-499999,142590,1


In [6]:
# Show how many distinct values each remaining column holds
unique_value_counts = application_df.nunique()
unique_value_counts

Unnamed: 0,0
NAME,19568
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
INCOME_AMT,9
ASK_AMT,8747
IS_SUCCESSFUL,2


In [8]:
# Tally how often each NAME occurs; these counts drive the binning step
name_column = application_df['NAME']
name_counts = name_column.value_counts()

# Show only the names that occur more than 5 times
name_counts.loc[name_counts.gt(5)]

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,408
...,...
OLD OAK CLIFF CONSERVATION LEAGUE INC,6
AMERICAN NEPHROLOGY NURSES ASSOCIATION,6
HUMBLE ISD EDUCATIONAL SUPPORT GROUPS INC,6
PROFESSIONAL LOADMASTER ASSOCIATION,6


In [9]:
# Determine which values to replace: names whose count is less than or equal to 5
replace_names = list(name_counts[name_counts <= 5].index)

# Replace in dataframe with a single vectorized pass. Calling `.replace` once
# per rare name would rescan the entire column for every entry in
# `replace_names`; one membership test over the column gives the same result.
is_rare_name = application_df['NAME'].isin(replace_names)
application_df['NAME'] = application_df['NAME'].where(~is_rare_name, 'Other')

# Check to make sure binning was successful
application_df['NAME'].value_counts()

Unnamed: 0_level_0,count
NAME,Unnamed: 1_level_1
Other,20043
PARENT BOOSTER USA INC,1260
TOPS CLUB INC,765
UNITED STATES BOWLING CONGRESS INC,700
WASHINGTON STATE UNIVERSITY,492
...,...
HABITAT FOR HUMANITY INTERNATIONAL,6
DAMAGE PREVENTION COUNCIL OF TEXAS,6
FLEET RESERVE ASSOCIATION,6
HUGH OBRIAN YOUTH LEADERSHIP,6


In [11]:
# Look at APPLICATION_TYPE value counts for binning
application_type_counts = application_df['APPLICATION_TYPE'].value_counts()
# Display the variable that was just assigned; the later binning cell reads
# `application_type_counts`, so the same name is shown here.
application_type_counts

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
T9,156
T13,66


In [12]:
# Pick the cutoff: every application type that occurs fewer than 500 times is
# collected in `application_types_to_replace`
rare_application_types = application_type_counts.loc[application_type_counts < 500]
application_types_to_replace = rare_application_types.index.tolist()

# Collapse all of those rare types into a single "Other" bucket
is_rare_type = application_df['APPLICATION_TYPE'].isin(application_types_to_replace)
application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].mask(is_rare_type, "Other")

# Re-count the column to confirm the binning
application_df['APPLICATION_TYPE'].value_counts()

Unnamed: 0_level_0,count
APPLICATION_TYPE,Unnamed: 1_level_1
T3,27037
T4,1542
T6,1216
T5,1173
T19,1065
T8,737
T7,725
T10,528
Other,276


In [13]:
# Tally how often each CLASSIFICATION occurs; these counts drive the binning step
classification_column = application_df['CLASSIFICATION']
classification_counts = classification_column.value_counts()
classification_counts

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
C3000,1918
C2100,1883
...,...
C4120,1
C8210,1
C2561,1
C4500,1


In [14]:
# Pick the cutoff: every classification that occurs fewer than 1000 times is
# collected in `classifications_to_replace`
rare_classifications = classification_counts.loc[classification_counts < 1000]
classifications_to_replace = rare_classifications.index.tolist()

# Keep the common classifications and label everything else "Other"
is_common_classification = ~application_df['CLASSIFICATION'].isin(classifications_to_replace)
application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].where(is_common_classification, "Other")

# Re-count the column to confirm the binning
application_df['CLASSIFICATION'].value_counts()

Unnamed: 0_level_0,count
CLASSIFICATION,Unnamed: 1_level_1
C1000,17326
C2000,6074
C1200,4837
Other,2261
C3000,1918
C2100,1883


In [15]:
# Collect the names of the columns whose dtype is 'object' (the categorical ones)
application_cat = [
    column_name
    for column_name, column_dtype in application_df.dtypes.items()
    if column_dtype == 'object'
]
application_cat

['NAME',
 'APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT']

In [16]:
# One-hot encode the categorical columns by piping the frame through
# `pd.get_dummies`
application_df = application_df.pipe(pd.get_dummies)
application_df.head()

Unnamed: 0,ASK_AMT,IS_SUCCESSFUL,NAME_AACE INTERNATIONAL,NAME_ACE MENTOR PROGRAM OF AMERICA INC,NAME_AFRICAN-AMERICAN POSTAL LEAGUE UNITED FOR SUCCESS A-PLUS,NAME_AIR FORCE ASSOCIATION,NAME_ALABAMA FEDERATION OF WOMENS CLUBS,NAME_ALABAMA TREASURE FOREST ASSOCIATION,NAME_ALBANY STATE UNIVERSITY NATIONAL ALUMNI ASSOCIATION,NAME_ALPHA PHI OMEGA,...,ORGANIZATION_Trust,INCOME_AMT_0,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M
0,5000,1,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1,108590,1,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,5000,0,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,6692,1,False,False,False,False,False,False,False,False,...,True,False,False,True,False,False,False,False,False,False
4,142590,1,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


In [17]:
# Separate the target column (y) from the feature matrix (X)
target_column = 'IS_SUCCESSFUL'
y = application_df[target_column]
X = application_df.drop(columns=target_column).to_numpy()

# Hold out a test set; random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [18]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler on the training features only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(feature_split) for feature_split in (X_train, X_test)
)

In [19]:
# Define the model - a deep neural net with three hidden layers.
# The input width is the number of feature columns in the training matrix.
input_features = X_train.shape[1]
nodes_layer1 = 100
nodes_layer2 = 30
nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# Declare the input shape once with an explicit Input object instead of
# passing `input_dim` to Dense layers.
nn.add(
    tf.keras.Input(shape = (input_features,))
)

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units = nodes_layer1, activation = 'relu')
)

# Second hidden layer
nn.add(
    tf.keras.layers.Dense(units = nodes_layer2, activation = 'relu')
)

# Third hidden layer
nn.add(
    tf.keras.layers.Dense(units = nodes_layer3, activation = 'relu')
)

# Output layer: a single sigmoid unit for the binary IS_SUCCESSFUL target.
# No input size is given here; it follows from the layer added just before
# (nodes_layer3 units).
nn.add(
    tf.keras.layers.Dense(units = 1, activation = 'sigmoid')
)

# Check the structure of the model
nn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
# Configure training: binary cross-entropy loss, the Adam optimizer, and
# accuracy as the reported metric
nn.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [21]:
# Fit the network on the scaled training data for 100 epochs and keep the
# object returned by `fit` in `fit_model`
training_epochs = 100
fit_model = nn.fit(x = X_train_scaled, y = y_train, epochs = training_epochs)

Epoch 1/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7576 - loss: 0.5164
Epoch 2/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7935 - loss: 0.4314
Epoch 3/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7947 - loss: 0.4234
Epoch 4/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.7984 - loss: 0.4225
Epoch 5/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7991 - loss: 0.4235
Epoch 6/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8013 - loss: 0.4205
Epoch 7/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8013 - loss: 0.4149
Epoch 8/100
[1m804/804[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.7987 - loss: 0.4197
Epoch 9/100
[1m804/804[0m [32

In [22]:
# Score the trained network on the scaled test data and report loss and accuracy
evaluation_results = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation_results
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 1s - 2ms/step - accuracy: 0.7924 - loss: 0.4618
Loss: 0.4617970585823059, Accuracy: 0.7924198508262634


In [23]:
# Write the trained model to disk as an HDF5 (.h5) file
model_output_path = 'AlphabetSoupCharity_Optimization.h5'
nn.save(model_output_path)



# **Neural Network Model Report**

## **Overview**

#### The goal of this analysis is to build a tool that can help predict which organizations have the best chances of being successful.

## **Results**

### Data Processing:
  * Target Variable: IS_SUCCESSFUL
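  * Features: All variables that remain after the removals below; in the optimized model NAME is kept as a feature, with names that appear 5 or fewer times binned into "Other"
  * Removed Variables: EIN and NAME in the initial model; EIN, STATUS, and SPECIAL_CONSIDERATIONS in the optimized model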

### Compiling, Training, and Evaluating the Model:
  * Neurons, Layers, and Activation Functions:
    * First Hidden Layer: 80 neurons with ReLU
    * Second Hidden Layer: 30 neurons with ReLU
    * Output Layer: 1 neuron with Sigmoid
  * Model Performance:
    * The model was not able to achieve target performance, reaching 73% accuracy with a loss of 0.56 on the test data
  * Optimization Attempt:
    * First Hidden Layer: 100 neurons with ReLU
    * Second Hidden Layer: 30 neurons with ReLU
    * Additional (Third) Hidden Layer: 10 neurons with ReLU
    * Output Layer: 1 neuron with Sigmoid
  * Optimized Model Performance:
    * The optimized model was able to achieve target performance, with a test accuracy of about 79% (0.792) and a test loss of about 0.46

### **Summary**

#### The model performed reasonably well and the optimization attempts significantly improved the model’s performance, achieving the desired accuracy target. The adjustments to the number of neurons and layers were crucial in enhancing the model’s predictive ability, but further optimization could include trying a different model like Random Forest.

