## Preprocessing

In [147]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")

# Bypass decription error
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False

application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [148]:
# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
tidied_application_df = application_df.drop(columns=['EIN', 'NAME'])
tidied_application_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [149]:
# Determine the number of unique values in each column.
unique_values = tidied_application_df.nunique()
unique_values

APPLICATION_TYPE            17
AFFILIATION                  6
CLASSIFICATION              71
USE_CASE                     5
ORGANIZATION                 4
STATUS                       2
INCOME_AMT                   9
SPECIAL_CONSIDERATIONS       2
ASK_AMT                   8747
IS_SUCCESSFUL                2
dtype: int64

In [150]:
# Extract the numerical part of APPLICATION_TYPE values and convert them to integers
numerical_order = sorted([int(x[1:]) for x in tidied_application_df['APPLICATION_TYPE'].unique()])

# Add 'T' back to the numerical order to reconstruct the desired order
desired_order = ['T' + str(num) for num in numerical_order]

# Look at APPLICATION_TYPE value counts for binning and order numerically
application_unique = tidied_application_df['APPLICATION_TYPE'].value_counts().reindex(desired_order)
application_unique

APPLICATION_TYPE
T2        16
T3     27037
T4      1542
T5      1173
T6      1216
T7       725
T8       737
T9       156
T10      528
T12       27
T13       66
T14        3
T15        2
T17        1
T19     1065
T25        3
T29        2
Name: count, dtype: int64

In [151]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `application_types_to_replace`

cutoff = 100
other = 0
application_refined = []
application_types_to_replace = []

for application_type, count in application_unique.items():
    if count < cutoff:
        other += count
        application_types_to_replace.append(application_type)
    else:
        application_refined.append(f"{application_type}: {count}")

application_refined.append(f"other: {other}")

application_refined_list = ["APPLICATION_TYPE"]
application_refined_list.extend(application_refined)

print(f"application types to be replaced: {application_types_to_replace}")

application types to be replaced: ['T2', 'T12', 'T13', 'T14', 'T15', 'T17', 'T25', 'T29']


In [152]:
#application refined list (application type values below the cutoff are totalled in 'other')
application_refined_list

['APPLICATION_TYPE',
 'T3: 27037',
 'T4: 1542',
 'T5: 1173',
 'T6: 1216',
 'T7: 725',
 'T8: 737',
 'T9: 156',
 'T10: 528',
 'T19: 1065',
 'other: 120']

In [153]:
# Extract the numerical part of CLASSIFICATION values and convert them to integers
numerical_order = sorted([int(x[1:]) for x in tidied_application_df['CLASSIFICATION'].unique()])

# Add 'C' back to the numerical order to reconstruct the desired order
desired_order = ['C' + str(num) for num in numerical_order]

# Look at CLASSIFICATION value counts for binning and order numerically
classification_unique = tidied_application_df['CLASSIFICATION'].value_counts().reindex(desired_order)
classification_unique

CLASSIFICATION
C0           3
C1000    17326
C1200     4837
C1230       36
C1234        2
         ...  
C7200       32
C7210        7
C8000       20
C8200       11
C8210        1
Name: count, Length: 71, dtype: int64

In [154]:
# Look at CLASSIFICATION value counts for binning
classification_unique = classification_unique.astype(int)

# Sort the Series based on the values
sorted_classification = classification_unique.sort_values(ascending=False)
sorted_classification

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
         ...  
C2190        1
C2380        1
C2500        1
C2561        1
C8210        1
Name: count, Length: 71, dtype: int32

In [155]:
# You may find it helpful to look at CLASSIFICATION value counts >1
cutoff = 1
classification_refined = []

for classification, count in sorted_classification.items():
    if count > cutoff:
        classification_refined.append(f"{classification}: {count}")

classification_refined_list = ["CLASSIFICATION"]
classification_refined_list.extend(classification_refined)

classification_refined_list

['CLASSIFICATION',
 'C1000: 17326',
 'C2000: 6074',
 'C1200: 4837',
 'C3000: 1918',
 'C2100: 1883',
 'C7000: 777',
 'C1700: 287',
 'C4000: 194',
 'C5000: 116',
 'C1270: 114',
 'C2700: 104',
 'C2800: 95',
 'C7100: 75',
 'C1300: 58',
 'C1280: 50',
 'C1230: 36',
 'C1400: 34',
 'C2300: 32',
 'C7200: 32',
 'C1240: 30',
 'C8000: 20',
 'C7120: 18',
 'C1500: 16',
 'C1800: 15',
 'C6000: 15',
 'C1250: 14',
 'C8200: 11',
 'C1278: 10',
 'C1238: 10',
 'C1235: 9',
 'C1237: 9',
 'C7210: 7',
 'C1720: 6',
 'C4100: 6',
 'C2400: 6',
 'C1600: 5',
 'C1257: 5',
 'C2710: 3',
 'C1260: 3',
 'C0: 3',
 'C1267: 2',
 'C1246: 2',
 'C1256: 2',
 'C3200: 2',
 'C1234: 2']

In [156]:
# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
cutoff = 10
other = 0
classification_refined = []
classifications_to_replace = []

for classification_entry in classification_refined_list[1:]:  # Excluding the first element which is "CLASSIFICATION"
    classification, count = classification_entry.split(': ')
    count = int(count)
    if count < cutoff:
        other += count
        classifications_to_replace.append(classification)
    else:
        classification_refined.append(f"{classification}: {count}")

classification_refined.append(f"other: {other}")

classification_refined_list = ["CLASSIFICATION"]
classification_refined_list.extend(classification_refined)

classification_refined_list

['CLASSIFICATION',
 'C1000: 17326',
 'C2000: 6074',
 'C1200: 4837',
 'C3000: 1918',
 'C2100: 1883',
 'C7000: 777',
 'C1700: 287',
 'C4000: 194',
 'C5000: 116',
 'C1270: 114',
 'C2700: 104',
 'C2800: 95',
 'C7100: 75',
 'C1300: 58',
 'C1280: 50',
 'C1230: 36',
 'C1400: 34',
 'C2300: 32',
 'C7200: 32',
 'C1240: 30',
 'C8000: 20',
 'C7120: 18',
 'C1500: 16',
 'C1800: 15',
 'C6000: 15',
 'C1250: 14',
 'C8200: 11',
 'C1278: 10',
 'C1238: 10',
 'other: 72']

In [157]:
# Convert categorical data to numeric with `pd.get_dummies`
dummied_df = pd.get_dummies(tidied_application_df, columns=['USE_CASE'])
dummied_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,USE_CASE_CommunityServ,USE_CASE_Heathcare,USE_CASE_Other,USE_CASE_Preservation,USE_CASE_ProductDev
0,T10,Independent,C1000,Association,1,0,N,5000,1,False,False,False,False,True
1,T3,Independent,C2000,Co-operative,1,1-9999,N,108590,1,False,False,False,True,False
2,T5,CompanySponsored,C3000,Association,1,0,N,5000,0,False,False,False,False,True
3,T3,CompanySponsored,C2000,Trust,1,10000-24999,N,6692,1,False,False,False,True,False
4,T3,Independent,C1000,Trust,1,100000-499999,N,142590,1,False,True,False,False,False


In [158]:
# Split our preprocessed data into our features and target arrays
X = dummied_df.drop(columns=['APPLICATION_TYPE', 'AFFILIATION', 'CLASSIFICATION', 'ORGANIZATION', 'INCOME_AMT', 'SPECIAL_CONSIDERATIONS', 'IS_SUCCESSFUL']) 
y = dummied_df['IS_SUCCESSFUL']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [159]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [160]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

# Define the number of input features
number_input_features = X_train.shape[1]  
# Define the number of nodes for each hidden layer
hidden_nodes_layer1 = 10
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 10)                80        
                                                                 
 dense_10 (Dense)            (None, 5)                 55        
                                                                 
 dense_11 (Dense)            (None, 1)                 6         
                                                                 
Total params: 141 (564.00 Byte)
Trainable params: 141 (564.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [161]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [162]:
# Train the model
nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x22ffad63790>

In [163]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

215/215 - 1s - loss: 0.6897 - accuracy: 0.5402 - 574ms/epoch - 3ms/step
Loss: 0.689677894115448, Accuracy: 0.5402332544326782


In [165]:
# Export our model to HDF5 file
nn.save("AlphabetSoupCharity_Optimisation.h5")