## Preprocessing

In [8]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import tensorflow as tf

%run helper_functions.ipynb

# Load the charity application records from the hosted CSV and preview the first rows.
CHARITY_DATA_URL = "https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv"
application_df = pd.read_csv(CHARITY_DATA_URL)
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


__Target Variable__: `IS_SUCCESSFUL`

__Feature Variables__: All other columns

In [9]:
# Drop only the 'EIN' identifier column; 'NAME' is kept here and is passed to
# limit_unique in a later cell.
application_df = application_df.drop('EIN', axis='columns')

In [10]:
# Count the distinct values in every column to spot high-cardinality features.
application_df.nunique(axis=0)

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [11]:
# Cap NAME, APPLICATION_TYPE and CLASSIFICATION at a maximum of 10 categories each.
# NOTE(review): limit_unique is not defined in this notebook (presumably loaded by
# `%run helper_functions.ipynb`); judging by the printed output it groups rare
# values under 'Other' -- confirm against the helper.
high_cardinality_columns = ['NAME', 'APPLICATION_TYPE', 'CLASSIFICATION']
limit_unique(application_df, max_value=10, columns_to_limit=high_cardinality_columns)

NAME
Other                                              29369
PARENT BOOSTER USA INC                              1260
TOPS CLUB INC                                        765
UNITED STATES BOWLING CONGRESS INC                   700
WASHINGTON STATE UNIVERSITY                          492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC      408
PTA TEXAS CONGRESS                                   368
SOROPTIMIST INTERNATIONAL OF THE AMERICAS INC        331
ALPHA PHI SIGMA                                      313
TOASTMASTERS INTERNATIONAL                           293
Name: count, dtype: int64
Number of unique values: 10

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: count, dtype: int64
Number of unique values: 10

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Other      887
C7000      777
C1700      287
C4000      19

### Map to Binary Integers

In [12]:
# SPECIAL_CONSIDERATIONS only holds 'N'/'Y', so encode it as 0/1 and print the
# value counts before and after to confirm no rows were lost.
special_col = 'SPECIAL_CONSIDERATIONS'
flag_to_int = {'N': 0, 'Y': 1}

print(f"Before conversion: {application_df[special_col].value_counts()}\n")
application_df[special_col] = application_df[special_col].map(flag_to_int)
print(f"After conversion: {application_df[special_col].value_counts()}")

Before conversion: SPECIAL_CONSIDERATIONS
N    34272
Y       27
Name: count, dtype: int64

After conversion: SPECIAL_CONSIDERATIONS
0    34272
1       27
Name: count, dtype: int64


### Ordinal Encoding
- The `500000-1M` income bracket has no rows in the dataset.
- Solution: use ordinal encoding so that every bracket, including the empty one, keeps its place in the income ordering.

In [13]:
# Income brackets listed from lowest to highest; each bracket's ordinal score
# is simply its position in this list.
income_brackets = [
    '0',
    '1-9999',
    '10000-24999',
    '25000-99999',
    '100000-499999',
    '500000-1M',  # No data in this category, but it still occupies score 5
    '1M-5M',
    '5M-10M',
    '10M-50M',
    '50M+',
]
income_map = {bracket: score for score, bracket in enumerate(income_brackets)}
application_df['ORDINAL_INCOME_AMT'] = application_df['INCOME_AMT'].map(income_map)

### Numerical Income Amounts

In [14]:
# List every INCOME_AMT bracket present in the data together with its row count.
application_df.loc[:, 'INCOME_AMT'].value_counts()

INCOME_AMT
0                24388
25000-99999       3747
100000-499999     3374
1M-5M              955
1-9999             728
10000-24999        543
10M-50M            240
5M-10M             185
50M+               139
Name: count, dtype: int64

In [15]:
# Split each bracket label on its hyphen: the left part becomes LOWER_INCOME and
# the right part UPPER_INCOME (missing for labels without a hyphen, e.g. '0', '50M+').
income_bounds = application_df['INCOME_AMT'].str.split('-', expand=True)
application_df['LOWER_INCOME'] = income_bounds[0]
application_df['UPPER_INCOME'] = income_bounds[1]

In [16]:
# Rewrite the 'M' (millions) suffix in both bound columns as 'e6' so the strings
# can later be parsed as numbers (e.g. '5M' -> '5e6').
for bound_col in ('UPPER_INCOME', 'LOWER_INCOME'):
    application_df[bound_col] = application_df[bound_col].str.replace('M', 'e6')

# Confirm changes
bound_pairs = application_df[['LOWER_INCOME', 'UPPER_INCOME']]
display(bound_pairs.value_counts())
application_df.head()

LOWER_INCOME  UPPER_INCOME
25000         99999           3747
100000        499999          3374
1e6           5e6              955
1             9999             728
10000         24999            543
10e6          50e6             240
5e6           10e6             185
Name: count, dtype: int64

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ORDINAL_INCOME_AMT,LOWER_INCOME,UPPER_INCOME
0,Other,T10,Independent,C1000,ProductDev,Association,1,0,0,5000,1,0,0,
1,Other,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,0,108590,1,1,1,9999.0
2,Other,T5,CompanySponsored,C3000,ProductDev,Association,1,0,0,5000,0,0,0,
3,Other,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,0,6692,1,2,10000,24999.0
4,Other,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,0,142590,1,4,100000,499999.0


In [17]:
# Rows in the '0' bracket have no upper bound after the split; give them an
# UPPER_INCOME of 0 so both bounds agree.
zero_income_rows = application_df['LOWER_INCOME'] == '0'
application_df.loc[zero_income_rows, 'UPPER_INCOME'] = 0
application_df.head()

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ORDINAL_INCOME_AMT,LOWER_INCOME,UPPER_INCOME
0,Other,T10,Independent,C1000,ProductDev,Association,1,0,0,5000,1,0,0,0
1,Other,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,0,108590,1,1,1,9999
2,Other,T5,CompanySponsored,C3000,ProductDev,Association,1,0,0,5000,0,0,0,0
3,Other,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,0,6692,1,2,10000,24999
4,Other,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,0,142590,1,4,100000,499999


In [18]:
# Handle the open-ended '50M+' bracket. It has no hyphen, so after the split and
# the 'M' -> 'e6' rewrite its LOWER_INCOME is '50e6+' and its UPPER_INCOME is missing.
top_bracket_rows = application_df['INCOME_AMT'] == '50M+'

# The bracket has no real ceiling, so use 50M as the upper bound as well
application_df.loc[top_bracket_rows, 'UPPER_INCOME'] = 50e6

# Strip the trailing '+' so the lower bound parses as a number
application_df['LOWER_INCOME'] = application_df['LOWER_INCOME'].replace('50e6+', '50e6')

# Parse both bounds (including the 'e6' scientific-notation strings) and store them as integers
application_df['LOWER_INCOME'] = pd.to_numeric(application_df['LOWER_INCOME']).astype(int)
application_df['UPPER_INCOME'] = pd.to_numeric(application_df['UPPER_INCOME']).astype(int)

In [19]:
# Midpoint of each income bracket: the mean of its lower and upper bounds.
application_df['AVG_INCOME'] = (
    application_df['LOWER_INCOME'] + application_df['UPPER_INCOME']
) / 2

### Ask Amount vs Average Income Comparison

In [20]:
# Flag applications whose requested amount (ASK_AMT) exceeds the organisation's
# average income (AVG_INCOME, the midpoint of its income bracket).
ask_exceeds_income = application_df['ASK_AMT'] > application_df['AVG_INCOME']

# Store the boolean flag as 0/1 integers, like the other binary columns
application_df['ASK_VS_INCOME'] = ask_exceeds_income.astype('int64')

# Display updated dataframe
application_df.head()

Unnamed: 0,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL,ORDINAL_INCOME_AMT,LOWER_INCOME,UPPER_INCOME,AVG_INCOME,ASK_VS_INCOME
0,Other,T10,Independent,C1000,ProductDev,Association,1,0,0,5000,1,0,0,0,0.0,1
1,Other,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,0,108590,1,1,1,9999,5000.0,1
2,Other,T5,CompanySponsored,C3000,ProductDev,Association,1,0,0,5000,0,0,0,0,0.0,1
3,Other,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,0,6692,1,2,10000,24999,17499.5,0
4,Other,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,0,142590,1,4,100000,499999,299999.5,0


### Combination Columns

In [21]:
# Combine AFFILIATION and ORGANIZATION into one feature, joined by an underscore,
# then show how often each pairing occurs.
application_df['AFFILIATION_ORGANIZATION'] = application_df['AFFILIATION'].str.cat(
    application_df['ORGANIZATION'], sep="_"
)
application_df['AFFILIATION_ORGANIZATION'].value_counts()

AFFILIATION_ORGANIZATION
Independent_Trust                16212
CompanySponsored_Association      8479
CompanySponsored_Trust            7206
Independent_Association           1764
Independent_Co-operative           465
Family/Parent_Trust                 55
Independent_Corporation             39
National_Trust                      29
CompanySponsored_Co-operative       18
Regional_Trust                      11
Family/Parent_Association            9
National_Association                 2
National_Co-operative                2
Other_Corporation                    2
CompanySponsored_Corporation         2
Other_Trust                          2
Regional_Association                 1
Regional_Co-operative                1
Name: count, dtype: int64

In [22]:
# Combine AFFILIATION and USE_CASE into one feature, joined by an underscore,
# then show how often each pairing occurs.
application_df['AFFILIATION_USECASE'] = application_df['AFFILIATION'].str.cat(
    application_df['USE_CASE'], sep="_"
)
application_df['AFFILIATION_USECASE'].value_counts()

AFFILIATION_USECASE
Independent_Preservation          16218
CompanySponsored_Preservation     11779
CompanySponsored_ProductDev        3542
Independent_ProductDev             2117
CompanySponsored_CommunityServ      357
Independent_Heathcare               116
Family/Parent_Preservation           57
CompanySponsored_Heathcare           27
National_Preservation                26
Independent_CommunityServ            26
Regional_Preservation                12
Family/Parent_ProductDev              6
National_ProductDev                   4
Independent_Other                     3
National_Heathcare                    3
Other_Preservation                    3
Other_ProductDev                      1
Regional_ProductDev                   1
Family/Parent_CommunityServ           1
Name: count, dtype: int64

In [23]:
# Cap the two combination columns at a maximum of 10 categories each.
# NOTE(review): limit_unique is not defined in this notebook (presumably loaded by
# `%run helper_functions.ipynb`); the printed output reports 11 values for
# AFFILIATION_USECASE -- confirm how the helper treats tied counts.
combination_columns = ['AFFILIATION_ORGANIZATION', 'AFFILIATION_USECASE']
limit_unique(application_df, max_value=10, columns_to_limit=combination_columns)

AFFILIATION_ORGANIZATION
Independent_Trust                16212
CompanySponsored_Association      8479
CompanySponsored_Trust            7206
Independent_Association           1764
Independent_Co-operative           465
Family/Parent_Trust                 55
Independent_Corporation             39
Other                               32
National_Trust                      29
CompanySponsored_Co-operative       18
Name: count, dtype: int64
Number of unique values: 10

AFFILIATION_USECASE
Independent_Preservation          16218
CompanySponsored_Preservation     11779
CompanySponsored_ProductDev        3542
Independent_ProductDev             2117
CompanySponsored_CommunityServ      357
Independent_Heathcare               116
Family/Parent_Preservation           57
Other                                34
CompanySponsored_Heathcare           27
Independent_CommunityServ            26
National_Preservation                26
Name: count, dtype: int64
Number of unique values: 11



### Encoding

In [24]:
# Names of every column still stored as strings (object dtype)
categorical_hot = application_df.select_dtypes(include="object").columns.tolist()

# One-hot encode those columns, emitting integer 0/1 indicators
encoded_columns = pd.get_dummies(application_df[categorical_hot], dtype=int)
encoded_columns.head()

Unnamed: 0,NAME_ALPHA PHI SIGMA,NAME_AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,NAME_Other,NAME_PARENT BOOSTER USA INC,NAME_PTA TEXAS CONGRESS,NAME_SOROPTIMIST INTERNATIONAL OF THE AMERICAS INC,NAME_TOASTMASTERS INTERNATIONAL,NAME_TOPS CLUB INC,NAME_UNITED STATES BOWLING CONGRESS INC,NAME_WASHINGTON STATE UNIVERSITY,...,AFFILIATION_USECASE_CompanySponsored_Heathcare,AFFILIATION_USECASE_CompanySponsored_Preservation,AFFILIATION_USECASE_CompanySponsored_ProductDev,AFFILIATION_USECASE_Family/Parent_Preservation,AFFILIATION_USECASE_Independent_CommunityServ,AFFILIATION_USECASE_Independent_Heathcare,AFFILIATION_USECASE_Independent_Preservation,AFFILIATION_USECASE_Independent_ProductDev,AFFILIATION_USECASE_National_Preservation,AFFILIATION_USECASE_Other
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [25]:
# Split our preprocessed data into our features and target arrays
# Isolate the target array
y = application_df['IS_SUCCESSFUL']

# Build the feature array: drop the target and the raw categorical columns,
# then append their one-hot encoded equivalents
X = pd.concat(
    [
        application_df.drop(columns=['IS_SUCCESSFUL', *categorical_hot]),
        encoded_columns,
    ],
    axis=1,
)

# Fixed seed so the stratified train/test split (and every metric computed
# from it) is reproducible across kernel restarts
SPLIT_RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=SPLIT_RANDOM_STATE
)

In [26]:
# Create a StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [27]:
# Network dimensions: one input per feature column, followed by two hidden layers.
number_input_features = X_train.shape[1]
hidden_nodes_layer0 = 80
hidden_nodes_layer1 = 30

# Assemble the deep neural net as an ordered list of layers
nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input size)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer0,
        input_dim=number_input_features,
        activation="relu",
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"),
    # Output layer: a single sigmoid unit for the binary target
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 80)                6720      
                                                                 
 dense_1 (Dense)             (None, 30)                2430      
                                                                 
 dense_2 (Dense)             (None, 1)                 31        
                                                                 
Total params: 9181 (35.86 KB)
Trainable params: 9181 (35.86 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [29]:
# Fit the network on the scaled training data for 100 epochs, logging each epoch
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [30]:
# Score the trained model on the held-out test data; evaluate() yields the loss
# followed by the accuracy metric configured at compile time
test_metrics = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

268/268 - 0s - loss: 0.5144 - accuracy: 0.7504 - 121ms/epoch - 453us/step
Loss: 0.514432966709137, Accuracy: 0.750437319278717


In [31]:
# Write the trained model to disk in HDF5 format
MODEL_OUTPUT_PATH = 'models/AlphabetSoupCharity_iteration6.h5'
nn.save(MODEL_OUTPUT_PATH, save_format='h5')

  saving_api.save_model(
