# Preprocessing

In [1]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
from pprint import pprint

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
import keras_tuner as kt

In [2]:
# Import and read the charity_data.csv
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")
application_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


__Target Variable__: `IS_SUCCESSFUL`

__Feature Variables__: All other columns

In [3]:
# Drop the non-beneficial ID columns, 'EIN'.
application_df = application_df.drop(columns=['EIN'])

In [4]:
# Determine the number of unique values in each column.
application_df.nunique()

NAME                      19568
APPLICATION_TYPE             17
AFFILIATION                   6
CLASSIFICATION               71
USE_CASE                      5
ORGANIZATION                  4
STATUS                        2
INCOME_AMT                    9
SPECIAL_CONSIDERATIONS        2
ASK_AMT                    8747
IS_SUCCESSFUL                 2
dtype: int64

In [None]:
# For encoding consistency, convert STATUS to Y/N instead of 1/0.
application_df['STATUS'] = application_df['STATUS'].map({
    0: 'N',
    1: 'Y'
})

In [None]:
# Display the data types and non-null count
application_df.info()

In [None]:
# Determine whether there is an imbalance in the target variable
application_df['IS_SUCCESSFUL'].value_counts()

No, there is no apparent imbalance in the target variable.

## Categorical Columns Exploration

In [None]:
# Get all the columns which are categorical
categorical_cols = list(application_df.select_dtypes(include="object").columns)
categorical_cols

### Limit the Number of Unique Values

In [None]:
# Define the upper limit of unique values
max_unique = 10

In [None]:
# Count the distinct values of every categorical column
unique_counts = {
    col: len(application_df[col].unique())
    for col in categorical_cols
}

# Report each column's cardinality
for col, num_unique in unique_counts.items():
    print(f'{col}: {num_unique}')

# Columns whose cardinality exceeds max_unique are candidates for binning
many_unique = [
    col for col, num_unique in unique_counts.items()
    if num_unique > max_unique
]

print(f'\nColumns with >{max_unique} values: {many_unique}')

In [5]:
def limit_unique(df, max_value, columns_to_limit, other_label="Other"):
    """
    Limit the number of unique values in the given columns by binning the
    rarest values into a single catch-all category.

    For each column, the `max_value - 1` most frequent values are kept and
    every value that occurs less often than the least frequent kept value is
    replaced with `other_label`. Values tied with the cutoff count are kept,
    so a column can end up with more than `max_value` unique values when
    there are ties at the cutoff.

    The function is idempotent: an existing `other_label` bucket is ignored
    when ranking, so calling it again on an already-binned column does not
    fold further categories into the bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The listed columns are overwritten in place.
    max_value : int
        Target number of unique values per column, including the catch-all
        bucket. Must be at least 2.
    columns_to_limit : iterable of str
        Names of the columns to bin.
    other_label : str, default "Other"
        Label used for the catch-all bucket.

    Returns
    -------
    None
        The resulting value counts and unique-value count of each column are
        printed as a check that the binning worked.

    Raises
    ------
    ValueError
        If `max_value` is smaller than 2 (no room for a kept value plus the
        catch-all bucket).
    """
    if max_value < 2:
        raise ValueError(f"max_value must be at least 2, got {max_value}")

    # Loop through each column
    for col in columns_to_limit:
        # Get the value counts of the column (sorted most frequent first)
        total_counts = df[col].value_counts()

        # Rank only genuine categories: a bucket left by a previous call must
        # not take up one of the "top" slots, otherwise re-running the cell
        # would push the smallest real category into the bucket.
        ranked_counts = total_counts.drop(other_label, errors="ignore")

        # An empty (e.g. all-null) column has nothing to bin
        if not ranked_counts.empty:
            # Get the top values to retain, not including the catch-all bucket
            top_counts = ranked_counts.iloc[:max_value - 1]

            # The smallest retained count is the cutoff
            cutoff_value = top_counts.iloc[-1]

            # Values strictly rarer than the cutoff get binned
            replace_values = ranked_counts[ranked_counts < cutoff_value].index

            # Replace all rare values in a single vectorised pass
            if len(replace_values) > 0:
                df[col] = df[col].mask(df[col].isin(replace_values), other_label)

        # Check to make sure binning was successful
        print(df[col].value_counts())
        print(f'Number of unique values: {df[col].nunique()}\n')

In [None]:
# Limit the unique values of columns with >10 unique values
limit_unique(application_df, max_unique, many_unique)

In [6]:
# Limit APPLICATION_TYPE and CLASSIFICATION to 10 unique values
limit_unique(application_df, 10, ['APPLICATION_TYPE', 'CLASSIFICATION'])

APPLICATION_TYPE
T3       27037
T4        1542
T6        1216
T5        1173
T19       1065
T8         737
T7         725
T10        528
T9         156
Other      120
Name: count, dtype: int64
Number of unique values: 10

CLASSIFICATION
C1000    17326
C2000     6074
C1200     4837
C3000     1918
C2100     1883
Other      887
C7000      777
C1700      287
C4000      194
C5000      116
Name: count, dtype: int64
Number of unique values: 10



In [7]:
# Limit NAME to 10 unique values
limit_unique(application_df, 10, ['NAME'])

NAME
Other                                              29369
PARENT BOOSTER USA INC                              1260
TOPS CLUB INC                                        765
UNITED STATES BOWLING CONGRESS INC                   700
WASHINGTON STATE UNIVERSITY                          492
AMATEUR ATHLETIC UNION OF THE UNITED STATES INC      408
PTA TEXAS CONGRESS                                   368
SOROPTIMIST INTERNATIONAL OF THE AMERICAS INC        331
ALPHA PHI SIGMA                                      313
TOASTMASTERS INTERNATIONAL                           293
Name: count, dtype: int64
Number of unique values: 10



### Explore the distribution of values

In [None]:
# Show the value distribution of every categorical column that was not binned
for col in categorical_cols:
    if col not in many_unique:
        print(application_df[col].value_counts(), '\n')

In [None]:
# Explore SPECIAL_CONSIDERATIONS among the successful applications
successful_special = application_df['SPECIAL_CONSIDERATIONS'].loc[application_df['IS_SUCCESSFUL'] == 1]

# Absolute counts per value and their share of all successful applications
counts = successful_special.value_counts()
total_count = successful_special.count()
percentages = counts / total_count * 100

print(counts)
percentages

__TEST IDEAS__

- Test whether removing these imbalanced columns will improve performance.
- Test whether one-hot encoding or label encoding will be better.

__Hypothesis__: Although label encoding seems more logical for binary columns, one-hot encoding might work better because it avoids implicitly weighting the '1' value more heavily than the '0' value.

### Explore discrepancy in `INCOME_AMT`
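- No rows fall in the `500000-1M` bracket, so one-hot encoding would leave a gap between `100000-499999` and `1M-5M`.
- Solution: Convert to ordinal encoding, which keeps a rank reserved for the missing bracket.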

In [None]:
# Income brackets listed from lowest to highest; the position in the list is the ordinal score
income_brackets = [
    '0',
    '1-9999',
    '10000-24999',
    '25000-99999',
    '100000-499999',
    '500000-1M',  # No data in this category
    '1M-5M',
    '5M-10M',
    '10M-50M',
    '50M+',
]
income_map = {bracket: score for score, bracket in enumerate(income_brackets)}

# Translate every INCOME_AMT bracket into its ordinal score
application_df['ORDINAL_INCOME_AMT'] = application_df['INCOME_AMT'].map(income_map)

__TEST IDEA__

Explore whether dropping `INCOME_AMT` has any effect on model performance.

### Compare `ASK_AMT` to `INCOME_AMT`

In [None]:
# Display all possible values for INCOME_AMT
application_df['INCOME_AMT'].value_counts()

In [None]:
# Split the bounded values by the hyphen and create new columns
application_df[['LOWER_INCOME', 'UPPER_INCOME']] = application_df['INCOME_AMT'].str.split('-', expand=True)

In [None]:
# Rewrite the "M" (millions) suffix as "e6" in both bounds so they can be parsed as numbers later
for bound_col in ['UPPER_INCOME', 'LOWER_INCOME']:
    application_df[bound_col] = application_df[bound_col].str.replace('M', 'e6')

# Confirm changes
display(application_df[['LOWER_INCOME', 'UPPER_INCOME']].value_counts())
application_df.head()

In [None]:
# Address 'None' values in UPPER_INCOME. If INCOME_AMT is '0', set UPPER_INCOME to 0.
application_df.loc[application_df['LOWER_INCOME'] == '0', 'UPPER_INCOME'] = 0
application_df.head()

In [None]:
# Address INCOME_AMT values of `50M+`: the bracket has no hyphen, so the split
# left UPPER_INCOME empty. Fill it with a large placeholder upper bound.
# NOTE(review): the previous comment said "100M" but the code assigns 1000e6 (1e9).
# The code's value is kept here; confirm which one was intended.
application_df.loc[application_df['INCOME_AMT'] == '50M+', 'UPPER_INCOME'] = 1000e6

# Strip the trailing '+' so that '50e6+' becomes the parseable '50e6'
application_df['LOWER_INCOME'] = application_df['LOWER_INCOME'].replace('50e6+', '50e6')

# Convert both bounds to integers; to_numeric runs first so that
# scientific-notation strings such as '1e6' are parsed before the cast
application_df['LOWER_INCOME'] = pd.to_numeric(application_df['LOWER_INCOME']).astype(int)
application_df['UPPER_INCOME'] = pd.to_numeric(application_df['UPPER_INCOME']).astype(int)

In [None]:
# Flag applications whose requested amount exceeds the lower bound of their income bracket
ask_exceeds_income = application_df['ASK_AMT'] > application_df['LOWER_INCOME']

# Store the flag as Y/N for consistency with the other binary columns
application_df['ASK_VS_INCOME'] = ask_exceeds_income.map({True: 'Y', False: 'N'})

# Display updated dataframe
application_df.head()

In [None]:
application_df['ASK_VS_INCOME'].value_counts()

### AFFILIATION vs ORGANIZATION

In [None]:
application_df['AFFILIATION'].value_counts()

In [None]:
application_df['ORGANIZATION'].value_counts()

In [None]:
application_df['AFFILIATION_ORGANIZATION'] = application_df['AFFILIATION'] + "_" + application_df['ORGANIZATION']
application_df['AFFILIATION_ORGANIZATION'].value_counts()

In [None]:
limit_unique(application_df, max_unique, ['AFFILIATION_ORGANIZATION'])

### AFFILIATION vs USE_CASE

In [None]:
application_df['AFFILIATION_USECASE'] = application_df['AFFILIATION'] + "_" + application_df['USE_CASE']
application_df['AFFILIATION_USECASE'].value_counts()

In [None]:
limit_unique(application_df, max_unique, ['AFFILIATION_USECASE'])

## Encoding

In [None]:
# Check datatypes before encoding
application_df.info()

In [8]:
# Get all the columns which are categorical
categorical_hot = list(application_df.select_dtypes(include="object").columns)
categorical_hot

['NAME',
 'APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS']

In [9]:
# Convert categorical data to numeric with `pd.get_dummies`
encoded_columns = pd.get_dummies(application_df[categorical_hot]).astype(int)
encoded_columns.head()

Unnamed: 0,NAME_ALPHA PHI SIGMA,NAME_AMATEUR ATHLETIC UNION OF THE UNITED STATES INC,NAME_Other,NAME_PARENT BOOSTER USA INC,NAME_PTA TEXAS CONGRESS,NAME_SOROPTIMIST INTERNATIONAL OF THE AMERICAS INC,NAME_TOASTMASTERS INTERNATIONAL,NAME_TOPS CLUB INC,NAME_UNITED STATES BOWLING CONGRESS INC,NAME_WASHINGTON STATE UNIVERSITY,...,INCOME_AMT_1-9999,INCOME_AMT_10000-24999,INCOME_AMT_100000-499999,INCOME_AMT_10M-50M,INCOME_AMT_1M-5M,INCOME_AMT_25000-99999,INCOME_AMT_50M+,INCOME_AMT_5M-10M,SPECIAL_CONSIDERATIONS_N,SPECIAL_CONSIDERATIONS_Y
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Split and Scale the Dataset

In [10]:
# Split our preprocessed data into our features and target arrays
# Isolate the target array
y = application_df['IS_SUCCESSFUL']

# Isolate the feature array: remove the target and the raw categorical columns
# in one non-mutating step (re-running the cell always starts from application_df)
X = application_df.drop(columns=['IS_SUCCESSFUL', *categorical_hot])

# Append the one-hot encoded equivalents of the categorical columns
X = pd.concat([X, encoded_columns], axis=1)

# Split the preprocessed data into a training and testing dataset.
# stratify=y keeps the class ratio equal in both splits; the fixed random_state
# makes the split (and every result derived from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Display the feature array
X.head()

In [11]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-set scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Tuning

In [None]:
# Define the model parameters (module-level settings read by create_model)
# Width of the input layer: one input per feature column
number_input_features = len(X_train.columns)
# Upper bound on the total number of hidden layers (first layer + tuned extra layers)
max_hidden_layers = 3 # lowest value permitted is 2, min. required for DL
# Largest number of neurons the tuner may pick for any hidden layer
max_num_neurons = number_input_features * 2 - 1
# Step size between candidate neuron counts (passed as `step` to hp.Int)
step_count = 5
# Candidate activation functions for the hidden layers
# NOTE(review): 'softmax' and 'exponential' are unusual choices for hidden layers -
# confirm they are intended candidates.
activation_functions = [
    'relu', 'leaky_relu', 'tanh',
    'elu', 'selu', 'exponential',
    'softmax', 'softplus'
]
# Optimiser name used when compiling both the tuned and the final model
optimiser = "Adam"

In [None]:
def create_model(hp):
    """
    Build and compile a binary classifier whose architecture is chosen by the tuner.

    The network always has a first hidden layer plus between 1 and
    `max_hidden_layers - 1` further hidden layers, followed by a single
    sigmoid output neuron. For every hidden layer the tuner picks the number
    of neurons (1 up to `max_num_neurons`, in steps of `step_count`) and the
    activation (one of `activation_functions`).

    Parameters
    ----------
    hp : keras_tuner hyperparameter container
        Object used to register and sample the hyperparameters
        `activation_layer_{i}`, `units_layer_{i}` and `num_layers`.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, the module-level
    `optimiser`, accuracy metric).
    """
    model = tf.keras.models.Sequential()

    # First hidden layer: it also declares the input width
    first_activation = hp.Choice('activation_layer_0', activation_functions)
    first_units = hp.Int(
        'units_layer_0',
        min_value=1,
        max_value=max_num_neurons,
        step=step_count,
    )
    model.add(tf.keras.layers.Dense(
        units=first_units,
        activation=first_activation,
        input_dim=number_input_features,
    ))

    # Number of additional hidden layers, between 1 and max_hidden_layers - 1
    extra_layers = hp.Int('num_layers', 1, max_hidden_layers - 1)

    for layer_idx in range(1, extra_layers + 1):
        # Each additional layer gets its own neuron count and activation
        layer_units = hp.Int(
            f'units_layer_{layer_idx}',
            min_value=1,
            max_value=max_num_neurons,
            step=step_count,
        )
        layer_activation = hp.Choice(f'activation_layer_{layer_idx}', activation_functions)

        model.add(tf.keras.layers.Dense(
            units=layer_units,
            activation=layer_activation,
        ))

    # Single sigmoid neuron for the binary target
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    model.compile(
        loss="binary_crossentropy",
        optimizer=optimiser,
        metrics=["accuracy"],
    )

    return model

In [None]:
# Initialise the Hyperband tuner: candidate models are built by create_model and
# ranked by validation accuracy, training each for at most 20 epochs.
# NOTE(review): no `seed`, `directory`, `project_name` or `overwrite` is passed -
# presumably trials are not reproducible and results saved by an earlier run in the
# default project directory may be reloaded; confirm against the KerasTuner docs.
tuner = kt.Hyperband(
    create_model,
    objective = "val_accuracy",
    max_epochs = 20,
    hyperband_iterations = 2
)

In [None]:
# Find the best hyperparameters.
# Validation uses a held-out slice of the TRAINING data instead of the test set:
# the test set is used for the final evaluation, so selecting hyperparameters on
# it would make the reported test accuracy optimistically biased.
tuner.search(
    X_train_scaled,
    y_train.to_numpy(),  # plain NumPy array so Keras can slice off the validation portion
    epochs = 20,
    validation_split = 0.2
)

In [None]:
# Get the hyperparameter sets of the three best trials
top3_hyper = tuner.get_best_hyperparameters(3)

# Show the chosen values of each set
for hyperparameters in top3_hyper:
    pprint(hyperparameters.values)

# Compile, Train, and Evaluate the Best Model

In [None]:
best_hyper = top3_hyper[0]
best_hyper.values

In [None]:
# Define the model parameters
number_input_features = len(X_train.columns)
hidden_layer0_neurons = best_hyper.values['units_layer_0']
hidden_layer0_activation = best_hyper.values['activation_layer_0']

# Return the number of hidden layers
total_hidden = best_hyper.values['num_layers'] + 1

In [None]:
# Build the final model by passing the winning hyperparameters back through
# create_model instead of re-assembling the layers by hand. This guarantees the
# final network has exactly the architecture the tuner evaluated and cannot
# drift if create_model is changed later.
# create_model also compiles the model (binary cross-entropy, `optimiser`, accuracy).
nn = create_model(best_hyper)

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(
    loss = "binary_crossentropy",
    optimizer = optimiser,
    metrics = ["accuracy"]
)

In [None]:
# Train the model
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    epochs = 50,
    verbose = 1
)

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(
    X_test_scaled,
    y_test,
    verbose = 2
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Summarise the architecture selected by the tuner
print(f"Input features: {number_input_features}")
print(f"Hidden Layer 0: {hidden_layer0_activation}, {hidden_layer0_neurons}")

# Remaining hidden layers are read straight from the best hyperparameter set
for layer in range(1, total_hidden):
    neurons = best_hyper.values[f'units_layer_{layer}']
    activation = best_hyper.values[f'activation_layer_{layer}']
    print(f"Hidden Layer {layer}: {activation}, {neurons}")
    