<a href="https://colab.research.google.com/github/brianna-mitri/deep-learning-challenge/blob/main/AlphabetSoupCharity_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m758.9 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [2]:
#imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
import keras_tuner as kt

In [3]:
# read in data into df
charity_df = pd.read_csv('https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv')
charity_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


## Data Preprocessing
---

### Drop ID columns

In [4]:
# drop id columns (EIN, NAME)
charity_df = charity_df.iloc[:, 2:].copy()
charity_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [5]:
# check dtypes
charity_df.dtypes

Unnamed: 0,0
APPLICATION_TYPE,object
AFFILIATION,object
CLASSIFICATION,object
USE_CASE,object
ORGANIZATION,object
STATUS,int64
INCOME_AMT,object
SPECIAL_CONSIDERATIONS,object
ASK_AMT,int64
IS_SUCCESSFUL,int64


### Grouping smaller category levels (not ask amt becaues numeric)

In [6]:
# check number of unique columns and shape
print(charity_df.shape)
charity_df.nunique()

(34299, 10)


Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [7]:
# get list of columns with more than 10 unique values
long_cols = [col for col in list(charity_df.columns) if charity_df[col].nunique() > 10]
long_cols

['APPLICATION_TYPE', 'CLASSIFICATION', 'ASK_AMT']

In [8]:
# function to replace "rare" col values with "other"
def label_other(index, cutoff_num):
  col = long_cols[index]
  cnts = charity_df[col].value_counts()
  others = cnts[cnts < cutoff_num].index

  # replace with other
  charity_df[col] = charity_df[col].replace(others, "Other")

In [9]:
# label other for value_counts() < cutoff_num
label_other(0, 500)  #app type
label_other(1, 1500)  #classification

In [10]:
# check unique count 3now
charity_df.nunique()

Unnamed: 0,0
APPLICATION_TYPE,9
AFFILIATION,6
CLASSIFICATION,6
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


### See distribution of output variable

In [11]:
charity_df['IS_SUCCESSFUL'].value_counts()

Unnamed: 0_level_0,count
IS_SUCCESSFUL,Unnamed: 1_level_1
1,18261
0,16038


### Train/Test split

In [12]:
# separate target
x_vars = charity_df.iloc[:, 0:-1]
y_var = charity_df.iloc[:, -1]

In [13]:
# do train test split
x_train, x_test, y_train, y_test = train_test_split(
    x_vars,
    y_var,
    stratify=y_var,  #preserve the class ratio
    random_state=1
)

### Encode categorical variables & scale numeric

In [14]:
# avoid data leakage and identify column type (categorical/numeric) from x_train
cat_cols = x_vars.select_dtypes(include=['object']).columns.tolist()
cat_cols.append('STATUS')

num_cols = x_train.columns.difference(cat_cols).tolist()

# review cat and num cols
display(cat_cols)
display(num_cols)

['APPLICATION_TYPE',
 'AFFILIATION',
 'CLASSIFICATION',
 'USE_CASE',
 'ORGANIZATION',
 'INCOME_AMT',
 'SPECIAL_CONSIDERATIONS',
 'STATUS']

['ASK_AMT']

In [15]:
# create transformers for numeric (standardscaler) and categorical columns (onehotencoder)
cat_transformer = OneHotEncoder(
    #drop='first',  #not really needed to do in nn
    handle_unknown='ignore', #prevents errors if test/new data has unforseen categories
    sparse_output=False
)

num_transformer = StandardScaler()

In [16]:
# combine transformers with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, cat_cols), #apply onehotencoder to category cols
        ('num', num_transformer, num_cols)  #apply scaling to numeric cols
    ],
    #remainder='passthrough' #keep rest of the columns untransformed
    remainder='drop' #drop rest of the columns
)

In [17]:
# fit/transform on x train and transform x test
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)

### Review processed data

In [18]:
# look into col names
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__APPLICATION_TYPE_Other', 'cat__APPLICATION_TYPE_T10',
       'cat__APPLICATION_TYPE_T19', 'cat__APPLICATION_TYPE_T3',
       'cat__APPLICATION_TYPE_T4', 'cat__APPLICATION_TYPE_T5',
       'cat__APPLICATION_TYPE_T6', 'cat__APPLICATION_TYPE_T7',
       'cat__APPLICATION_TYPE_T8', 'cat__AFFILIATION_CompanySponsored',
       'cat__AFFILIATION_Family/Parent', 'cat__AFFILIATION_Independent',
       'cat__AFFILIATION_National', 'cat__AFFILIATION_Other',
       'cat__AFFILIATION_Regional', 'cat__CLASSIFICATION_C1000',
       'cat__CLASSIFICATION_C1200', 'cat__CLASSIFICATION_C2000',
       'cat__CLASSIFICATION_C2100', 'cat__CLASSIFICATION_C3000',
       'cat__CLASSIFICATION_Other', 'cat__USE_CASE_CommunityServ',
       'cat__USE_CASE_Heathcare', 'cat__USE_CASE_Other',
       'cat__USE_CASE_Preservation', 'cat__USE_CASE_ProductDev',
       'cat__ORGANIZATION_Association', 'cat__ORGANIZATION_Co-operative',
       'cat__ORGANIZATION_Corporation', 'cat__ORGANIZATION_Trust',
       'cat

In [19]:
# look into df version
x_train_processed_df = pd.DataFrame(x_train_processed, columns=encoded_feature_names)
x_train_processed_df.head()

Unnamed: 0,cat__APPLICATION_TYPE_Other,cat__APPLICATION_TYPE_T10,cat__APPLICATION_TYPE_T19,cat__APPLICATION_TYPE_T3,cat__APPLICATION_TYPE_T4,cat__APPLICATION_TYPE_T5,cat__APPLICATION_TYPE_T6,cat__APPLICATION_TYPE_T7,cat__APPLICATION_TYPE_T8,cat__AFFILIATION_CompanySponsored,...,cat__INCOME_AMT_10M-50M,cat__INCOME_AMT_1M-5M,cat__INCOME_AMT_25000-99999,cat__INCOME_AMT_50M+,cat__INCOME_AMT_5M-10M,cat__SPECIAL_CONSIDERATIONS_N,cat__SPECIAL_CONSIDERATIONS_Y,cat__STATUS_0,cat__STATUS_1,num__ASK_AMT
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.013342
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.030891
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.030891
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.030891
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.030891


## Compile/Train model
---

In [20]:
# function that creates a new sequential model with hyperparameter options
def create_model(hp):
  nn_model = tf.keras.models.Sequential()

  # hidden layers: activation options
  #activation = hp.Choice('activation', ['relu', 'tanh', 'elu'])

  # input shape equal to x features
  input_dim = x_train_processed.shape[1]  #number of features after onehotencoding
  nn_model.add(tf.keras.Input(shape=(input_dim,)))

  # hidden layers: pick from 2 to 5 hidden layers
  num_layers = hp.Int('num_layers', min_value=2, max_value=5)

  # hidden layers: activation options
  hidden_activation = hp.Choice('activation', ['relu', 'tanh', 'elu'])

  # for each hidden layer, tune number of units
  for i in range(num_layers):
    #units = hp.Int(f'units_{i}', min_value=16, max_value=128, step=16)
    units = hp.Int(f'units_{i}', min_value=16, max_value=256, step=16)
    nn_model.add(tf.keras.layers.Dense(units=units, activation=hidden_activation))

  # output layer (binary classification)
  nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  # compile the model
  nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return nn_model


In [21]:
# get tuner
tuner = kt.Hyperband(
    create_model,
    objective='val_accuracy',
    max_epochs=20,
    hyperband_iterations=2
)

In [22]:
# run kerastuner to search for best hyperparameters
tuner.search(
    x_train_processed, y_train,
    epochs=20,
    #validation_data=(x_test_processed, y_test)  #keep test data separate (avoid leakage)
    validation_split=0.2,  # 20% of train for validation,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]  #early stopping to prevent overfitting (if val loss doesn't improve after 5 epochs)
)

Trial 60 Complete [00h 00m 26s]
val_accuracy: 0.7278911471366882

Best val_accuracy So Far: 0.7319728136062622
Total elapsed time: 00h 20m 28s


In [23]:
# check results summary
tuner.results_summary()

Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 0026 summary
Hyperparameters:
num_layers: 3
activation: tanh
units_0: 176
units_1: 112
units_2: 64
units_3: 256
units_4: 224
tuner/epochs: 20
tuner/initial_epoch: 0
tuner/bracket: 0
tuner/round: 0
Score: 0.7319728136062622

Trial 0046 summary
Hyperparameters:
num_layers: 3
activation: relu
units_0: 16
units_1: 16
units_2: 96
units_3: 224
units_4: 192
tuner/epochs: 20
tuner/initial_epoch: 7
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0045
Score: 0.7310009598731995

Trial 0040 summary
Hyperparameters:
num_layers: 3
activation: relu
units_0: 208
units_1: 112
units_2: 240
units_3: 256
units_4: 144
tuner/epochs: 3
tuner/initial_epoch: 0
tuner/bracket: 2
tuner/round: 0
Score: 0.7308065891265869

Trial 0014 summary
Hyperparameters:
num_layers: 4
activation: relu
units_0: 128
units_1: 96
units_2: 112
units_3: 48
units_4: 128
tuner/epochs: 7
tuner/initial_epoch: 3
tune

In [24]:
# get best model hp's
best_hp = tuner.get_best_hyperparameters()[0]
best_hp.values

{'num_layers': 3,
 'activation': 'tanh',
 'units_0': 176,
 'units_1': 112,
 'units_2': 64,
 'units_3': 256,
 'units_4': 224,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [25]:
# evaluate best model
best_model = tuner.get_best_models()[0]
model_loss, model_accuracy = best_model.evaluate(x_test_processed, y_test, verbose=2)
print(f'Loss: {model_loss}, Accuracy: {model_accuracy}')

268/268 - 1s - 3ms/step - accuracy: 0.7282 - loss: 0.5536
Loss: 0.5535897016525269, Accuracy: 0.7281632423400879
