<a href="https://colab.research.google.com/github/brianna-mitri/deep-learning-challenge/blob/main/charity_nn_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [16]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [1]:
#imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# load the hosted charity CSV into a DataFrame and preview the first rows
charity_csv_url = 'https://static.bc-edx.com/data/dl-1-2/m21/lms/starter/charity_data.csv'
charity_df = pd.read_csv(charity_csv_url)
charity_df.head()

Unnamed: 0,EIN,NAME,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,10520599,BLUE KNIGHTS MOTORCYCLE CLUB,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,10531628,AMERICAN CHESAPEAKE CLUB CHARITABLE TR,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,10547893,ST CLOUD PROFESSIONAL FIREFIGHTERS,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,10553066,SOUTHSIDE ATHLETIC ASSOCIATION,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,10556103,GENETIC RESEARCH INSTITUTE OF THE DESERT,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


## Data Preprocessing
---

### Drop ID columns

In [3]:
# drop id columns (EIN, NAME) by name rather than by position;
# errors='ignore' makes the cell safe to re-run (a second run is a no-op
# instead of silently dropping the next two feature columns)
charity_df = charity_df.drop(columns=['EIN', 'NAME'], errors='ignore')
charity_df.head()

Unnamed: 0,APPLICATION_TYPE,AFFILIATION,CLASSIFICATION,USE_CASE,ORGANIZATION,STATUS,INCOME_AMT,SPECIAL_CONSIDERATIONS,ASK_AMT,IS_SUCCESSFUL
0,T10,Independent,C1000,ProductDev,Association,1,0,N,5000,1
1,T3,Independent,C2000,Preservation,Co-operative,1,1-9999,N,108590,1
2,T5,CompanySponsored,C3000,ProductDev,Association,1,0,N,5000,0
3,T3,CompanySponsored,C2000,Preservation,Trust,1,10000-24999,N,6692,1
4,T3,Independent,C1000,Heathcare,Trust,1,100000-499999,N,142590,1


In [4]:
# inspect the dtype of each remaining column
charity_df.dtypes

Unnamed: 0,0
APPLICATION_TYPE,object
AFFILIATION,object
CLASSIFICATION,object
USE_CASE,object
ORGANIZATION,object
STATUS,int64
INCOME_AMT,object
SPECIAL_CONSIDERATIONS,object
ASK_AMT,int64
IS_SUCCESSFUL,int64


### Grouping smaller category levels (not ASK_AMT, because it is numeric)

In [5]:
# print the frame's (rows, columns) shape, then show the number of unique values in each column
print(charity_df.shape)
charity_df.nunique()

(34299, 10)


Unnamed: 0,0
APPLICATION_TYPE,17
AFFILIATION,6
CLASSIFICATION,71
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


In [6]:
# collect the names of columns that have more than 10 unique values
unique_counts = charity_df.nunique()
long_cols = unique_counts[unique_counts > 10].index.tolist()
long_cols

['APPLICATION_TYPE', 'CLASSIFICATION', 'ASK_AMT']

In [7]:
# function to group infrequent values of a column under the label "Other"
def label_other(index, cutoff_num):
  """Relabel rare values of one `long_cols` column as "Other".

  Args:
    index: position within the global `long_cols` list of the column to process.
    cutoff_num: values that occur fewer than this many times are relabelled.

  Reassigns the affected column on the global `charity_df`; returns None.
  """
  target_col = long_cols[index]
  value_freqs = charity_df[target_col].value_counts()

  # levels whose frequency falls below the cutoff
  is_rare = value_freqs.lt(cutoff_num)
  rare_levels = value_freqs[is_rare].index

  charity_df[target_col] = charity_df[target_col].replace(rare_levels, "Other")

In [8]:
# relabel values whose value_counts() fall below each column's cutoff
# (long_cols position, cutoff): 0 = app type, 1 = classification
for col_position, min_count in [(0, 500), (1, 1500)]:
  label_other(col_position, min_count)

In [9]:
# re-check the unique-value count per column now that rare levels are grouped
charity_df.nunique()

Unnamed: 0,0
APPLICATION_TYPE,9
AFFILIATION,6
CLASSIFICATION,6
USE_CASE,5
ORGANIZATION,4
STATUS,2
INCOME_AMT,9
SPECIAL_CONSIDERATIONS,2
ASK_AMT,8747
IS_SUCCESSFUL,2


### Train/Test split

In [10]:
# separate target; select it by name so the split does not depend on
# IS_SUCCESSFUL happening to be the last column
x_vars = charity_df.drop(columns='IS_SUCCESSFUL')
y_var = charity_df['IS_SUCCESSFUL']

In [11]:
# split features and target into train/test sets with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x_vars,
    y_var,
    random_state=1,
)

### Encode categorical variables & scale numeric

In [12]:
# derive the categorical/numeric column groups from x_train only (avoids data leakage)
cat_cols = [col for col, dtype in x_train.dtypes.items() if dtype == "object"]
num_cols = list(x_train.select_dtypes(include=[np.number]).columns)

In [13]:
# numeric columns: standardize with StandardScaler
num_transformer = StandardScaler()

# categorical columns: one-hot encode into a dense array, dropping each feature's first level
cat_transformer = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',  # prevents errors if test/new data has unforeseen categories
    drop='first',
)

In [14]:
# route each column group through its transformer with ColumnTransformer
preprocessor = ColumnTransformer(
    [
        ('cat', cat_transformer, cat_cols),  # one-hot encode the category cols
        ('num', num_transformer, num_cols),  # scale the numeric cols
    ],
    remainder='drop',  # columns in neither group are dropped
)

In [17]:
# fit the preprocessor on x_train only, then apply the fitted transform to x_test
x_train_processed, x_test_processed = (
    preprocessor.fit_transform(x_train),
    preprocessor.transform(x_test),
)

### Review processed data

In [26]:
# get the output column names produced by the fitted preprocessor
# (each is prefixed with its transformer name, e.g. 'cat__...' / 'num__...')
encoded_feature_names = preprocessor.get_feature_names_out()
encoded_feature_names

array(['cat__APPLICATION_TYPE_T10', 'cat__APPLICATION_TYPE_T19',
       'cat__APPLICATION_TYPE_T3', 'cat__APPLICATION_TYPE_T4',
       'cat__APPLICATION_TYPE_T5', 'cat__APPLICATION_TYPE_T6',
       'cat__APPLICATION_TYPE_T7', 'cat__APPLICATION_TYPE_T8',
       'cat__AFFILIATION_Family/Parent', 'cat__AFFILIATION_Independent',
       'cat__AFFILIATION_National', 'cat__AFFILIATION_Other',
       'cat__AFFILIATION_Regional', 'cat__CLASSIFICATION_C1200',
       'cat__CLASSIFICATION_C2000', 'cat__CLASSIFICATION_C2100',
       'cat__CLASSIFICATION_C3000', 'cat__CLASSIFICATION_Other',
       'cat__USE_CASE_Heathcare', 'cat__USE_CASE_Other',
       'cat__USE_CASE_Preservation', 'cat__USE_CASE_ProductDev',
       'cat__ORGANIZATION_Co-operative', 'cat__ORGANIZATION_Corporation',
       'cat__ORGANIZATION_Trust', 'cat__INCOME_AMT_1-9999',
       'cat__INCOME_AMT_10000-24999', 'cat__INCOME_AMT_100000-499999',
       'cat__INCOME_AMT_10M-50M', 'cat__INCOME_AMT_1M-5M',
       'cat__INCOME_AMT_25000-

In [27]:
# wrap the processed training array in a DataFrame labelled with the encoded feature names
x_train_processed_df = pd.DataFrame(
    data=x_train_processed,
    columns=encoded_feature_names,
)
x_train_processed_df.head()

Unnamed: 0,cat__APPLICATION_TYPE_T10,cat__APPLICATION_TYPE_T19,cat__APPLICATION_TYPE_T3,cat__APPLICATION_TYPE_T4,cat__APPLICATION_TYPE_T5,cat__APPLICATION_TYPE_T6,cat__APPLICATION_TYPE_T7,cat__APPLICATION_TYPE_T8,cat__AFFILIATION_Family/Parent,cat__AFFILIATION_Independent,...,cat__INCOME_AMT_10000-24999,cat__INCOME_AMT_100000-499999,cat__INCOME_AMT_10M-50M,cat__INCOME_AMT_1M-5M,cat__INCOME_AMT_25000-99999,cat__INCOME_AMT_50M+,cat__INCOME_AMT_5M-10M,cat__SPECIAL_CONSIDERATIONS_Y,num__STATUS,num__ASK_AMT
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0108,-0.029571
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0108,-0.029571
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0108,-0.029571
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0108,-0.029571
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0108,-0.029571


## Compile/Train model
---

In [None]:
# function that creates a new sequential model with hyperparameter options
def create_model(hp):
  """Build and compile a binary-classification network from tunable hyperparameters.

  Hyperparameters registered on `hp`:
    - 'activation': activation shared by every hidden layer ('relu', 'tanh' or 'elu')
    - 'num_layers': number of hidden Dense layers (1 to 2)
    - 'units_{i}': width of hidden layer i (16 to 128 in steps of 16)

  Args:
    hp: object exposing `Choice` and `Int` -- presumably a keras_tuner
      HyperParameters instance supplied by the tuner; confirm at the call site.

  Returns:
    A compiled tf.keras Sequential model with one sigmoid output unit,
    binary cross-entropy loss, the adam optimizer and an accuracy metric.

  Note:
    Reads the global `x_train_processed` to size the input layer.
  """
  nn_model = tf.keras.models.Sequential()

  # hidden layers: activation options (registered once and reused by every hidden layer)
  hidden_activation = hp.Choice('activation', ['relu', 'tanh', 'elu'])

  # input shape equal to x features
  input_dim = x_train_processed.shape[1]  #number of features after onehotencoding
  nn_model.add(tf.keras.Input(shape=(input_dim,)))

  # hidden layers: pick from 1 to 2 hidden layers
  num_layers = hp.Int('num_layers', min_value=1, max_value=2)

  # for each hidden layer, tune number of units
  for i in range(num_layers):
    units = hp.Int(f'units_{i}', min_value=16, max_value=128, step=16)
    nn_model.add(tf.keras.layers.Dense(units=units, activation=hidden_activation))

  # output layer (binary classification)
  nn_model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

  # compile the model
  nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return nn_model
