## Preprocessing

In [1]:
# Import our dependencies

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import keras_tuner as kt
import pandas as pd
import seaborn as sns
import tensorflow as tf

In [2]:
pd.options.display.max_rows = 500
pd.options.display.max_columns = 50

In [3]:
def load_dataset():
    """ Reads dataset csv and returns pandas dataframe """
    
    filepath = "Resources/charity_data.csv"

    df = pd.read_csv(filepath, encoding="utf-8", low_memory=False)
    
    return df

In [4]:
def clean_dataset(a_df):
    """ Returns deduped, na-dropped, index-reset dataframe """    
    
    a_df = a_df.drop_duplicates()   
        
    a_df = a_df.dropna()
        
    a_df = a_df.reset_index(drop=True)
    
    return a_df   

In [5]:
def examine_dataset(a_df):
    """ Provides summary info and visualizations of dataset """
    
    print(a_df.info())
           
    # Determine the number of unique values in each column.

    for col in a_df.columns:
        print(f"{col} \n{a_df[col].value_counts()}\n\n")   

In [6]:
df = clean_dataset(load_dataset())

In [7]:
examine_dataset(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34299 entries, 0 to 34298
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   EIN                     34299 non-null  int64 
 1   NAME                    34299 non-null  object
 2   APPLICATION_TYPE        34299 non-null  object
 3   AFFILIATION             34299 non-null  object
 4   CLASSIFICATION          34299 non-null  object
 5   USE_CASE                34299 non-null  object
 6   ORGANIZATION            34299 non-null  object
 7   STATUS                  34299 non-null  int64 
 8   INCOME_AMT              34299 non-null  object
 9   SPECIAL_CONSIDERATIONS  34299 non-null  object
 10  ASK_AMT                 34299 non-null  int64 
 11  IS_SUCCESSFUL           34299 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 3.1+ MB
None
EIN 
10520599     1
626274659    1
630475330    1
630416100    1
630357662    1
            ..
383880377    

In [8]:
# Drop the non-beneficial columns

df = df.drop(columns=["EIN", "NAME", "STATUS", "SPECIAL_CONSIDERATIONS"])

In [9]:
def reduce_cats(a_col, a_cutoff):
    """ Inputs are a series and a cutoff value for 'Other' """
    
    print(f"BEFORE: \n\n{df[a_col].value_counts()}\n\n")

    types_to_replace = (df[a_col].value_counts().loc[lambda x: x < int(a_cutoff)]).keys().tolist()

    for code in types_to_replace:        
        df[a_col] = df[a_col].replace(code, "Other")

    # Check to make sure binning was successful

    print(f"AFTER: \n\n{df[a_col].value_counts()}\n\n")   

In [10]:
reduce_cats("AFFILIATION", 15_000)

reduce_cats("APPLICATION_TYPE", 1000)

reduce_cats("ASK_AMT", 25_000)

reduce_cats("CLASSIFICATION", 1000)

reduce_cats("INCOME_AMT", 3000)

reduce_cats("ORGANIZATION", 10_000)

reduce_cats("USE_CASE", 5000)

BEFORE: 

Independent         18480
CompanySponsored    15705
Family/Parent          64
National               33
Regional               13
Other                   4
Name: AFFILIATION, dtype: int64


AFTER: 

Independent         18480
CompanySponsored    15705
Other                 114
Name: AFFILIATION, dtype: int64


BEFORE: 

T3     27037
T4      1542
T6      1216
T5      1173
T19     1065
T8       737
T7       725
T10      528
T9       156
T13       66
T12       27
T2        16
T25        3
T14        3
T29        2
T15        2
T17        1
Name: APPLICATION_TYPE, dtype: int64


AFTER: 

T3       27037
Other     2266
T4        1542
T6        1216
T5        1173
T19       1065
Name: APPLICATION_TYPE, dtype: int64


BEFORE: 

5000        25398
10478           3
15583           3
63981           3
6725            3
            ...  
5371754         1
30060           1
43091152        1
18683           1
36500179        1
Name: ASK_AMT, Length: 8747, dtype: int64


AFTER: 

5000     2

In [None]:
df.head()

In [11]:
def encode_df(a_df):
    """ Returns one-hot encoded dataframe """
    
    categorical_list = a_df.dtypes[a_df.dtypes == "object"].index.tolist() 
    
    print(f"CATEGORIES FOR EACH CATEGORICAL FEATURE ENCODED:\n\n{a_df[categorical_list].nunique()}\n\n")
    
    concat_list = []
    
    for categorical in categorical_list:
        
        concat_list.append(pd.get_dummies(a_df[categorical], prefix=categorical, prefix_sep='_'))        
    
    concat_list.append(a_df["IS_SUCCESSFUL"])

    return pd.concat(concat_list, axis=1)   

In [12]:
enc_df = encode_df(df)

CATEGORIES FOR EACH CATEGORICAL FEATURE ENCODED:

APPLICATION_TYPE    6
AFFILIATION         3
CLASSIFICATION      6
USE_CASE            3
ORGANIZATION        3
INCOME_AMT          4
ASK_AMT             2
dtype: int64




In [None]:
enc_df

In [13]:
def pre_process(a_df):
    """ Make X,y ... train_test_split ... scale, fit and transform """
    
    # Split our preprocessed data into our features and target arrays

    y = enc_df["IS_SUCCESSFUL"].values
    X = enc_df.drop(["IS_SUCCESSFUL"], axis=1).values

    # Split the preprocessed data into a training and testing dataset

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) 
    
    # Create a StandardScaler instances
    
    scaler = StandardScaler()

    # scaler = MinMaxScaler()

    # Fit the StandardScaler

    X_scaler = scaler.fit(X_train)

    # Scale the data

    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    print(f"TRAIN SCALED SHAPE: {X_train_scaled.shape}")
    print(f"TEST SCALED SHAPE: {X_test_scaled.shape}")

    input_dim = X_train_scaled.shape[1]  
    
    return input_dim, X_train_scaled, X_test_scaled, y_train, y_test

In [14]:
input_dim, X_train_scaled, X_test_scaled, y_train, y_test = pre_process(enc_df)

TRAIN SCALED SHAPE: (25724, 27)
TEST SCALED SHAPE: (8575, 27)


## Compile, Train and Evaluate the Model

In [None]:
def make_nn(input_dim=14, num_layers=2, num_units=5, num_epochs=5):
    """ Makes sequential nn, compiles, fits, saves, and reports on loss and accuracy """    
    
    nn = tf.keras.models.Sequential()

    # First layer 
    
    nn.add(tf.keras.layers.Dense(units=num_units, input_dim=input_dim, activation="relu"))
    
    # Hidden layers
    
    for layer in range(1, num_layers):
        
        nn.add(tf.keras.layers.Dense(units=num_units, activation="relu"))

    # Output layer

    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))
    
    nn.summary()
    
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]) 
    
    fit_model = nn.fit(X_train_scaled, y_train, epochs=num_epochs)   
    
    model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
    
    print(f"\n\nLoss: {model_loss}, Accuracy: {model_accuracy}")    

    nn.save("AlphabetSoupCharity.h5")

In [None]:
make_nn(27, 2, 75, 100)

In [15]:
# Create a method that creates a new Sequential model with hyperparameter options

def create_model(hp):
    
    # Instantiate a Sequential model
    
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers.
    
    activation = hp.Choice('activation', ['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide the number of neurons in first layer and also
    # the activation function. 
    
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=27))

    # Allow kerastuner to decide the number of hidden layers and number of 
    # neurons in each one
    
    for i in range(hp.Int('num_layers', 1, 4)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))
    
    # Define the output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [16]:
def find_best():
    """ Uses keras-tuner to find best model specs """
    
    tuner = kt.Hyperband(
        create_model,
        objective="val_accuracy",
        max_epochs=10,
        hyperband_iterations=2)
    
    # Run the kerastuner search for best hyperparameters

    tuner.search(X_train_scaled, y_train, epochs=20, validation_data=(X_test_scaled, y_test))
    
     # Get best model hyperparameters
 
    best_hyper = tuner.get_best_hyperparameters(1)[0]

    print(best_hyper.values)
    
     # Evaluate best model against full test data
 
    best_model = tuner.get_best_models(1)[0]
    model_loss, model_accuracy = best_model.evaluate(X_test_scaled, y_test, verbose=2)

    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")
    
     # Summarize the best model
 
    print(best_model.summary())

In [17]:
find_best()

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json
INFO:tensorflow:Oracle triggered exit
{'activation': 'tanh', 'first_units': 9, 'num_layers': 4, 'units_0': 7, 'units_1': 1, 'units_2': 9, 'units_3': 7, 'units_4': 1, 'units_5': 3, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0049'}


ValueError: Received incompatible tensor with shape (35, 9) when attempting to restore variable with shape (27, 9) and name layer_with_weights-0/kernel/.ATTRIBUTES/VARIABLE_VALUE.

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn = tf.keras.models.Sequential()

# First hidden layer

nn.add(tf.keras.layers.Dense(units=9, input_dim=35, activation="tanh"))

# Second hidden layer

nn.add(tf.keras.layers.Dense(units=7, activation="tanh"))

# Third hidden layer

nn.add(tf.keras.layers.Dense(units=1, activation="tanh"))

# Fourth hidden layer

nn.add(tf.keras.layers.Dense(units=9, activation="tanh"))

# Fifth hidden layer

nn.add(tf.keras.layers.Dense(units=7, activation="tanh"))


# Output layer

nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model

nn.summary()

# Compile the model

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]) 

# Train the model

fit_model = nn.fit(X_train_scaled, y_train, epochs=50) 

# Evaluate the model using the test data

model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

# Export our model to HDF5 file

nn.save("nn_optimized.h5")