In [None]:
# Import our dependencies

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Functions

In [None]:
def load_dataset(filepath="../Resources/charity_data.csv"):
    """ Reads dataset csv and returns pandas dataframe

    Parameters
    ----------
    filepath : str or path-like, optional
        Location of the csv file. Defaults to the charity dataset path, so
        calling load_dataset() with no argument reads the same file as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the csv.
    """

    # low_memory=False asks pandas not to parse the file in chunks

    return pd.read_csv(filepath, encoding="utf-8", low_memory=False)

In [None]:
def clean_dataset(a_df):
    """ Dedupes rows, drops rows with missing values, removes the EIN and NAME columns, and renumbers the index """

    id_columns = ["EIN", "NAME"]

    # Duplicates and missing values are judged while EIN and NAME are still present;
    # the input dataframe itself is left untouched because every step returns a new frame

    cleaned = (
        a_df
        .drop_duplicates()
        .dropna()
        .drop(columns=id_columns)
        .reset_index(drop=True)
    )

    return cleaned

In [None]:
def examine_dataset(a_df):
    """ Prints dataframe info, then the value counts of each column with more than 10 unique values """

    unique_threshold = 10

    a_df.info()

    print("\n\n")

    # Columns at or below the threshold are skipped; the rest get their full breakdown printed

    for col_name in a_df.columns:

        column = a_df[col_name]

        if column.nunique() <= unique_threshold:
            continue

        print(f"{col_name}\n\n{column.value_counts()}\n\n")

In [None]:
def reduce_cats(a_df, a_col, a_cutoff):
    """ Bins the rare categories of one dataframe column into 'Other', in place

    Parameters
    ----------
    a_df : pandas.DataFrame
        Dataframe holding the column; its column `a_col` is overwritten.
    a_col : str
        Name of the column to bin.
    a_cutoff : int (or any value int() accepts)
        Values occurring fewer than this many times are replaced with "Other".

    Returns
    -------
    None
        The dataframe is modified in place. Value counts are printed before and after.
    """

    counts = a_df[a_col].value_counts()

    print(f"BEFORE: \n\n{counts}\n\n")

    rare_values = counts[counts < int(a_cutoff)].index

    # One pass over the column, instead of one full replace() per rare category

    is_rare = a_df[a_col].isin(rare_values)

    a_df[a_col] = a_df[a_col].where(~is_rare, "Other")

    # Check to make sure binning was successful

    print(f"AFTER: \n\n{a_df[a_col].value_counts()}\n\n")

In [None]:
def encode_df(a_df, target_col="IS_SUCCESSFUL"):
    """ Returns one-hot encoded dataframe

    Every object-dtype column (other than the target) is expanded into dummy
    columns named <column>_<value>, and the target column is appended last.
    Columns that are neither object-dtype nor the target are NOT carried into
    the result.

    Parameters
    ----------
    a_df : pandas.DataFrame
        Dataframe containing the categorical features and the target column.
    target_col : str, optional
        Name of the target column. Defaults to "IS_SUCCESSFUL".

    Returns
    -------
    pandas.DataFrame
        The dummy columns followed by the target column.

    Raises
    ------
    KeyError
        If `target_col` is not a column of `a_df`.
    """

    # Fail before doing any work if the target is missing

    if target_col not in a_df.columns:
        raise KeyError(target_col)

    object_columns = a_df.dtypes[a_df.dtypes == "object"].index.tolist()

    # The target is appended as-is below, so it must never be dummy-encoded as well

    categorical_list = [col for col in object_columns if col != target_col]

    print(f"CATEGORIES FOR EACH CATEGORICAL FEATURE ENCODED:\n\n{a_df[categorical_list].nunique()}\n\n")

    concat_list = [
        pd.get_dummies(a_df[categorical], prefix=categorical, prefix_sep='_')
        for categorical in categorical_list
    ]

    concat_list.append(a_df[target_col])

    return pd.concat(concat_list, axis=1)

In [None]:
def pre_process(a_df):
    """ Separates target from features, splits into train and test sets, and standardizes the features

    Returns the feature count, the scaled train and test features, and the train and test targets
    """

    target_name = "IS_SUCCESSFUL"

    # Features are every column except the target

    features = a_df.drop([target_name], axis=1)
    target = a_df[target_name]

    # Hold out a test set; the fixed random_state keeps the split repeatable

    X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=0)

    # Learn the scaling from the training rows only, then apply it to both sets

    X_scaler = StandardScaler().fit(X_train)

    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    for label, scaled in (("TRAIN", X_train_scaled), ("TEST", X_test_scaled)):
        print(f"{label} SCALED SHAPE: {scaled.shape}")

    return X_train_scaled.shape[1], X_train_scaled, X_test_scaled, y_train, y_test

In [None]:
def make_nn(input_dim=43, num_layers=2, num_units=100, num_epochs=10,
            train_data=None, test_data=None,
            save_path="../Models/AlphabetSoupCharity_Optimization.h5"):
    """ Makes sequential nn, compiles, fits, saves, and reports on loss and accuracy

    Parameters
    ----------
    input_dim : int
        Number of input features given to the first Dense layer.
    num_layers : int
        Number of relu hidden layers. The first one is always added, so
        values below 1 still produce one hidden layer.
    num_units : int
        Units in every hidden layer.
    num_epochs : int
        Number of training epochs.
    train_data : tuple (features, targets), optional
        Data to fit on. When omitted, the notebook-level X_train_scaled and
        y_train are used, which is what this function always did before.
    test_data : tuple (features, targets), optional
        Data to evaluate on. When omitted, the notebook-level X_test_scaled
        and y_test are used.
    save_path : str, optional
        File the fitted model is saved to.

    Returns
    -------
    None
        Loss and accuracy on the test data are printed, and the model is saved.
    """

    # Prefer data passed in explicitly; fall back to the notebook globals only
    # so that existing calls without data arguments keep working

    if train_data is None:
        train_data = (X_train_scaled, y_train)

    if test_data is None:
        test_data = (X_test_scaled, y_test)

    train_features, train_targets = train_data
    test_features, test_targets = test_data

    nn = tf.keras.models.Sequential()

    # First layer

    nn.add(tf.keras.layers.Dense(units=num_units, input_dim=input_dim, activation="relu"))

    # Hidden layers (the first layer above already counts as one of num_layers)

    for _ in range(1, num_layers):

        nn.add(tf.keras.layers.Dense(units=num_units, activation="relu"))

    # Output layer

    nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    nn.summary()

    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    nn.fit(train_features, train_targets, epochs=num_epochs)

    model_loss, model_accuracy = nn.evaluate(test_features, test_targets, verbose=2)

    print(f"\n\nLoss: {model_loss}, Accuracy: {model_accuracy}")

    nn.save(save_path)

# Tinkerings

## (1) Examine correlation heatmap

In [None]:
# Make dataframe and examine

enc_df = encode_df(clean_dataset(load_dataset()))

# Examine correlations from encoded dataframe (computed once and reused below)

corrmat = enc_df.corr()

# Keep only each column's correlation with the target, strongest positive first

target_corr = corrmat[["IS_SUCCESSFUL"]].sort_values(by="IS_SUCCESSFUL", ascending=False)

plt.figure(figsize=(20,50))

ax = sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap="crest")

ax.set_title("Correlation of each encoded column with IS_SUCCESSFUL");

## (2) Random Forest Classifier and feature_importances_

In [None]:
# Preprocess dataframe

input_dim, X_train_scaled, X_test_scaled, y_train, y_test = pre_process(enc_df)

# See what Random Forest offers

clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_train_scaled, y_train)

print(f"RandomForestClassifier Training Score: {clf.score(X_train_scaled, y_train)}")
print(f"RandomForestClassifier Testing Score: {clf.score(X_test_scaled, y_test)}")

# Pair each importance with its feature name. The target column is excluded explicitly
# so the names line up with the columns the model was trained on.

feature_importances = clf.feature_importances_

feature_names = enc_df.drop(columns=["IS_SUCCESSFUL"]).columns

features = sorted(zip(feature_names, feature_importances), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,50)
ax.margins(y=0.01)

ax.barh(y=cols, width=width)

ax.set_title("RandomForestClassifier feature importances")
ax.set_xlabel("Importance")

plt.show()

# Now try with the selected features

sel = SelectFromModel(clf).fit(X_train_scaled, y_train)

# Apply the fitted selector to the existing train and test sets. Re-splitting the training
# rows here would score the model on rows the selector had already been fit on, and would
# overwrite y_train / y_test so they no longer match X_train_scaled / X_test_scaled.
# No second scaling step is needed: the selected columns were already standardized by pre_process.

X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

clf = RandomForestClassifier(random_state=0, n_estimators=200).fit(X_selected_train, y_train)

print(f"SelectFromModel RandomForestClassifier Training Score: {clf.score(X_selected_train, y_train)}")
print(f"SelectFromModel RandomForestClassifier Testing Score: {clf.score(X_selected_test, y_test)}")

## (3) Extra Trees Classifier and feature_importances_

In [None]:
# Make dataframe and examine

enc_df = encode_df(clean_dataset(load_dataset()))

# Preprocess dataframe

input_dim, X_train_scaled, X_test_scaled, y_train, y_test = pre_process(enc_df)

# See what Extra Trees offers (seeded so the scores and importances are repeatable)

model = ExtraTreesClassifier(random_state=0).fit(X_train_scaled, y_train)

print(f"\n\nExtraTreesClassifier Training Score: {model.score(X_train_scaled, y_train)}")
print(f"ExtraTreesClassifier Testing Score: {model.score(X_test_scaled, y_test)}")

# Pair each importance with its feature name. The target column is excluded explicitly
# so the names line up with the columns the model was trained on.

feature_importances = model.feature_importances_

feature_names = enc_df.drop(columns=["IS_SUCCESSFUL"]).columns

features = sorted(zip(feature_names, feature_importances), key = lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10,50)
ax.margins(y=0.01)

ax.barh(y=cols, width=width)

ax.set_title("ExtraTreesClassifier feature importances")
ax.set_xlabel("Importance")

plt.show()

# Now try with the selected features

sel = SelectFromModel(model).fit(X_train_scaled, y_train)

# Apply the fitted selector to the existing train and test sets. Re-splitting the training
# rows here would score the model on rows the selector had already been fit on, and would
# overwrite y_train / y_test so they no longer match X_train_scaled / X_test_scaled.
# No second scaling step is needed: the selected columns were already standardized by pre_process.

X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)

model = ExtraTreesClassifier(random_state=0).fit(X_selected_train, y_train)

print(f"SelectFromModel ExtraTreesClassifier Training Score: {model.score(X_selected_train, y_train)}")
print(f"SelectFromModel ExtraTreesClassifier Testing Score: {model.score(X_selected_test, y_test)}")

# Analysis and new models

- The correlation matrix heatmap revealed a little insight. AFFILIATION_Independent and AFFILIATION_CompanySponsored had the strongest positive and negative correlations with IS_SUCCESSFUL, respectively. 

- Random Forest and Extra Trees Classifiers offered more insight, especially when plotting the feature_importances_ suggested by each. Not only did each classifier get a higher score than my neural networks on both training and testing, but these scores increased slightly when using SelectFromModel and rerunning the classifiers.   

- My takeaway from these tinkerings is that it would be worth rebuilding the original dataframe using what the feature selection revealed, then attempting another neural network model. While I am at it, I might as well rerun the classifiers too, to see whether I have reduced "noise" in the dataset and increased "signal".   

In [None]:
# Reload the csv and clean it again to get a fresh, un-encoded dataframe
new_df = clean_dataset(load_dataset())

In [None]:
# Display the cleaned dataframe
new_df

In [None]:
# Count how often each AFFILIATION value occurs
new_df["AFFILIATION"].value_counts()

In [None]:
# Count how often each APPLICATION_TYPE value occurs
new_df["APPLICATION_TYPE"].value_counts()

In [None]:
# Count how often each INCOME_AMT value occurs
new_df["INCOME_AMT"].value_counts()

In [None]:
# Count how often each ORGANIZATION value occurs
new_df["ORGANIZATION"].value_counts()

In [None]:
# Count how often each USE_CASE value occurs
new_df["USE_CASE"].value_counts()

In [None]:
# Collapse each column's rarer values into "Other", using a per-column count cutoff

category_cutoffs = {
    "AFFILIATION": 15_000,
    "APPLICATION_TYPE": 1000,
    "INCOME_AMT": 3000,
    "ORGANIZATION": 10_000,
    "USE_CASE": 5000,
}

for column_name, cutoff in category_cutoffs.items():
    reduce_cats(new_df, column_name, cutoff)

In [None]:
# Keep only the five binned categorical features plus the target

columns_to_keep = [
    "AFFILIATION",
    "APPLICATION_TYPE",
    "INCOME_AMT",
    "ORGANIZATION",
    "USE_CASE",
    "IS_SUCCESSFUL",
]

new_df = new_df[columns_to_keep]

In [None]:
# Display the dataframe again to check the result of the preceding steps
new_df

In [None]:
# One-hot encode new_df, then display the encoded result

newenc_df = encode_df(new_df)

newenc_df

In [None]:
# Split and scale the newly encoded data; input_dim is the number of scaled feature columns
input_dim, X_train_scaled, X_test_scaled, y_train, y_test = pre_process(newenc_df)

In [None]:
# Use the feature count returned by pre_process instead of a hard-coded width,
# so the network input always matches the encoded data
make_nn(input_dim=input_dim, num_layers=2, num_units=30, num_epochs=50)

In [None]:
# Random Forest on the current train/test split

clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train_scaled, y_train)

rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)

print(f"RandomForestClassifier Training Score: {rf_train_score}")
print(f"RandomForestClassifier Testing Score: {rf_test_score}")

In [None]:
# See what Extra Trees offers (seeded like the Random Forest so the scores are repeatable)

model = ExtraTreesClassifier(random_state=0).fit(X_train_scaled, y_train)

print(f"ExtraTreesClassifier Training Score: {model.score(X_train_scaled, y_train)}")
print(f"ExtraTreesClassifier Testing Score: {model.score(X_test_scaled, y_test)}")