# Report on the Neural Network Model

**Overview of the analysis**: Explain the purpose of this analysis.

**Results**: Using bulleted lists and images to support your answers, address the following questions.

**Data Preprocessing**

- What variable(s) are the target(s) for your model?
- What variable(s) are the features for your model?
- What variable(s) should be removed from the input data because they are neither targets nor features?

**Compiling, Training, and Evaluating the Model**

- How many neurons, layers, and activation functions did you select for your neural network model, and why?
- Were you able to achieve the target model performance?
- What steps did you take in your attempts to increase model performance?

**Summary**: Summarize the overall results of the deep learning model. Include a recommendation for how a different model could solve this classification problem, and then explain your recommendation.

In [None]:
# Import our dependencies

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, normalize, LabelEncoder, OneHotEncoder

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

# Let notebook output show up to 500 rows and 50 columns before pandas truncates it.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 50)

In [None]:
# Load the charity dataset from Resources/charity_data.csv.

df = pd.read_csv("Resources/charity_data.csv")

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the report).
df.info()
df.head()

In [None]:
# Keep only the target (IS_SUCCESSFUL) and the four categorical columns used as
# features. Besides the ID columns 'EIN' and 'NAME', this also discards every
# other column that was read from the CSV.
#
# .copy() makes the subset an independent frame: later cells assign into its
# columns, which would otherwise trigger pandas' SettingWithCopyWarning.

df = df[["AFFILIATION", "APPLICATION_TYPE", "CLASSIFICATION", "IS_SUCCESSFUL", "ORGANIZATION"]].copy()
df

In [None]:
def reduce_cats(a_col, a_cutoff, a_df=None):
    """Bin the rare categories of one column into a single "Other" value, in place.

    Prints the column's value counts before and after the binning so the
    result can be checked by eye.

    Args:
        a_col: Name of the column to bin (a column label, not a Series).
        a_cutoff: Categories occurring fewer than ``int(a_cutoff)`` times are
            replaced by "Other".
        a_df: DataFrame to modify. When omitted, the notebook-level ``df`` is
            used, which is what the original two-argument calls rely on.

    Returns:
        None. The column is overwritten on the dataframe itself.
    """
    target = df if a_df is None else a_df

    counts = target[a_col].value_counts()
    print(f"BEFORE: \n\n{counts}\n\n")

    # Categories below the cutoff. value_counts() skips missing values, so
    # NaN entries are never binned.
    rare = counts[counts < int(a_cutoff)].index

    # One vectorised pass instead of one full-column replace per rare category.
    target[a_col] = target[a_col].mask(target[a_col].isin(rare), "Other")

    # Check to make sure binning was successful
    print(f"AFTER: \n\n{target[a_col].value_counts()}\n\n")

In [None]:
# Bin the rare categories of each categorical column. Every pair is
# (column name, minimum row count a category needs to keep its own label).
for _column, _cutoff in (
    ("APPLICATION_TYPE", 500),
    ("AFFILIATION", 15_000),
    ("CLASSIFICATION", 1800),
    ("ORGANIZATION", 10_000),
):
    reduce_cats(_column, _cutoff)

In [None]:
# List the object-dtype (text) columns, i.e. the ones that still have to be
# converted to numeric dummy columns with `pd.get_dummies`.

alphasoup_cat = [name for name, dtype in df.dtypes.items() if dtype == "object"]
alphasoup_cat

In [None]:
def encode_df(a_df):
    """Return a one-hot encoded copy of ``a_df``.

    Every object-dtype column is expanded with ``pd.get_dummies`` into
    ``<column>_<category>`` columns. The result holds those dummy columns, in
    the original column order, followed by the ``IS_SUCCESSFUL`` column.
    Columns that are neither object-dtype nor ``IS_SUCCESSFUL`` are not
    carried over. The number of distinct categories per encoded column is
    printed first.
    """
    is_object = a_df.dtypes == "object"
    object_columns = a_df.dtypes[is_object].index.tolist()

    print(f"CATEGORIES FOR EACH CATEGORICAL FEATURE ENCODED:\n\n{a_df[object_columns].nunique()}\n\n")

    # One dummy frame per categorical column, with the target appended last.
    pieces = [
        pd.get_dummies(a_df[column], prefix=column, prefix_sep='_')
        for column in object_columns
    ]
    pieces.append(a_df["IS_SUCCESSFUL"])

    return pd.concat(pieces, axis=1)

In [None]:
# Replace df with its encoded form: dummy columns for every object-dtype
# column, followed by the IS_SUCCESSFUL target column (see encode_df).
df = encode_df(df)

In [None]:
# Separate the encoded frame into a feature matrix (every column except the
# target) and a target vector, both as plain arrays.

X = df.drop(columns="IS_SUCCESSFUL").values
y = df["IS_SUCCESSFUL"].values

# Hold out part of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# Scale the features with a MinMaxScaler. The scaler is fitted on the training
# rows only, so nothing about the test rows influences the scaling.

scaler = MinMaxScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Sanity check: both partitions must have the same number of columns.
print(X_train_scaled.shape, X_test_scaled.shape, sep="\n")

In [None]:
# Baseline classifier: a 500-tree random forest trained on the scaled features.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the score on the rows it was trained on and on the held-out rows.
train_accuracy = clf.score(X_train_scaled, y_train)
print(f'Training Score: {train_accuracy}')
test_accuracy = clf.score(X_test_scaled, y_test)
print(f'Testing Score: {test_accuracy}')

In [None]:
# Plot the random forest's feature importances as a horizontal bar chart,
# smallest importance first.

feature_importances = clf.feature_importances_

# The model was trained on every column except the target, so pair the
# importances with exactly those column names. Zipping against df.columns
# directly would only line up while IS_SUCCESSFUL happens to be the last column.
feature_names = df.columns.drop("IS_SUCCESSFUL")

features = sorted(zip(feature_names, feature_importances), key=lambda x: x[1])
cols = [f[0] for f in features]
width = [f[1] for f in features]

fig, ax = plt.subplots()

fig.set_size_inches(10, 200)
plt.margins(y=0.001)

ax.barh(y=cols, width=width)

plt.show()

### AFFILIATION, ORGANIZATION, APPLICATION_TYPE

In [None]:
# Select features by random-forest importance (SelectFromModel with its
# default threshold), fitted on the training rows only.
sel = SelectFromModel(clf)
sel.fit(X_train_scaled, y_train)

# Reuse the existing train/test split instead of splitting again:
#   * a second train_test_split with another random_state would rebind
#     y_train / y_test, so that they no longer match X_train_scaled /
#     X_test_scaled, which the neural network cells below train on;
#   * it would also move rows the selector was fitted on into the new test
#     set, leaking test information into the feature selection.
X_selected_train = sel.transform(X_train)
X_selected_test = sel.transform(X_test)

# Standardise the selected columns, fitting the scaler on the training rows only.
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

# Retrain the forest on the reduced feature set and compare the scores with
# the full-feature baseline.
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

In [None]:
# Define the model - a deep neural net with two hidden layers and a single
# sigmoid output unit.

# Take the input width from the data the network is trained on rather than
# hard-coding it: the number of dummy columns changes whenever the binning
# cutoffs change.
n_input_features = X_train_scaled.shape[1]

nn = tf.keras.models.Sequential()

# First hidden layer

nn.add(tf.keras.layers.Dense(units=60, input_dim=n_input_features, activation="sigmoid"))

# Second hidden layer

nn.add(tf.keras.layers.Dense(units=30, activation="sigmoid"))

# Output layer

nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Compile the model: binary cross-entropy loss (the output layer is a single
# sigmoid unit), the Adam optimizer, and accuracy as the reported metric.

nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]) 

In [None]:
# Train the model for 10 epochs on the scaled training features; the value
# returned by fit() is kept as fit_model.
# NOTE(review): X_train_scaled and y_train must originate from the same
# train/test split so that rows and labels line up - confirm before training.

fit_model = nn.fit(X_train_scaled, y_train, epochs=10) 

In [None]:
# Evaluate the model using the test data. The result is unpacked as the loss
# followed by the accuracy metric requested when the model was compiled.

model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Export the trained model to an HDF5 file; the relative path places
# nn_optimized.h5 in the current working directory.

nn.save("nn_optimized.h5")