# Adding name back as a feature
After discussion with some peers, it seems that adding the name back as a feature improves the results.

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt

#  Import and read the charity_data.csv.
import pandas as pd 
application_df = pd.read_csv("https://static.bc-edx.com/data/dla-1-2/m21/lms/starter/charity_data.csv")

# Drop the non-beneficial ID columns, 'EIN' and 'NAME'.
application_df = application_df.drop(columns=['EIN'])

# Choose a cutoff value of 500 and create a list of application types to be replaced
all_app_types = application_df['APPLICATION_TYPE'].value_counts().to_dict()
application_types_to_replace = [i for i in all_app_types.keys() if all_app_types[i] < 500]

# Replace in dataframe
for app in application_types_to_replace:
    application_df['APPLICATION_TYPE'] = application_df['APPLICATION_TYPE'].replace(app,"Other")

# Choose a cutoff value and create a list of classifications to be replaced
# use the variable name `classifications_to_replace`
all_class_types = application_df['CLASSIFICATION'].value_counts().to_dict()
classifications_to_replace = [i for i in all_class_types.keys() if all_class_types[i] < 1000]

# Replace in dataframe
for cls in classifications_to_replace:
    application_df['CLASSIFICATION'] = application_df['CLASSIFICATION'].replace(cls,"Other")

# Get all categorical data columns into a list
dummies_cols = []

cols = application_df.columns
typs = application_df.dtypes.values

for c in range(len(cols)):
    if (typs[c] == 'object'):
        dummies_cols.append(cols[c])

# Convert categorical data to numeric with `pd.get_dummies`
dummies_df = pd.get_dummies(application_df[dummies_cols])

# Concatenate the DF with the dummy data with the original dataset
concat_df = pd.concat([application_df, dummies_df], axis=1)

# Drop the unused columns
concat_df = concat_df.drop(columns=dummies_cols)

# Split our preprocessed data into our features and target arrays
y = concat_df['IS_SUCCESSFUL']
X = concat_df.drop(columns='IS_SUCCESSFUL')

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Number of features

In [3]:
len(X.columns)

19611

## Model optimisation (with keras tuner)

In [5]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):

    input_dim = 19611


    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','selu','sigmoid','tanh'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=20,
        step=2), activation=activation, input_dim=input_dim))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=20,
            step=2),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [7]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [8]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 60 Complete [00h 01m 07s]
val_accuracy: 0.7969679236412048

Best val_accuracy So Far: 0.8006997108459473
Total elapsed time: 00h 34m 44s


## Optimisation results

In [9]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'sigmoid',
 'first_units': 17,
 'num_layers': 2,
 'units_0': 19,
 'tuner/epochs': 7,
 'tuner/initial_epoch': 3,
 'tuner/bracket': 2,
 'tuner/round': 1,
 'units_1': 1,
 'tuner/trial_id': '0000',
 'units_2': 17,
 'units_3': 15,
 'units_4': 5,
 'units_5': 1}

In [10]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)

print(f"Loss: {100*model_loss:.2f}%")
print(f"Accuracy: {100*model_accuracy:.2f}%")

268/268 - 1s - loss: 0.4876 - accuracy: 0.8007 - 590ms/epoch - 2ms/step
Loss: 48.76%
Accuracy: 80.07%


## Conclusions
Adding the name as a feature drastically improve the accuracy of the model (from ~72% in Step 2 to 80% here.) There are however several problems with this:
- There is now a very large number of features and most of them are related to the name of the company
- Not 

In [18]:
count_names_df = application_df['NAME'].value_counts()==1
count_names_df.values

array([False, False, False, ...,  True,  True,  True])

In [10]:
repeated_name_df = application_df.loc[application_df['NAME'].value_counts()>2,'NAME']
repeated_name_df

IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).