# Your Mission, should you choose to accept it...

To hyperparameter tune and extract every ounce of accuracy out of this telecom customer churn dataset: <https://drive.google.com/file/d/1dfbAsM9DwA7tYhInyflIpZnYs7VT-0AQ/view> 

## Requirements

- Load the data
- Clean the data if necessary (it will be)
- Create and fit a baseline Keras MLP model to the data.
- Hyperparameter tune (at least) the following parameters:
 - batch_size
 - training epochs
 - optimizer
 - learning rate (if applicable to optimizer)
 - momentum (if applicable to optimizer)
 - activation functions
 - network weight initialization
 - dropout regularization
 - number of neurons in the hidden layer
 
 You must use Grid Search and Cross Validation for your initial pass of the above hyperparameters
 
 Try and get the maximum accuracy possible out of this data! You'll save big telecoms millions! Doesn't that sound great?


In [1]:
##### Your Code Here #####
import numpy as np
import pandas as pd

path = './WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
# I'll need to change all this data to numerical

from tensorflow import keras
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


scaler = StandardScaler()
hot = OneHotEncoder()

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(' ', 0))

cleaning_trans = ColumnTransformer(
    [
    ('scaler', StandardScaler(), ['tenure', 'MonthlyCharges', 'TotalCharges']),
    ('hot', OneHotEncoder(), ['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 ])],
    n_jobs=-1, remainder='drop', verbose=True)



In [3]:
from tensorflow import keras

X = df.drop(columns='Churn')
y = df['Churn']
X = cleaning_trans.fit_transform(X)
y = y.map({'Yes': 1, 'No': 0})

In [4]:
X.shape, y.shape

((7043, 7089), (7043,))

In [5]:
# create a baseline model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Random Seed
seed = 2001
np.random.seed(seed)

# Important Hyperparameters
inputs = X.shape[1]
epochs = 50
batch_size = 32

# Create our model
model = Sequential()

# input and hidden
model.add(Dense(32, input_dim = inputs, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='relu'))

#compile
model.compile(loss='binary_crossentropy',
               optimizer = 'adam',
               metrics=['accuracy'])


# Manual Validation Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

W0718 17:20:37.229327 140193323214656 deprecation.py:506] From /home/nedderlander/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [6]:
history = model.fit(X_train, y_train, batch_size=batch_size, epochs = epochs, validation_split=.1, verbose =0)
scores = model.evaluate(X_test, y_test)
print(f'{model.metrics_names[1]}: {scores[1]*100}')

W0718 17:20:37.859617 140193323214656 deprecation.py:323] From /home/nedderlander/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


acc: 75.03545880317688


In [None]:
# now to hyper paramater tune
# I'm going to optomize batch size 

from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Random Seed
seed = 2001
np.random.seed(seed)

# Important Hyperparameters
inputs = X.shape[1]
# epochs = 50
# batch_size = 32

# Create our model function for the kerasclassifier wrapper
def create_model():
    model = Sequential()
    # input and hidden
    model.add(Dense(32, input_dim = inputs, activation='relu'))
#     model.add(Dropout(0.3))
#     model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='relu'))

    #compile
    model.compile(loss='binary_crossentropy',
                   optimizer = 'adam',
                   metrics=['accuracy'])
    return model

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

# create hyper paramaters to optomize
param_grid = {'batch_size': [10, 20, 40, 60, 80, 100],
              'epochs': [20]}

# create a grid search
grid = GridSearchCV(estimator=model, cv=3, param_grid=param_grid, n_jobs=-1, verbose=0)
grid_results = grid.fit(X, y)


In [10]:
# Report Results
print(f"Best: {grid_results.best_score_} using {grid_results.best_params_}")
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}") 

Best: 0.7887263979303148 using {'batch_size': 20, 'epochs': 20}
Means: 0.7557858708364186, Stdev: 0.018813125019112174 with: {'batch_size': 10, 'epochs': 20}
Means: 0.7887263979303148, Stdev: 0.0006968185169448575 with: {'batch_size': 20, 'epochs': 20}
Means: 0.7792133920727239, Stdev: 0.007368712870015184 with: {'batch_size': 40, 'epochs': 20}
Means: 0.7729660501013624, Stdev: 0.003894944610580463 with: {'batch_size': 60, 'epochs': 20}
Means: 0.7767996666073613, Stdev: 0.00731782342681365 with: {'batch_size': 80, 'epochs': 20}
Means: 0.7597614614763467, Stdev: 0.031997156874839595 with: {'batch_size': 100, 'epochs': 20}


In [26]:
import tensorflow
tensorflow.reset_default_graph()

In [None]:
# Define model function for Keras Classifier Object
def create_model():
    # create model
    model = Sequential()
    # Input and First Hidden Layer
    model.add(Dense(32, input_dim=X.shape[1], activation='relu'))
    # Second Hidden Layer
    model.add(Dense(16, activation='relu'))
    # Output Layer
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

    return model

model = KerasClassifier(build_fn=create_model, verbose=1)

# Define the grid search parameters
param_grid = {'batch_size': [10, 40, 80, 100],
              'epochs': np.arange(5,41)}

# Create Grid Search
grid = GridSearchCV(estimator=model, 
                    param_grid=param_grid,
                    cv=5,
                    n_jobs=-1)

grid_result = grid.fit(X, y, verbose=0)

# Report Results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"Means: {mean}, Stdev: {stdev} with: {param}")
    



In [6]:
# 20 is our batch size, now we can try different optomiziers
# Create our model function for the kerasclassifier wrapper
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Important Hyperparameters
inputs = X.shape[1]

def create_model():
    model = Sequential()
    # input and hidden
    model.add(Dense(32, input_dim = inputs, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='relu'))

    #compile
    model.compile(loss='binary_crossentropy',
                   optimizer = 'Adagrad',
                   metrics=['accuracy'])
    return model


optimizers = ['Adam', 'Adagrad', 'Adadelta', 'Adamax']
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']

for opt in optimizers:

    # create model
    model = KerasClassifier(build_fn=create_model(opt), verbose=0)

    # create hyper paramaters to optomize
    param_grid = {'batch_size': [20], #[10, 20, 40, 60, 80, 100],
    'epochs': [20, 15]}

    # create a grid search
    grid = GridSearchCV(estimator=model, cv=3, param_grid=param_grid, n_jobs=-1, verbose=0)
    grid_results = grid.fit(X, y)

    #Report Results
    print(f"Best: {grid_results.best_score_} using {grid_results.best_params_}")
    means = grid_results.cv_results_['mean_test_score']a
    stds = grid_results.cv_results_['std_test_score']
    params = grid_results.cv_results_['params']

    for mean, stdev, param in zip(means, stds, params):
        print(f"Means: {mean}, Stdev: {stdev} with: {param}") 


TypeError: can't pickle _thread.RLock objects

## Stretch Goals:

- Try to implement Random Search Hyperparameter Tuning on this dataset
- Try to implement Bayesian Optimiation tuning on this dataset
- Practice hyperparameter tuning other datasets that we have looked at. How high can you get MNIST? Above 99%?
- Study for the Sprint Challenge
 - Can you implement both perceptron and MLP models from scratch with forward and backpropagation?
 - Can you implement both perceptron and MLP models in keras and tune their hyperparameters with cross validation?