In [27]:
import numpy as np
import pandas as pd
from keras import models
from keras import layers
from keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn import metrics
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
def oneHotEncoded(data_frame, column_name_ls):
    df = data_frame.copy(deep=True)
    for col in column_name_ls:
        df[col] = pd.Categorical(df[col])
        df_dummies = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, df_dummies], axis=1)
        df.drop([col], axis=1, inplace=True)
        
    return df

def build_classifier(units=32, learn_rate=0.001, hidden_layers=1, optimizer='adam'):
    m =  models.Sequential()

    m.add(layers.Dense(units=46, activation='relu', input_shape=(46,)))
    for i in range(hidden_layers):
         m.add(layers.Dense(units=units, activation='relu'))
#             m.add(layers.Dropout(dropout))
    m.add(layers.Dense(units=1, activation='sigmoid'))
#     optimizer = optimizers.Adam(learning_rate=learn_rate)
    m.compile(optimizer=optimizer, loss='binary_crossentropy',
             metrics=['accuracy'])
    return m

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
total_charges_filter = df['TotalCharges'] == ' '
df = df[~total_charges_filter]

categorical_features = ['gender', 'SeniorCitizen', 'Partner',
 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
 'PaymentMethod'] # ~16
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
target = 'Churn'

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

label = df['Churn'].map({'Yes': 1, 'No': 0})

data = oneHotEncoded(df.drop(['customerID','Churn'], axis=1), categorical_features)
data_numpy = data.to_numpy(copy=True)
label_numpy = label.to_numpy(copy=True)

x_train, x_test, y_train, y_test = train_test_split(data_numpy, label_numpy, test_size = 0.2)

In [None]:
classifier = KerasClassifier(build_fn=build_classifier)
parameters ={
            'epochs':[10,20,30],
            'units':[16, 32, 64, 128],
            'batch_size': [32, 64, 128],
            'hidden_layers': [1, 2, 3, 4],
            'optimizer': ['SGD', 'RMSprop', 'Adam']}
# doesn't tune the learning rate because I will use colab to train and now I don't know how to tune learning rate together with optimizer
gridSearch = GridSearchCV(estimator=classifier,
                          param_grid=parameters,
                         n_jobs=-1)

In [None]:
grid_result = gridSearch.fit(train, label, verbose=1)

In [None]:
print("Best: %f using %s"%(grid_result.best_score_,grid_result.best_params_))
means=grid_result.cv_results_['mean_test_score']
stds=grid_result.cv_results_['std_test_score']
params=grid_result.cv_results_['params']
for mean,stdev,param in zip(means,stds,params):
    print("%f (%f) with: %r"%(mean,stdev,param))

### SVM gridsearch

In [9]:
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC

In [18]:
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
svc_grid = GridSearchCV(SVC(), param_grid, refit = True, cv=10) 
  
# fitting the model for grid search 
svc_grid.fit(x_train, y_train) 

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [28]:
# print best parameter after tuning 
# print(grid_svc.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
# print(grid_svc.best_estimator_) 

svc_pred = svc_grid.predict(x_test) 
  
# print classification report 
print(classification_report(y_test, svc_pred))
print("Accuracy:",metrics.accuracy_score(y_test, svc_pred))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      1039
           1       0.67      0.38      0.49       368

    accuracy                           0.79      1407
   macro avg       0.74      0.66      0.68      1407
weighted avg       0.77      0.79      0.77      1407

Accuracy: 0.7882018479033405


### Logistic regression gridsearch

In [15]:
from sklearn.linear_model import LogisticRegression

In [30]:
logistic = LogisticRegression()

penalty = ['l1', 'l2']

C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

lr_grid = GridSearchCV(logistic, hyperparameters, cv=10, verbose=0)
lr_grid.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
  

In [31]:
lr_pred = lr_grid.predict(x_test) 
  
print(classification_report(y_test, lr_pred))
print("Accuracy:",metrics.accuracy_score(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1039
           1       0.65      0.55      0.60       368

    accuracy                           0.81      1407
   macro avg       0.75      0.72      0.73      1407
weighted avg       0.80      0.81      0.80      1407

Accuracy: 0.8059701492537313


### RandomForest gridsearch

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
rf_param = {
    'max_depth': [4,5,6,7,8],
    'n_estimators': [100, 300, 500],
    'criterion' :['gini', 'entropy']
}
rf = RandomForestClassifier()

rf_grid = GridSearchCV(estimator=rf, param_grid=rf_param, cv= 10)
rf_grid.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             ii

In [24]:
rf_grid.best_params_

{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 300}

In [32]:
rf_pred = rf_grid.predict(x_test) 
  
print(classification_report(y_test, rf_pred))
print("Accuracy:",metrics.accuracy_score(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.84      0.91      0.88      1039
           1       0.68      0.51      0.58       368

    accuracy                           0.81      1407
   macro avg       0.76      0.71      0.73      1407
weighted avg       0.80      0.81      0.80      1407

Accuracy: 0.8073916133617626


### DNN

In [33]:
import tensorflow as tf
from tensorflow import keras

In [35]:
EPOCHS = 50
LR = 0.001
model = keras.Sequential([
    keras.layers.Dense(46, activation='relu', input_shape=(46,)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
adam = keras.optimizers.Adam(learning_rate= LR, decay= LR/ EPOCHS)
model.compile(optimizer=adam, loss='binary_crossentropy',
             metrics=['accuracy'])
model.load_weights('best.hdf5')

In [38]:
test_loss, test_acc  = model.evaluate(x_test, y_test, verbose=False)
print('Tested Acc: ', test_acc)
print('Tested Loss: ', test_loss)

Tested Acc:  0.8152097
Tested Loss:  0.4128850566298723
