## Develop a Deep Learning Based Churn Prediction Engine


### Import required libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.regularizers import l2, l1
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

#### Load and understand the data

In [None]:
# Read the data
data = pd.read_csv("TelcoChurn.csv")

In [None]:
# Check the data types of the columns
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [None]:
# See the summary statistics
data.describe(include='all')

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043,7043,7043.0,7043,7043,7043.0,7043,7043,7043,7043,...,7043,7043,7043,7043,7043,7043,7043,7043.0,7043.0,7043
unique,7043,2,,2,2,,2,2,3,2,...,2,2,2,2,3,2,4,,6531.0,2
top,6743-HHQPF,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,,20.2,No
freq,1,3555,,3641,4933,,6361,4072,3096,5024,...,4621,4999,4336,4311,3875,4171,2365,,11.0,5174
mean,,,0.162147,,,32.371149,,,,,...,,,,,,,,64.761692,,
std,,,0.368612,,,24.559481,,,,,...,,,,,,,,30.090047,,
min,,,0.0,,,0.0,,,,,...,,,,,,,,18.25,,
25%,,,0.0,,,9.0,,,,,...,,,,,,,,35.5,,
50%,,,0.0,,,29.0,,,,,...,,,,,,,,70.35,,
75%,,,0.0,,,55.0,,,,,...,,,,,,,,89.85,,


In [None]:
# Re-read the data, this time parsing blank (' ') cells as NaN so that
# TotalCharges is loaded as a numeric column instead of object
data = pd.read_csv("TelcoChurn.csv", na_values=' ')

In [None]:
# Display the shape of the data (rows, columns)
data.shape

(7043, 21)

In [None]:
# Display top 5 rows
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
# See the summary statistics
data.describe(include='all')

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043,7043,7043.0,7043,7043,7043.0,7043,7043,7043,7043,...,7043,7043,7043,7043,7043,7043,7043,7043.0,7032.0,7043
unique,7043,2,,2,2,,2,2,3,2,...,2,2,2,2,3,2,4,,,2
top,6743-HHQPF,Male,,No,No,,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,,,No
freq,1,3555,,3641,4933,,6361,4072,3096,5024,...,4621,4999,4336,4311,3875,4171,2365,,,5174
mean,,,0.162147,,,32.371149,,,,,...,,,,,,,,64.761692,2283.300441,
std,,,0.368612,,,24.559481,,,,,...,,,,,,,,30.090047,2266.771362,
min,,,0.0,,,0.0,,,,,...,,,,,,,,18.25,18.8,
25%,,,0.0,,,9.0,,,,,...,,,,,,,,35.5,401.45,
50%,,,0.0,,,29.0,,,,,...,,,,,,,,70.35,1397.475,
75%,,,0.0,,,55.0,,,,,...,,,,,,,,89.85,3794.7375,


In [None]:
# Check the data types of the columns
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [None]:
# Check the number of unique levels in each column
data.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          2
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6530
Churn                  2
dtype: int64

##### Check the distribution of target variable

In [None]:
# Let's see if there is a class imbalance in the target variable
print(data['Churn'].value_counts(normalize = True)*100)

No     73.463013
Yes    26.536987
Name: Churn, dtype: float64


In [None]:
# Check if there are any NA values in the data
data.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

### Observations:

**On Type conversions:**

- Columns like customerID can be removed from the analysis
- The column 'SeniorCitizen' is categorical by nature (1 means 'Yes', 0 means 'No'), so it should be converted to a categorical type.

**On Missingness of data:**
The data has a few missing values (11 rows) in the column TotalCharges.

**On the class imbalance in the target attribute**
Customers who did not churn (about 73%) outnumber those who churned (about 27%), so a slight class imbalance is present.

## Data Pre-processing

In [None]:
# customerID is an identifier (unique for every row), so drop it from the data
data.drop(columns=['customerID'], inplace=True)

### Type Conversions

In [None]:
# Convert 'SeniorCitizen' column into categorical
data['SeniorCitizen']= data['SeniorCitizen'].astype('category')

In [None]:
data.dtypes

gender                object
SeniorCitizen       category
Partner               object
Dependents            object
tenure                 int64
PhoneService          object
MultipleLines         object
InternetService       object
OnlineSecurity        object
OnlineBackup          object
DeviceProtection      object
TechSupport           object
StreamingTV           object
StreamingMovies       object
Contract              object
PaperlessBilling      object
PaymentMethod         object
MonthlyCharges       float64
TotalCharges         float64
Churn                 object
dtype: object

#### Preparing Data for Model building

In [None]:
# Encode the target as integers (No -> 0, Yes -> 1). The explicit cast keeps
# the column numeric on pandas versions where replace() no longer downcasts
# object columns automatically.
data['Churn'] = data['Churn'].replace(to_replace=['No', 'Yes'], value=[0, 1]).astype('int64')
data['Churn'].value_counts(normalize=True)

0    0.73463
1    0.26537
Name: Churn, dtype: float64

### Split the data into train and test sets


In [None]:
# Separate the target from the predictors, then hold out 20% of the rows as a
# test set, stratified on the target so both splits keep the same churn ratio
y = data['Churn']
X = data.drop(columns='Churn')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=111, stratify=y
)
print(X_train.shape, X_test.shape)

(5634, 19) (1409, 19)


In [None]:
# Check the Train and Test data distribution
print("Train data")
print(y_train.value_counts(normalize=True)*100)
print("\n")
print("Test data")
print(y_test.value_counts(normalize=True)*100)

Train data
0    73.464679
1    26.535321
Name: Churn, dtype: float64


Test data
0    73.456352
1    26.543648
Name: Churn, dtype: float64


### Split the attributes into numerical and categorical types

In [None]:
X_train.dtypes

gender                object
SeniorCitizen       category
Partner               object
Dependents            object
tenure                 int64
PhoneService          object
MultipleLines         object
InternetService       object
OnlineSecurity        object
OnlineBackup          object
DeviceProtection      object
TechSupport           object
StreamingTV           object
StreamingMovies       object
Contract              object
PaperlessBilling      object
PaymentMethod         object
MonthlyCharges       float64
TotalCharges         float64
dtype: object

In [None]:
num_attr=X_train.select_dtypes(['int64','float64']).columns
num_attr

Index(['tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [None]:
cat_attr = X_train.select_dtypes(['category', 'object']).columns
cat_attr

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'Contract', 'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

## Imputing missing values with median for numerical attributes

In [None]:
# Learn the per-column medians from the training split only, then use them to
# fill missing numerical values in both the train and test splits
imputer = SimpleImputer(strategy='median').fit(X_train[num_attr])

X_train[num_attr] = imputer.transform(X_train[num_attr])
X_test[num_attr] = imputer.transform(X_test[num_attr])

In [None]:
imputer.statistics_

array([  29.  ,   70.45, 1399.35])

### Imputing missing values with mode for categorical attributes

In [None]:
# Learn the most frequent level of each categorical column from the training
# split only, then use it to fill missing values in both splits
imputer = SimpleImputer(strategy='most_frequent').fit(X_train[cat_attr])

X_train[cat_attr] = imputer.transform(X_train[cat_attr])
X_test[cat_attr] = imputer.transform(X_test[cat_attr])

In [None]:
imputer.statistics_

array(['Male', 0, 'No', 'No', 'Yes', 'No', 'Fiber optic', 'No', 'No',
       'No', 'No', 'No', 'No', 'Month-to-month', 'Yes',
       'Electronic check'], dtype=object)

### Standardizing the numerical attributes and One-hot encoding categorical attributes

In [None]:
# DataFrameMapper maps data frame columns to sklearn transformations:
# every numerical column gets its own StandardScaler and every categorical
# column gets its own OneHotEncoder; df_out=True returns a DataFrame.
scaling_steps = [([col], StandardScaler()) for col in num_attr]
encoding_steps = [([col], OneHotEncoder(handle_unknown='error')) for col in cat_attr]
mapper = DataFrameMapper(scaling_steps + encoding_steps, df_out=True)

In [None]:
# Fit the transformers on the training split only, then apply the fitted
# mapper to the test split
X_train = mapper.fit_transform(X_train)
X_test = mapper.transform(X_test)

In [None]:
X_train.head(10)

NameError: ignored

In [None]:
print(X_train.shape, X_test.shape)

(5634, 39) (1409, 39)


In [None]:
print(y_train.shape, y_test.shape)

(5634,) (1409,)


### Defining Error Metrics

In [None]:
def get_CR_CM(train_actual,train_predicted,test_actual,test_predicted):
    """Print classification reports and confusion matrices for train and test data.

    The output order is: train classification report, test classification
    report, train confusion matrix, test confusion matrix. Each section is
    preceded by a banner. Nothing is returned.

    Parameters
    ----------
    train_actual, test_actual : true labels of the train / test split.
    train_predicted, test_predicted : predicted labels of the train / test split.
    """
    train_report_banner = '''
             ========================================
               CLASSIFICATION REPORT FOR TRAIN DATA
             ========================================
        '''
    test_report_banner = '''
             =========================================
               CLASSIFICATION REPORT FOR TEST DATA
             =========================================
            '''
    train_matrix_banner = '''
 ========================================
   Confusion Matrix FOR TRAIN DATA
 ========================================
            '''
    test_matrix_banner = '''
 =========================================
   Confusion matrix FOR TEST DATA
 =========================================
            '''

    def _section(banner, metric_fn, actual, predicted, **options):
        # Banner first, then the metric; the metric is computed only after
        # the banner has been printed.
        print(banner)
        print(metric_fn(actual, predicted, **options))

    _section(train_report_banner, classification_report, train_actual, train_predicted, digits=4)
    _section(test_report_banner, classification_report, test_actual, test_predicted, digits=4)
    _section(train_matrix_banner, confusion_matrix, train_actual, train_predicted)
    _section(test_matrix_banner, confusion_matrix, test_actual, test_predicted)


In [None]:
def get_ACCURACY_RECALL(train_actual,train_predicted,test_actual,test_predicted):
    """Print accuracy and recall for the train and test predictions.

    The output order is: train accuracy, test accuracy, train recall, test
    recall. Each value is preceded by a banner. Nothing is returned.

    Parameters
    ----------
    train_actual, test_actual : true labels of the train / test split.
    train_predicted, test_predicted : predicted labels of the train / test split.
    """
    train_accuracy_banner = '''
 ========================================
           ACCURACY FOR TRAIN DATA
 ========================================
        '''
    test_accuracy_banner = '''
 =========================================
           ACCURACY FOR TEST DATA
 =========================================
            '''
    train_recall_banner = '''
 ========================================
           RECALL FOR TRAIN DATA
 ========================================
            '''
    test_recall_banner = '''
 =========================================
           RECALL FOR TEST DATA
 =========================================
            '''
    value_indent = "         "

    def _section(banner, metric_fn, actual, predicted):
        # Banner first, then the indented score; the score is computed only
        # after the banner has been printed.
        print(banner)
        print(value_indent, metric_fn(actual, predicted))

    _section(train_accuracy_banner, accuracy_score, train_actual, train_predicted)
    _section(test_accuracy_banner, accuracy_score, test_actual, test_predicted)
    _section(train_recall_banner, recall_score, train_actual, train_predicted)
    _section(test_recall_banner, recall_score, test_actual, test_predicted)


In [None]:
# Column layout of the model comparison table: the model description followed
# by accuracy / recall / precision / F1 for the train and then the test split.
SCORE_COLUMNS = ['Model','Train_Accuracy','Train_Recall','Train_Precision','Train_F1_Score',
                 'Test_Accuracy','Test_Recall','Test_Precision','Test_F1_Score']

scores = pd.DataFrame(columns=SCORE_COLUMNS)

def get_metrics(train_actual,train_predicted,test_actual,test_predicted,model_description,dataframe):
    """Return ``dataframe`` with one extra row of train/test metrics for a model.

    Parameters
    ----------
    train_actual, test_actual : true labels of the train / test split.
    train_predicted, test_predicted : predicted labels of the train / test split.
    model_description : label stored in the 'Model' column of the new row.
    dataframe : score table to extend (normally ``scores``); it is not
        modified in place.

    Returns
    -------
    pandas.DataFrame
        A new table containing the rows of ``dataframe`` followed by the new
        row, with a fresh 0..n-1 index.
    """
    metric_fns = (accuracy_score, recall_score, precision_score, f1_score)
    row = [model_description]
    row += [fn(train_actual, train_predicted) for fn in metric_fns]
    row += [fn(test_actual, test_predicted) for fn in metric_fns]
    new_row = pd.DataFrame([row], columns=SCORE_COLUMNS)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # An empty table is returned as just the new row so that concat does not
    # have to infer dtypes from an empty frame.
    if dataframe.empty:
        return new_row
    return pd.concat([dataframe, new_row], ignore_index=True)

## Building the ANN Model

### Convert dataframe to numpy array to feed into Neural Networks

In [None]:
# Replace the pandas objects with their underlying NumPy arrays before
# feeding them to the neural networks
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.values, y_test.values

In [None]:
print(type(X_train))
print(type(X_test))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [None]:
X_train.shape

(5634, 39)

In [None]:
# Taking input_shape
input_shape = X_train.shape[1]
print(input_shape)

39


# Model 1 (1 hidden layer)

In [None]:
seed = 7
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# One hidden ReLU layer (32 units) followed by a single sigmoid output unit
model = Sequential([
    Dense(32, input_dim=input_shape, kernel_initializer='glorot_normal', activation='relu'),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                1280      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,313
Trainable params: 1,313
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

#### Callbacks - Early Stopping

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', verbose=1, min_delta=0.001)

In [None]:
callbacks=[early_stopping]

In [None]:
model.fit(X_train, y_train,epochs=100,batch_size=64,validation_split=0.2,callbacks=callbacks)

Train on 4507 samples, validate on 1127 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b13941990>

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; threshold the
# sigmoid output at 0.5 to obtain the 0/1 class labels instead.
train_pred_1 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_1 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
get_CR_CM(y_train, train_pred_1, y_test, test_pred_1)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8420    0.9092    0.8743      4139
           1     0.6773    0.5278    0.5932      1495

    accuracy                         0.8080      5634
   macro avg     0.7596    0.7185    0.7338      5634
weighted avg     0.7983    0.8080    0.7997      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8430    0.9082    0.8744      1035
           1     0.6769    0.5321    0.5958       374

    accuracy                         0.8084      1409
   macro avg     0.7600    0.7201    0.7351      1409
weighted avg     0.7989    0.8084    0.8005      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3763  376]
 [ 706  789]]

   Confusion matrix FOR TEST DATA
            
[[940  95]
 [175 199]]


In [None]:
get_ACCURACY_RECALL(y_train, train_pred_1, y_test, test_pred_1)


           ACCURACY FOR TRAIN DATA
        
          0.8079517216897408

           ACCURACY FOR TEST DATA
            
          0.808374733853797

           RECALL FOR TRAIN DATA
            
          0.5277591973244147

           RECALL FOR TEST DATA
            
          0.5320855614973262


In [None]:
scores = get_metrics(y_train,train_pred_1,y_test,test_pred_1,"1 layer model",scores)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808


# Model 2 - Experiment 1 (2 hidden layers)

In [None]:
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two hidden ReLU layers (16 and 12 units) followed by a single sigmoid output unit
model = Sequential([
    Dense(16, input_dim=input_shape, kernel_initializer='glorot_normal', activation='relu'),
    Dense(12, kernel_initializer='glorot_normal', activation='relu'),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 16)                640       
_________________________________________________________________
dense_3 (Dense)              (None, 12)                204       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 13        
Total params: 857
Trainable params: 857
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=50, batch_size=64,validation_split=0.2,callbacks=callbacks)

Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b10083990>

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; threshold the
# sigmoid output at 0.5 to obtain the 0/1 class labels instead.
train_pred_2 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_2 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
get_CR_CM(y_train, train_pred_2, y_test, test_pred_2)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8549    0.8770    0.8658      4139
           1     0.6333    0.5880    0.6098      1495

    accuracy                         0.8003      5634
   macro avg     0.7441    0.7325    0.7378      5634
weighted avg     0.7961    0.8003    0.7979      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8588    0.8754    0.8670      1035
           1     0.6356    0.6016    0.6181       374

    accuracy                         0.8027      1409
   macro avg     0.7472    0.7385    0.7426      1409
weighted avg     0.7995    0.8027    0.8009      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3630  509]
 [ 616  879]]

   Confusion matrix FOR TEST DATA
            
[[906 129]
 [149 225]]


In [None]:
get_ACCURACY_RECALL(y_train, train_pred_2, y_test, test_pred_2)


           ACCURACY FOR TRAIN DATA
        
          0.8003194888178914

           ACCURACY FOR TEST DATA
            
          0.8026969481902059

           RECALL FOR TRAIN DATA
            
          0.5879598662207358

           RECALL FOR TEST DATA
            
          0.6016042780748663


In [None]:
scores = get_metrics(y_train,train_pred_2,y_test,test_pred_2,"2 layer model - experiment 1",scores)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132


# Model 2 - Experiment 2 (2 hidden layers with different number of neurons)

In [None]:
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two hidden ReLU layers (128 and 64 units) followed by a single sigmoid output unit
model = Sequential([
    Dense(128, input_dim=input_shape, kernel_initializer='glorot_normal', activation='relu'),
    Dense(64, kernel_initializer='glorot_normal', activation='relu'),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 128)               5120      
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 13,441
Trainable params: 13,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=50, batch_size=64,validation_split=0.2,callbacks=callbacks)

Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 00013: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b086eb6d0>

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; threshold the
# sigmoid output at 0.5 to obtain the 0/1 class labels instead.
# NOTE: these names overwrite the predictions of "Model 2 - Experiment 1".
train_pred_2 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_2 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
get_CR_CM(y_train, train_pred_2, y_test, test_pred_2)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8428    0.9239    0.8815      4139
           1     0.7129    0.5231    0.6034      1495

    accuracy                         0.8175      5634
   macro avg     0.7779    0.7235    0.7425      5634
weighted avg     0.8084    0.8175    0.8077      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8298    0.9092    0.8677      1035
           1     0.6582    0.4840    0.5578       374

    accuracy                         0.7963      1409
   macro avg     0.7440    0.6966    0.7127      1409
weighted avg     0.7843    0.7963    0.7854      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3824  315]
 [ 713  782]]

   Confusion matrix FOR TEST DATA
            
[[941  94]
 [193 181]]


In [None]:
get_ACCURACY_RECALL(y_train, train_pred_2, y_test, test_pred_2)


           ACCURACY FOR TRAIN DATA
        
          0.8175363862264821

           ACCURACY FOR TEST DATA
            
          0.7963094393186657

           RECALL FOR TRAIN DATA
            
          0.5230769230769231

           RECALL FOR TEST DATA
            
          0.4839572192513369


In [None]:
scores = get_metrics(y_train,train_pred_2,y_test,test_pred_2,"2 layer model - experiment 2",scores)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781


# Model 3 (2 hidden layers with Dropout)

In [None]:
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two hidden ReLU layers (128 and 64 units), each followed by 50% dropout,
# then a single sigmoid output unit
model = Sequential([
    Dense(128, input_dim=input_shape, kernel_initializer='glorot_normal', activation='relu'),
    Dropout(0.5),
    Dense(64, kernel_initializer='glorot_normal', activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 128)               5120      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 65        
Total params: 13,441
Trainable params: 13,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train,
              epochs=50,
          batch_size=64,validation_split=0.2,callbacks=callbacks)

Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 00015: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b0847ed90>

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; threshold the
# sigmoid output at 0.5 to obtain the 0/1 class labels instead.
train_pred_3 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_3 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
get_CR_CM(y_train, train_pred_3, y_test, test_pred_3)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8469    0.9034    0.8742      4139
           1     0.6719    0.5478    0.6035      1495

    accuracy                         0.8090      5634
   macro avg     0.7594    0.7256    0.7389      5634
weighted avg     0.8004    0.8090    0.8024      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8345    0.8966    0.8645      1035
           1     0.6397    0.5080    0.5663       374

    accuracy                         0.7935      1409
   macro avg     0.7371    0.7023    0.7154      1409
weighted avg     0.7828    0.7935    0.7853      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3739  400]
 [ 676  819]]

   Confusion matrix FOR TEST DATA
            
[[928 107]
 [184 190]]


In [None]:
get_ACCURACY_RECALL(y_train, train_pred_3, y_test, test_pred_3)


           ACCURACY FOR TRAIN DATA
        
          0.8090166844160455

           ACCURACY FOR TEST DATA
            
          0.7934705464868701

           RECALL FOR TRAIN DATA
            
          0.5478260869565217

           RECALL FOR TEST DATA
            
          0.5080213903743316


In [None]:
scores = get_metrics(y_train,train_pred_3,y_test,test_pred_3,"2 layer model with dropout",scores)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319


# Model 4 (2 hidden layers with Dropout and L2 regularizer)

In [None]:
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two hidden ReLU layers (64 and 32 units), each followed by 50% dropout,
# then a single sigmoid output unit; only the first hidden layer carries the
# L2 kernel penalty (1e-03)
model = Sequential([
    Dense(64, input_dim=input_shape, kernel_initializer='glorot_normal', activation='relu',
          kernel_regularizer=l2(1e-03)),
    Dropout(0.5),
    Dense(32, kernel_initializer='glorot_normal', activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 64)                2560      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 33        
Total params: 4,673
Trainable params: 4,673
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train,
              epochs=50,
          batch_size=64,validation_split=0.2,callbacks=callbacks)

Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 00030: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b08248850>

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; threshold the
# sigmoid output at 0.5 to obtain the 0/1 class labels instead.
train_pred_5 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_5 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
get_CR_CM(y_train, train_pred_5, y_test, test_pred_5)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8359    0.9229    0.8773      4139
           1     0.7002    0.4983    0.5823      1495

    accuracy                         0.8103      5634
   macro avg     0.7680    0.7106    0.7298      5634
weighted avg     0.7999    0.8103    0.7990      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8293    0.9198    0.8722      1035
           1     0.6820    0.4759    0.5606       374

    accuracy                         0.8020      1409
   macro avg     0.7556    0.6979    0.7164      1409
weighted avg     0.7902    0.8020    0.7895      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3820  319]
 [ 750  745]]

   Confusion matrix FOR TEST DATA
            
[[952  83]
 [196 178]]


In [None]:
# Print accuracy and recall for the train and test predictions
# (get_ACCURACY_RECALL is a helper defined earlier in the notebook).
get_ACCURACY_RECALL(y_train, train_pred_5, y_test, test_pred_5)


           ACCURACY FOR TRAIN DATA
        
          0.8102591409300675

           ACCURACY FOR TEST DATA
            
          0.8019872249822569

           RECALL FOR TRAIN DATA
            
          0.4983277591973244

           RECALL FOR TEST DATA
            
          0.47593582887700536


In [None]:
# Record this model's train/test metrics in the running comparison table,
# then display the table.
scores = get_metrics(
    y_train, train_pred_5,
    y_test, test_pred_5,
    "2 layer model with dropout with L2 regularizer",
    scores,
)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319
4,2 layer model with dropout with L2 regularizer,0.810259,0.498328,0.700188,0.582259,0.801987,0.475936,0.681992,0.56063


# Model 5 (2 hidden layers with l2 regularizer and class_weight)

In [None]:
# Re-seed NumPy and TensorFlow before building the next model so the
# experiment starts from a known random state.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two ReLU hidden layers (128 and 64 units), both L2-regularised and each
# followed by 50% dropout, feeding a single sigmoid output unit.
model = Sequential([
    Dense(128, input_dim=input_shape, activation='relu',
          kernel_initializer='glorot_normal', kernel_regularizer=l2(1e-03)),
    Dropout(0.5),
    Dense(64, activation='relu',
          kernel_initializer='glorot_normal', kernel_regularizer=l2(1e-03)),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the current model.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 128)               5120      
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 65        
Total params: 13,441
Trainable params: 13,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer minimising binary cross-entropy,
# with accuracy reported after every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Same training setup as before, but errors on class 1 are weighted three
# times as heavily as errors on class 0 (0.75 vs 0.25).
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=64,
    class_weight={0:0.25, 1:0.75},
    validation_split=0.2,
    callbacks=callbacks,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 00030: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2ae8fa17d0>

In [None]:
# Turn the sigmoid probabilities into hard 0/1 labels with a 0.5 threshold.
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases; this is the equivalent it performed for a single sigmoid
# output, and it works on every TensorFlow 2.x version.
# NOTE: the `_4` suffix is kept because the following cells use these names.
train_pred_4 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_4 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
# Print the classification reports and confusion matrices for the train and
# test predictions (get_CR_CM is a helper defined earlier in the notebook).
get_CR_CM(y_train, train_pred_4, y_test, test_pred_4)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.9271    0.6878    0.7897      4139
           1     0.4959    0.8502    0.6264      1495

    accuracy                         0.7309      5634
   macro avg     0.7115    0.7690    0.7081      5634
weighted avg     0.8127    0.7309    0.7464      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.9216    0.6928    0.7910      1035
           1     0.4960    0.8369    0.6229       374

    accuracy                         0.7310      1409
   macro avg     0.7088    0.7648    0.7069      1409
weighted avg     0.8086    0.7310    0.7463      1409


   Confusion Matrix FOR TRAIN DATA
            
[[2847 1292]
 [ 224 1271]]

   Confusion matrix FOR TEST DATA
            
[[717 318]
 [ 61 313]]


In [None]:
# Print accuracy and recall for the train and test predictions
# (get_ACCURACY_RECALL is a helper defined earlier in the notebook).
get_ACCURACY_RECALL(y_train, train_pred_4, y_test, test_pred_4)


           ACCURACY FOR TRAIN DATA
        
          0.7309194178203763

           ACCURACY FOR TEST DATA
            
          0.7310149041873669

           RECALL FOR TRAIN DATA
            
          0.8501672240802676

           RECALL FOR TEST DATA
            
          0.8368983957219251


In [None]:
# Record this model's train/test metrics in the running comparison table,
# then display the table.
scores = get_metrics(
    y_train, train_pred_4,
    y_test, test_pred_4,
    "2 layer model with dropout and class weight",
    scores,
)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319
4,2 layer model with dropout with L2 regularizer,0.810259,0.498328,0.700188,0.582259,0.801987,0.475936,0.681992,0.56063
5,2 layer model with dropout and class weight,0.730919,0.850167,0.495903,0.626417,0.731015,0.836898,0.496038,0.622886


# Model 6 (2 hidden layers with l1 regularizer and class_weight)

In [None]:
# Re-seed NumPy and TensorFlow before building the next model so the
# experiment starts from a known random state.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Two ReLU hidden layers (128 and 64 units) with 50% dropout after each;
# only the first hidden layer carries the L1 weight penalty.
model = Sequential([
    Dense(128, input_dim=input_shape, activation='relu',
          kernel_initializer='glorot_normal', kernel_regularizer=l1(0.01)),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_initializer='glorot_normal'),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])


In [None]:
# Configure training: Adam optimizer minimising binary cross-entropy,
# with accuracy reported after every epoch.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Silence all Python warnings for the cells that follow.
warnings.filterwarnings("ignore")

In [None]:
# Train with class 1 errors weighted three times as heavily as class 0
# errors (0.75 vs 0.25), validating on a 20% hold-out of the training rows.
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=64,
    class_weight={0:0.25, 1:0.75},
    validation_split=0.2,
    callbacks=callbacks,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 00020: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2ae8d48090>

In [None]:
# Turn the sigmoid probabilities into hard 0/1 labels with a 0.5 threshold.
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases; this is the equivalent it performed for a single sigmoid
# output, and it works on every TensorFlow 2.x version.
train_pred_6 = (model.predict(X_train) > 0.5).astype("int32")
test_pred_6 = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
# Print the classification reports and confusion matrices for the train and
# test predictions (get_CR_CM is a helper defined earlier in the notebook).
get_CR_CM(y_train, train_pred_6, y_test, test_pred_6)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.9360    0.6219    0.7473      4139
           1     0.4574    0.8823    0.6024      1495

    accuracy                         0.6910      5634
   macro avg     0.6967    0.7521    0.6748      5634
weighted avg     0.8090    0.6910    0.7088      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.9422    0.6145    0.7439      1035
           1     0.4564    0.8957    0.6047       374

    accuracy                         0.6891      1409
   macro avg     0.6993    0.7551    0.6743      1409
weighted avg     0.8133    0.6891    0.7069      1409


   Confusion Matrix FOR TRAIN DATA
            
[[2574 1565]
 [ 176 1319]]

   Confusion matrix FOR TEST DATA
            
[[636 399]
 [ 39 335]]


In [None]:
# Print accuracy and recall for the train and test predictions
# (get_ACCURACY_RECALL is a helper defined earlier in the notebook).
get_ACCURACY_RECALL(y_train, train_pred_6, y_test, test_pred_6)


           ACCURACY FOR TRAIN DATA
        
          0.6909833155839545

           ACCURACY FOR TEST DATA
            
          0.6891412349183819

           RECALL FOR TRAIN DATA
            
          0.8822742474916387

           RECALL FOR TEST DATA
            
          0.8957219251336899


In [None]:
# Record this model's train/test metrics in the running comparison table,
# then display the table.
scores = get_metrics(
    y_train, train_pred_6,
    y_test, test_pred_6,
    "2 layer model with dropout with L1 regularizer",
    scores,
)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319
4,2 layer model with dropout with L2 regularizer,0.810259,0.498328,0.700188,0.582259,0.801987,0.475936,0.681992,0.56063
5,2 layer model with dropout and class weight,0.730919,0.850167,0.495903,0.626417,0.731015,0.836898,0.496038,0.622886
6,2 layer model with dropout with L1 regularizer,0.690983,0.882274,0.457351,0.602421,0.689141,0.895722,0.456403,0.604693


# Model 7

## MLP using features from AutoEncoders

In [None]:
# Width of the bottleneck layer (sizes noted by the author: 8, 12, 16, 20).
encoding_dim = 16

# Symbolic input tensor with one entry per input feature.
input_data = Input(shape=(input_shape,))

# Encoder: compress the input down to `encoding_dim` ReLU activations.
encoded = Dense(units=encoding_dim, activation='relu')(input_data)

# Decoder: lossy reconstruction of the input from the encoded representation.
# NOTE(review): a sigmoid output is bounded to (0, 1); if X_train contains
# standardised columns they cannot be reconstructed outside that range - confirm.
decoded = Dense(units=input_shape, activation='sigmoid')(encoded)

# End-to-end model that maps an input to its reconstruction.
autoencoder = Model(inputs=input_data, outputs=decoded)

In [None]:
# Train the autoencoder to minimise mean squared reconstruction error with Adam.
autoencoder.compile(loss='mse', optimizer='adam')

In [None]:
# Fit the autoencoder to reconstruct its own input. Validation (and therefore
# any early stopping triggered through `callbacks`) uses a 20% hold-out of the
# training data, as in the other models, so the test set stays unseen until
# the final evaluation instead of deciding when training stops.
autoencoder.fit(X_train, X_train,
                epochs=50,
                batch_size=64,
                shuffle=True,
                validation_split=0.2, callbacks=callbacks)

Train on 5634 samples, validate on 1409 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 00036: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2b084f8f90>

In [None]:
# Encoder-only model: shares the trained layers of the autoencoder and
# outputs the bottleneck representation for a given input.
encoder = Model(input_data, encoded)

In [None]:
# Project both splits into the encoder's bottleneck feature space.
x_train_encoded, x_test_encoded = encoder.predict(X_train), encoder.predict(X_test)

In [None]:
# Sanity check: the second dimension should equal encoding_dim.
x_train_encoded.shape

(5634, 16)

In [None]:
# MLP classifier on the encoded features: 64 and 32 ReLU units, then a
# single sigmoid output. No dropout or weight penalty is used here.
model = Sequential([
    Dense(64, input_dim=encoding_dim, activation='relu',
          kernel_initializer='glorot_normal'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid', kernel_initializer='glorot_normal'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the current model.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 64)                1088      
_________________________________________________________________
dense_23 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 33        
Total params: 3,201
Trainable params: 3,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer minimising binary cross-entropy,
# with accuracy reported after every epoch.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the classifier on the encoded features for at most 50 epochs,
# validating on a 20% hold-out of the training rows.
model.fit(
    x_train_encoded,
    y_train,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
    callbacks=callbacks,
)

Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 00016: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2ae8b766d0>

In [None]:
# Turn the sigmoid probabilities into hard 0/1 labels with a 0.5 threshold.
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases; this is the equivalent it performed for a single sigmoid
# output, and it works on every TensorFlow 2.x version.
train_pred_7 = (model.predict(x_train_encoded) > 0.5).astype("int32")
test_pred_7 = (model.predict(x_test_encoded) > 0.5).astype("int32")

In [None]:
# Print the classification reports and confusion matrices for the train and
# test predictions (get_CR_CM is a helper defined earlier in the notebook).
get_CR_CM(y_train, train_pred_7, y_test, test_pred_7)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.8583    0.8780    0.8680      4139
           1     0.6393    0.5987    0.6183      1495

    accuracy                         0.8039      5634
   macro avg     0.7488    0.7383    0.7432      5634
weighted avg     0.8002    0.8039    0.8018      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.8500    0.8705    0.8601      1035
           1     0.6160    0.5749    0.5947       374

    accuracy                         0.7921      1409
   macro avg     0.7330    0.7227    0.7274      1409
weighted avg     0.7879    0.7921    0.7897      1409


   Confusion Matrix FOR TRAIN DATA
            
[[3634  505]
 [ 600  895]]

   Confusion matrix FOR TEST DATA
            
[[901 134]
 [159 215]]


In [None]:
# Print accuracy and recall for the train and test predictions
# (get_ACCURACY_RECALL is a helper defined earlier in the notebook).
get_ACCURACY_RECALL(y_train, train_pred_7, y_test, test_pred_7)


           ACCURACY FOR TRAIN DATA
        
          0.80386936457224

           ACCURACY FOR TEST DATA
            
          0.7920511000709723

           RECALL FOR TRAIN DATA
            
          0.5986622073578596

           RECALL FOR TEST DATA
            
          0.5748663101604278


In [None]:
# Record this model's train/test metrics in the running comparison table,
# then display the table.
scores = get_metrics(
    y_train, train_pred_7,
    y_test, test_pred_7,
    "autoencoder with 2 layer model",
    scores,
)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319
4,2 layer model with dropout with L2 regularizer,0.810259,0.498328,0.700188,0.582259,0.801987,0.475936,0.681992,0.56063
5,2 layer model with dropout and class weight,0.730919,0.850167,0.495903,0.626417,0.731015,0.836898,0.496038,0.622886
6,2 layer model with dropout with L1 regularizer,0.690983,0.882274,0.457351,0.602421,0.689141,0.895722,0.456403,0.604693
7,autoencoder with 2 layer model,0.803869,0.598662,0.639286,0.618307,0.792051,0.574866,0.616046,0.594744


# Model 8: MLP on autoencoder features with class weight

In [None]:
# Start the class-weight experiment from freshly initialised weights.
# Without rebuilding, the fit that follows would simply continue training the
# classifier already fitted in the previous section, so the two results would
# not be independent. Re-seeding mirrors the earlier model sections.
np.random.seed(seed)
tf.random.set_seed(seed)

# Same architecture as the previous autoencoder-feature classifier.
model = Sequential()
model.add(Dense(64, input_dim = encoding_dim, kernel_initializer='glorot_normal', activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, kernel_initializer='glorot_normal', activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='Adam',
              metrics=['accuracy'])

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 64)                1088      
_________________________________________________________________
dense_23 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_24 (Dense)             (None, 1)                 33        
Total params: 3,201
Trainable params: 3,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the encoded features with class 1 errors weighted three times as
# heavily as class 0 errors (0.75 vs 0.25), validating on a 20% hold-out.
model.fit(
    x_train_encoded,
    y_train,
    batch_size=64,
    epochs=50,
    class_weight={0:0.25, 1:0.75},
    validation_split=0.2,
    callbacks=callbacks,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train on 4507 samples, validate on 1127 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f2ae89ee8d0>

In [None]:
# Turn the sigmoid probabilities into hard 0/1 labels with a 0.5 threshold.
# `Sequential.predict_classes` is deprecated and has been removed from newer
# tf.keras releases; this is the equivalent it performed for a single sigmoid
# output, and it works on every TensorFlow 2.x version.
train_pred_8 = (model.predict(x_train_encoded) > 0.5).astype("int32")
test_pred_8 = (model.predict(x_test_encoded) > 0.5).astype("int32")

In [None]:
# Print the classification reports and confusion matrices for the train and
# test predictions (get_CR_CM is a helper defined earlier in the notebook).
get_CR_CM(y_train, train_pred_8, y_test, test_pred_8)


               CLASSIFICATION REPORT FOR TRAIN DATA
        
              precision    recall  f1-score   support

           0     0.9259    0.6975    0.7956      4139
           1     0.5024    0.8455    0.6303      1495

    accuracy                         0.7368      5634
   macro avg     0.7141    0.7715    0.7130      5634
weighted avg     0.8135    0.7368    0.7518      5634


               CLASSIFICATION REPORT FOR TEST DATA
            
              precision    recall  f1-score   support

           0     0.9248    0.6889    0.7896      1035
           1     0.4953    0.8449    0.6245       374

    accuracy                         0.7303      1409
   macro avg     0.7100    0.7669    0.7070      1409
weighted avg     0.8108    0.7303    0.7458      1409


   Confusion Matrix FOR TRAIN DATA
            
[[2887 1252]
 [ 231 1264]]

   Confusion matrix FOR TEST DATA
            
[[713 322]
 [ 58 316]]


In [None]:
# Print accuracy and recall for the train and test predictions
# (get_ACCURACY_RECALL is a helper defined earlier in the notebook).
get_ACCURACY_RECALL(y_train, train_pred_8, y_test, test_pred_8)


           ACCURACY FOR TRAIN DATA
        
          0.7367767128150515

           ACCURACY FOR TEST DATA
            
          0.730305180979418

           RECALL FOR TRAIN DATA
            
          0.8454849498327759

           RECALL FOR TEST DATA
            
          0.8449197860962567


In [None]:
# Record this model's train/test metrics in the running comparison table,
# then display the table.
scores = get_metrics(
    y_train, train_pred_8,
    y_test, test_pred_8,
    "autoencoder with 2 layer model and with class weight",
    scores,
)
scores

Unnamed: 0,Model,Train_Accuracy,Train_Recall,Train_Precision,Train_F1_Score,Test_Accuracy,Test_Recall,Test_Precision,Test_F1_Score
0,1 layer model,0.807952,0.527759,0.677253,0.593233,0.808375,0.532086,0.676871,0.595808
1,2 layer model - experiment 1,0.800319,0.58796,0.633285,0.609781,0.802697,0.601604,0.635593,0.618132
2,2 layer model - experiment 2,0.817536,0.523077,0.712853,0.603395,0.796309,0.483957,0.658182,0.557781
3,2 layer model with dropout,0.809017,0.547826,0.671862,0.603537,0.793471,0.508021,0.639731,0.566319
4,2 layer model with dropout with L2 regularizer,0.810259,0.498328,0.700188,0.582259,0.801987,0.475936,0.681992,0.56063
5,2 layer model with dropout and class weight,0.730919,0.850167,0.495903,0.626417,0.731015,0.836898,0.496038,0.622886
6,2 layer model with dropout with L1 regularizer,0.690983,0.882274,0.457351,0.602421,0.689141,0.895722,0.456403,0.604693
7,autoencoder with 2 layer model,0.803869,0.598662,0.639286,0.618307,0.792051,0.574866,0.616046,0.594744
8,autoencoder with 2 layer model and with class ...,0.736777,0.845485,0.502385,0.630267,0.730305,0.84492,0.495298,0.624506
