In [35]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline

In [36]:
df = pd.read_csv('Churn_Modelling.csv')
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4228,4229,15691061,Rapuokwu,740,France,Female,37,9,0.0,2,1,1,73225.31,0
7927,7928,15713426,Hancock,637,Germany,Male,30,1,122185.53,1,1,0,102566.46,1
3585,3586,15741745,Lane,757,France,Male,28,7,120911.75,2,1,1,131249.46,0
7143,7144,15761158,Y?an,719,France,Female,54,7,0.0,2,1,1,125041.52,0
6362,6363,15814750,Ricci,629,Spain,Male,34,8,0.0,2,1,1,180595.02,0


In [37]:
df.Exited.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [38]:
df.drop(['RowNumber','CustomerId','Surname'],axis='columns',inplace=True)

In [39]:
df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [40]:
df.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [41]:
df.shape

(10000, 11)

In [42]:
def print_unique_col_values(df):
       for column in df:
            if df[column].dtypes=='object':
                print(f'{column}: {df[column].unique()}') 

In [43]:
print_unique_col_values(df)

Geography: ['France' 'Spain' 'Germany']
Gender: ['Female' 'Male']


In [44]:
df['Geography'].replace({'France':1,'Spain':2,'Germany':3},inplace=True)
df['Gender'].replace({'Male':1,'Female':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Geography'].replace({'France':1,'Spain':2,'Germany':3},inplace=True)
  df['Geography'].replace({'France':1,'Spain':2,'Germany':3},inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].replace({'Male':1,'Female':0},inplace=True)
  df['Gender'].replace({'Ma

In [45]:
df.dtypes

CreditScore          int64
Geography            int64
Gender               int64
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [46]:
cols_to_scale = ['CreditScore', 'Age', 'Tenure','Balance', 'NumOfProducts', 'EstimatedSalary']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

In [47]:
df.sample(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6118,0.564,1,0,0.418919,0.5,0.669446,0.0,0,0,0.701015,0
5872,0.616,2,1,0.040541,0.7,0.0,0.333333,0,1,0.771415,0
3445,0.62,3,1,0.135135,0.1,0.471914,0.333333,1,0,0.071394,0
7123,0.744,1,0,0.27027,0.3,0.0,0.333333,0,1,0.839946,0
71,0.926,1,1,0.148649,0.6,0.0,0.0,1,0,0.169728,0


`stratify=y` means that the distribution of the target variable is preserved in the train and test sets. This is useful when the target variable is imbalanced.

In [48]:
df.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

In [49]:
X = df.drop('Exited',axis='columns')
y = df['Exited']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [50]:
X_train.shape

(8000, 10)

In [51]:
y_train.value_counts()

Exited
0    6370
1    1630
Name: count, dtype: int64

In [52]:
y.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [53]:
y_test.value_counts()

Exited
0    1593
1     407
Name: count, dtype: int64

In [54]:
(6370/1630,1593/407)

(3.9079754601226995, 3.914004914004914)

In [55]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report

**Meaning of Weights:**<br>
- `weights[0]`: This is the weight assigned to class 0. A higher weight increases the importance of correctly classifying instances of class 0.
- `weights[1]`: This is the weight assigned to class 1. Similarly, a higher weight increases the importance of correctly classifying instances of class 1

In [56]:
def ANN(X_train, y_train, X_test, y_test, loss, log_dir):
    model = keras.Sequential([
        keras.layers.Dense(10, input_dim=10, activation='relu'),
        keras.layers.Dense(5, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/{log_dir}", histogram_freq=1)

    model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

    
    model.fit(X_train, y_train,verbose=0, epochs=100, callbacks=[tb_callback])

    print(model.evaluate(X_test, y_test))

    y_preds = model.predict(X_test)
    y_preds = np.round(y_preds)

    print("Classification Report: \n", classification_report(y_test, y_preds))

    return y_preds

In [57]:
ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','without_balancing')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8765 - loss: 0.3314
[0.33240747451782227, 0.8675000071525574]
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.96      0.92      1593
           1       0.77      0.50      0.60       407

    accuracy                           0.87      2000
   macro avg       0.83      0.73      0.76      2000
weighted avg       0.86      0.87      0.86      2000



array([[1.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]], dtype=float32)

### Undersampling

In [58]:
count_class_0, count_class_1 = df.Exited.value_counts()

# Divide by class
df_class_0 = df[df['Exited'] == 0]
df_class_1 = df[df['Exited'] == 1]

In [59]:
count_class_0, count_class_1

(7963, 2037)

undersampling class 0

In [60]:
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.Exited.value_counts())

Random under-sampling:
Exited
0    2037
1    2037
Name: count, dtype: int64


In [61]:
X = df_test_under.drop('Exited',axis='columns')
y = df_test_under['Exited']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [62]:
y_train.value_counts()

Exited
1    1630
0    1629
Name: count, dtype: int64

In [63]:
y_test.value_counts()

Exited
0    408
1    407
Name: count, dtype: int64

In [64]:
ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','undersampling')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7721 - loss: 0.4820  
[0.485416054725647, 0.7742331027984619]
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.79      0.75      0.77       408
           1       0.76      0.80      0.78       407

    accuracy                           0.77       815
   macro avg       0.77      0.77      0.77       815
weighted avg       0.77      0.77      0.77       815



array([[1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

### Oversampling

In [65]:
# Over sampling minority class(Exited=1)
df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.Exited.value_counts())

Random over-sampling:
Exited
0    7963
1    7963
Name: count, dtype: int64


In [66]:
X = df_test_over.drop('Exited',axis='columns')
y = df_test_over['Exited']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [67]:
y_train.value_counts(), y_test.value_counts()

(Exited
 0    6370
 1    6370
 Name: count, dtype: int64,
 Exited
 0    1593
 1    1593
 Name: count, dtype: int64)

In [68]:
ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','oversampling')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7635 - loss: 0.4683
[0.44935038685798645, 0.7730696797370911]
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.77      0.79      0.78      1593
           1       0.78      0.76      0.77      1593

    accuracy                           0.77      3186
   macro avg       0.77      0.77      0.77      3186
weighted avg       0.77      0.77      0.77      3186



array([[0.],
       [1.],
       [0.],
       ...,
       [1.],
       [0.],
       [1.]], dtype=float32)

### SMOTE

In [69]:
X = df.drop('Exited',axis='columns')
y = df['Exited']

In [70]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

Exited
1    7963
0    7963
Name: count, dtype: int64

In [71]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=15, stratify=y_sm)

In [72]:
y_train.value_counts(), y_test.value_counts()

(Exited
 0    6370
 1    6370
 Name: count, dtype: int64,
 Exited
 0    1593
 1    1593
 Name: count, dtype: int64)

In [73]:
ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','SMOTE')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7869 - loss: 0.4527
[0.4366530179977417, 0.7969240546226501]
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.78      0.82      0.80      1593
           1       0.81      0.77      0.79      1593

    accuracy                           0.80      3186
   macro avg       0.80      0.80      0.80      3186
weighted avg       0.80      0.80      0.80      3186



array([[0.],
       [1.],
       [1.],
       ...,
       [1.],
       [0.],
       [1.]], dtype=float32)

### Ensemble

In [74]:
def get_train_batch(df_majority, df_minority, start, end):
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)
    X_train = df_train.drop('Exited', axis='columns')
    y_train = df_train.Exited
    return X_train, y_train

In [75]:
X = df.drop('Exited',axis='columns')
y = df['Exited']

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [77]:
y_train.value_counts()

Exited
0    6370
1    1630
Name: count, dtype: int64

model1 --> class1(1495) + class0(0, 1495)

model2 --> class1(1495) + class0(1496, 2990)

model3 --> class1(1495) + class0(2990, 4130)

In [78]:
df2 = X_train.copy()
df2['Exited'] = y_train

In [79]:
df2.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5710,0.856,1,1,0.216216,0.5,0.554265,0.333333,0,0,0.339721,0
3745,0.852,3,0,0.256757,0.1,0.371163,0.333333,1,1,0.980432,0
5429,0.664,1,0,0.405405,0.7,0.0,0.333333,1,0,0.325318,0
551,0.648,3,1,0.391892,0.6,0.426077,0.0,1,1,0.010339,1
8967,0.97,1,1,0.094595,0.7,0.0,0.333333,1,1,0.41723,0


In [80]:
df2_class0 = df2[df2.Exited==0]
df2_class1 = df2[df2.Exited==1]

In [81]:
X_train, y_train = get_train_batch(df2_class0, df2_class1, 0, 1495)
y_pred1=ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','ensemble1')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7469 - loss: 0.5114
[0.5067688226699829, 0.7555000185966492]
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.75      0.83      1593
           1       0.44      0.78      0.56       407

    accuracy                           0.76      2000
   macro avg       0.69      0.76      0.70      2000
weighted avg       0.83      0.76      0.78      2000



In [82]:
X_train, y_train = get_train_batch(df2_class0, df2_class1, 1495, 2990)
y_pred2 = ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','ensemble2')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7329 - loss: 0.5076
[0.5039165616035461, 0.7390000224113464]
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.74      0.82      1593
           1       0.42      0.75      0.54       407

    accuracy                           0.74      2000
   macro avg       0.67      0.74      0.68      2000
weighted avg       0.82      0.74      0.76      2000



In [83]:
X_train, y_train = get_train_batch(df2_class0, df2_class1, 2990, 4130)

y_pred3 = ANN(X_train, y_train, X_test, y_test,'binary_crossentropy','ensemble3')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7276 - loss: 0.5241
[0.5266134142875671, 0.7315000295639038]
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.71      0.81      1593
           1       0.42      0.80      0.55       407

    accuracy                           0.73      2000
   macro avg       0.67      0.76      0.68      2000
weighted avg       0.83      0.73      0.76      2000



In [84]:
len(y_pred1)

2000

In [85]:
y_pred_final = y_pred1.copy()
for i in range(len(y_pred1)):
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i]
    if n_ones>1:
        y_pred_final[i] = 1
    else:
        y_pred_final[i] = 0

In [86]:
cl_rep = classification_report(y_test, y_pred_final)
print(cl_rep)

              precision    recall  f1-score   support

           0       0.93      0.74      0.82      1593
           1       0.44      0.78      0.56       407

    accuracy                           0.75      2000
   macro avg       0.68      0.76      0.69      2000
weighted avg       0.83      0.75      0.77      2000

