# Bank Chrun prediction with Baseline Model 


In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
#read the csv "bankchurners.csv" , drop unknown columns and reveiw the dataframe 
bank_df = pd.read_csv(Path('./Resources/BankChurnersPrimary.csv'))
bank_df = bank_df[bank_df != 'Unkown'].dropna()
bank_df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [3]:
# Drop columns that are not relevant to the dataset
bank_df = bank_df.drop(columns=['Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                                'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
                                "CLIENTNUM"])
bank_df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [4]:
#review the datatype of dataframe
bank_df.dtypes

Attrition_Flag               object
Customer_Age                  int64
Gender                       object
Dependent_count               int64
Education_Level              object
Marital_Status               object
Income_Category              object
Card_Category                object
Months_on_book                int64
Total_Relationship_Count      int64
Months_Inactive_12_mon        int64
Contacts_Count_12_mon         int64
Credit_Limit                float64
Total_Revolving_Bal           int64
Avg_Open_To_Buy             float64
Total_Amt_Chng_Q4_Q1        float64
Total_Trans_Amt               int64
Total_Trans_Ct                int64
Total_Ct_Chng_Q4_Q1         float64
Avg_Utilization_Ratio       float64
dtype: object

In [5]:
#Select the float columns 
numeric_values = list(bank_df.dtypes[bank_df.dtypes == "float64"].index)
numeric_values

['Credit_Limit',
 'Avg_Open_To_Buy',
 'Total_Amt_Chng_Q4_Q1',
 'Total_Ct_Chng_Q4_Q1',
 'Avg_Utilization_Ratio']

In [6]:
#Change the columns from float to interger 
bank_df[numeric_values] = bank_df[numeric_values].astype(int)
print(bank_df.dtypes)

Attrition_Flag              object
Customer_Age                 int64
Gender                      object
Dependent_count              int64
Education_Level             object
Marital_Status              object
Income_Category             object
Card_Category               object
Months_on_book               int64
Total_Relationship_Count     int64
Months_Inactive_12_mon       int64
Contacts_Count_12_mon        int64
Credit_Limit                 int32
Total_Revolving_Bal          int64
Avg_Open_To_Buy              int32
Total_Amt_Chng_Q4_Q1         int32
Total_Trans_Amt              int64
Total_Trans_Ct               int64
Total_Ct_Chng_Q4_Q1          int32
Avg_Utilization_Ratio        int32
dtype: object


In [7]:
bank_df.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691,777,11914,1,1144,42,1,0
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256,864,7392,1,1291,33,3,0
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418,0,3418,2,1887,20,2,0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313,2517,796,1,1171,20,2,0
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716,0,4716,2,816,28,2,0


In [8]:
#Select the columns that are categorical 
categorical_variables = list(bank_df.dtypes[bank_df.dtypes == "object"].index)
categorical_variables

['Attrition_Flag',
 'Gender',
 'Education_Level',
 'Marital_Status',
 'Income_Category',
 'Card_Category']

In [9]:
#import the Onehotencoder 
from sklearn.preprocessing import OneHotEncoder

In [10]:
#create the instance 
enc = OneHotEncoder(sparse=False)

In [11]:
#Encode categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(bank_df[categorical_variables])

In [12]:
# Create a DataFrame with the encoded variables 
encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_variables)
)

In [13]:
encoded_df.head(10)

Unnamed: 0,Attrition_Flag_Attrited Customer,Attrition_Flag_Existing Customer,Gender_F,Gender_M,Education_Level_College,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Create a DataFrame with the columnns containing numerical variables from the original dataset
numerical_variables_df = bank_df.drop(columns = categorical_variables)
numerical_variables_df.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,45,3,39,5,1,3,12691,777,11914,1,1144,42,1,0
1,49,5,44,6,1,2,8256,864,7392,1,1291,33,3,0
2,51,3,36,4,1,0,3418,0,3418,2,1887,20,2,0
3,40,4,34,3,4,1,3313,2517,796,1,1171,20,2,0
4,40,3,21,5,1,0,4716,0,4716,2,816,28,2,0


In [15]:
# Using the Pandas concat function, combine the DataFrames the contain the encoded categorical data and the numerical data
main_bank_df = pd.concat(
    [
        numerical_variables_df,
        encoded_df
    ],
    axis=1
)
main_bank_df.head()

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,45,3,39,5,1,3,12691,777,11914,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,49,5,44,6,1,2,8256,864,7392,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,51,3,36,4,1,0,3418,0,3418,2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,40,4,34,3,4,1,3313,2517,796,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,40,3,21,5,1,0,4716,0,4716,2,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
#change the column data type from float to int for the y variable and didnt classify the target data 
main_bank_df["Attrition_Flag_Attrited Customer"] = main_bank_df["Attrition_Flag_Attrited Customer"].apply(int)

y = main_bank_df["Attrition_Flag_Attrited Customer"]
y[:5]

0    0
1    0
2    0
3    0
4    0
Name: Attrition_Flag_Attrited Customer, dtype: int64

In [17]:
# Define features set X by selecting all columns but Attrition_Yes and Attrition_No
X = main_bank_df.drop(columns=["Attrition_Flag_Attrited Customer", "Attrition_Flag_Existing Customer"])
X.head(5)

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,45,3,39,5,1,3,12691,777,11914,1,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,49,5,44,6,1,2,8256,864,7392,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,51,3,36,4,1,0,3418,0,3418,2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,40,4,34,3,4,1,3313,2517,796,1,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,40,3,21,5,1,0,4716,0,4716,2,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [19]:
# Create a StandardScaler instance, fit the training data and transform the testing data 
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
y_train.value_counts()

0    6373
1    1222
Name: Attrition_Flag_Attrited Customer, dtype: int64

In [21]:
#Create a Random_oversample, fit & transform the training data set 
random_oversampler = RandomOverSampler(random_state=1)
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

In [22]:
y_resampled.value_counts()

0    6373
1    6373
Name: Attrition_Flag_Attrited Customer, dtype: int64

In [23]:
#Create a Adabooster Model, fit the model using the training dataset and predict the testing data 
ada_model = AdaBoostClassifier()
ada_org_model = ada_model.fit(X_train, y_train)
y_ada_org_pred = ada_org_model.predict(X_test)
y_ada_org_pred[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1], dtype=int64)

In [24]:
#Create a Adabooster Model, fit the model using the resampled dataset and predict the testing data 
ada_re_model = AdaBoostClassifier()
ada_resampled_model = ada_re_model.fit(X_resampled, y_resampled)
y_ada_resampled_pred = ada_resampled_model.predict(X_test)
y_ada_resampled_pred[:10]

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1], dtype=int64)

In [25]:
#generate  the confusion matrix for original data
confusion_matrix(y_test, y_ada_org_pred)

array([[2080,   47],
       [  76,  329]], dtype=int64)

In [26]:
#generate  the confusion matrix for resampled data
confusion_matrix(y_test, y_ada_resampled_pred)

array([[1982,  145],
       [  27,  378]], dtype=int64)

In [27]:
#print the classification report using the testing data and original predictions data
print(classification_report(y_test, y_ada_org_pred))


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2127
           1       0.88      0.81      0.84       405

    accuracy                           0.95      2532
   macro avg       0.92      0.90      0.91      2532
weighted avg       0.95      0.95      0.95      2532



In [28]:
#print the classification report using the testing data and resampled predictions data
print(classification_report(y_test, y_ada_resampled_pred))


              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2127
           1       0.72      0.93      0.81       405

    accuracy                           0.93      2532
   macro avg       0.85      0.93      0.89      2532
weighted avg       0.94      0.93      0.94      2532



In [29]:
#review the accuracy score for the original data Adabooster model 
ada_original_accuracy = balanced_accuracy_score(y_test, y_ada_org_pred)
print("Ada Booster Accuracy: {:.2f}%".format(ada_original_accuracy * 100))

Ada Booster Accuracy: 89.51%


In [30]:
#review the accuracy score for the resampled data Adabooster model 
ada_resampled_accuracy = balanced_accuracy_score(y_test, y_ada_resampled_pred)
print("Ada Booster Accuracy: {:.2f}%".format(ada_resampled_accuracy * 100))

Ada Booster Accuracy: 93.26%
