In [28]:
import pandas as pd
import numpy as np
from pathlib import Path


from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC


In [29]:

starter_df = pd.read_csv(Path("c://users/ajcth/documents/github/bank_churn_project/Resources/BankChurnersPrimary.csv"))

starter_df = starter_df[starter_df != 'Unknown'].dropna()
starter_df.sample(5)


Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
7936,711005583,Existing Customer,37,M,3,High School,Single,$80K - $120K,Blue,24,...,3437.0,1064,2373.0,0.851,5110,76,0.854,0.31,0.000193,0.99981
9049,718006608,Existing Customer,44,F,5,Uneducated,Single,Less than $40K,Blue,36,...,7742.0,1163,6579.0,0.729,9081,100,0.852,0.15,0.000115,0.99988
8630,720935658,Existing Customer,57,F,4,Post-Graduate,Single,Less than $40K,Blue,45,...,3059.0,1744,1315.0,0.614,7709,89,0.561,0.57,0.000236,0.99976
5716,789531633,Existing Customer,53,F,4,Graduate,Single,Less than $40K,Blue,46,...,4117.0,777,3340.0,0.592,5218,88,0.6,0.189,5.8e-05,0.99994
7769,719372583,Existing Customer,57,M,3,Graduate,Single,$120K +,Blue,36,...,17268.0,1197,16071.0,0.637,4077,90,0.698,0.069,7.1e-05,0.99993


In [30]:
#Change a few column names so they display better in the early DF
replacement_column_names = {'Attrition_Flag': 'Status',
        'Customer_Age': 'Age',
        'Education_Level': 'Education',
        'Dependent_count': 'Dependents',
        'Income_Category': 'Income',
        'Card_Category': 'Card_Type',
        'Months_on_book': 'Tenure(month)'
        }
starter_df.rename(columns=replacement_column_names, inplace=True)

In [31]:
#Remove columns that are not understood or irrelevant 
#The Naive Bayes Classifiers give a perfect score, it could be linked to our target(y)
starter_df.drop(columns=[
#'Total_Relationship_Count', #Going to try adding this and StandardScaling
#'Contacts_Count_12_mon', #Going to try adding this and StandardScaling 
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2', #Going to try adding this and StandardScaling 
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', #Going to try adding this and StandardScaling 
#'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', #Going to try adding these and StandardScaling 
'CLIENTNUM' #unnecessary
],
inplace=True
)


In [32]:
#OPTION2
#Here is another option which includes the Graduate data.
starter_df = starter_df.replace({'Education' : {'Uneducated': 0, 'High School': 1, 'College' : 2, 'Graduate' :  3, 'Post-Graduate' : 4, 'Doctorate' : 5}})
starter_df.Education = starter_df.Education.astype(int)



In [33]:
#Replace card categories from Blue/Silver/Gold/Plat to 0/1/2/3/4
starter_df = starter_df.replace({'Card_Type' : {'Blue': 0, 'Silver': 1, 'Gold' : 2, 'Platinum' : 3}})
starter_df.Card_Type = starter_df.Card_Type.astype(int)

#Replace M/F with 0,1
starter_df = starter_df.replace({'Gender' : {'M': 0, 'F': 1}})
starter_df.Gender = starter_df.Gender.astype(int)


#Replace customer's status to 0 if they left and 1 if they stayed
starter_df = starter_df.replace({'Status' : {'Attrited Customer': 0, 'Existing Customer': 1}})



In [34]:

"""
use feature creation to make a ratio between dependents and income sources
consider someone divorced as Single. only 10% of divorced people receive alimony payments
        either a 1 (single/divorced) or a 2 (married)
        divide that by the amount of dependents plus one, which represents how many people you take care of
                i.e. a single person with no dependents only takes care of themself, so they are a 1. 
                """

starter_df = starter_df.replace({'Marital_Status' : {'Divorced': 1, 'Single': 1, 'Married' : 2}})
starter_df['Marital_Dependent_Ratio'] = ((starter_df['Marital_Status'] / (starter_df['Dependents'] + 1)) / 2).round(2)
starter_df.drop(columns=['Marital_Status', 'Dependents'], inplace=True)




In [36]:
#Do feature creation to make a ratio between avg open to buy and credit limit
starter_df['Credit_Usage'] = (starter_df['Avg_Open_To_Buy'] / starter_df['Credit_Limit']).round(2)
starter_df.drop(columns=['Avg_Open_To_Buy', 'Credit_Limit'], inplace=True)
starter_df = starter_df.dropna()

In [37]:
#Do feature creation to make a ratio between transaction amount and trans count
starter_df['Avg_Trans_Value'] = (starter_df['Total_Trans_Amt'] / starter_df['Total_Trans_Ct']).round(2)
starter_df.drop(columns=['Total_Trans_Amt', 'Total_Trans_Ct'], inplace=True)

In [38]:
#Do feature creation to make a ratio between age and tenure
starter_df['Tenure_By_Age'] = (starter_df['Tenure(month)'] / starter_df['Age']).round(2)
starter_df.drop(columns=['Tenure(month)', 'Age'], inplace=True)

In [39]:
#Rank the income as a 0-4 low-high
starter_df = starter_df.replace({'Income' : {'Less than $40K': 0, '$40K - $60K': 1, '$80K - $120K' : 2, '$60K - $80K' : 3, '$120K +' : 4}})
starter_df.Income = starter_df.Income.astype(int)


In [40]:
#Review the DF
starter_df.sample(5)

Unnamed: 0,Status,Gender,Education,Income,Card Type,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,MaritalDependentRatio,CreditUsage,AvgTransValue,TenureByAge
4984,1,1,3,0,0,4,3,3,909,0.772,0.8,0.454,0.25,0.55,67.27,0.75
7435,1,1,1,0,0,5,2,3,1658,0.669,1.0,0.618,0.1,0.38,61.28,0.71
1878,0,0,2,2,0,3,4,4,0,1.014,0.35,0.0,0.17,1.0,38.56,0.87
7029,1,0,1,4,0,4,4,2,982,0.651,0.816,0.106,0.25,0.89,52.24,0.82
2254,1,0,3,1,0,4,1,2,1118,0.643,0.55,0.352,0.5,0.65,64.84,0.59


In [41]:
#Define features(X) and target(y)
y = starter_df['Status']
X = starter_df.drop(columns=['Status'])

In [42]:
#Split features and target in to training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [43]:
#Scale all numerical/float values that don't represent categories

scaler = StandardScaler()

#Use Column Transformer to scale only the numerical/float values that don't represent categories (male/female, married/single/divorced)
col_tran= ColumnTransformer([
('Total_Revolving_Bal_Scaled', scaler, ['Total_Revolving_Bal']),
('Education_Scaled', scaler, ['Education']),
('Income_Scaled', scaler, ['Income']),
('Card_Type_Scaled', scaler, ['Card_Type']),
('Months_Inactive_12_mon_scaled', scaler, ['Months_Inactive_12_mon']),
('Credit_Usage_Scaled', scaler, ['Credit_Usage']),
('Avg_Trans_Value_Scaled', scaler, ['Avg_Trans_Value']),
('Tenure_By_Age_Scaled', scaler, ['Tenure_By_Age']),
('Avg_Util_Ratio_Scaled', scaler, ['Avg_Utilization_Ratio']),
('Total_Relationship_Count_Scaled', scaler, ['Total_Relationship_Count']),
('Contacts_Count_12_mon_scaled', scaler, ['Contacts_Count_12_mon']),
('Total_Amt_Chng_Q4_Q1_scaled', scaler, ['Total_Amt_Chng_Q4_Q1']),
('Total_Ct_Chng_Q4_Q1_scaled', scaler, ['Total_Ct_Chng_Q4_Q1']),
('Marital_Dependent_Ratio_Scaled', scaler, ['Marital_Dependent_Ratio'])
])


X_train = col_tran.fit_transform(X_train)
X_test = col_tran.transform(X_test)


In [44]:
#Use SMOTE to add synthetic data and balance our target feature value count
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [45]:
#Choose a model


clf = XGBClassifier(
n_estimators = 50, 
max_depth = 4, 
objective='binary:logistic'
)


#clf = ExtraTreesClassifier()

#clf = AdaBoostClassifier()
#clf = AdaBoostClassifier(n_estimators=200, random_state=2, learning_rate = 0.2)

#svc = SVC()
#clf = AdaBoostClassifier(base_estimator=svc, algorithm='SAMME')

#clf = BalancedRandomForestClassifier()

#clf = SVC()

#clf = RandomForestClassifier()

In [46]:
#Fit the model on the training data
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [47]:
#Make predictions on the test data
test_predictions = clf.predict(X_test)

In [48]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 92.21%


In [49]:
#Generate a test matrix
test_matrix = confusion_matrix(y_test, test_predictions)
print(test_matrix)

[[ 203   74]
 [  64 1430]]


In [50]:
# Create a testing classifiction report
testing_report = classification_report(y_test, test_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.76      0.73      0.75       277
           1       0.95      0.96      0.95      1494

    accuracy                           0.92      1771
   macro avg       0.86      0.85      0.85      1771
weighted avg       0.92      0.92      0.92      1771



In [51]:
imb_testing_report = classification_report_imbalanced(y_test, test_predictions)
print(imb_testing_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.76      0.73      0.96      0.75      0.84      0.69       277
          1       0.95      0.96      0.73      0.95      0.84      0.72      1494

avg / total       0.92      0.92      0.77      0.92      0.84      0.71      1771

