In [139]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.feature_selection import VarianceThreshold, mutual_info_classif, SelectKBest, chi2, SequentialFeatureSelector
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC


In [140]:

starter_df = pd.read_csv(Path("c://users/ajcth/documents/github/bank_churn_model/Resources/BankChurnersCompare.csv"))
starter_df.sample(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6045,6046,15784594,Mazzi,549,Germany,Female,37,1,130622.34,2,1,1,128499.94,0
2924,2925,15792818,Perry,499,Germany,Female,29,6,148051.52,1,1,0,118623.94,0
5062,5063,15650432,Liu,849,Germany,Male,41,10,84622.13,1,1,1,198072.16,0
7474,7475,15799859,Lucchesi,704,France,Male,50,4,165438.26,1,1,0,120770.75,1
4406,4407,15623450,Brown,637,Germany,Female,27,7,135842.89,1,1,1,101418.05,0
2150,2151,15771211,Perkins,668,France,Male,38,10,86977.96,1,0,1,37094.75,0
9740,9741,15572021,Ts'ao,798,Germany,Female,29,8,80204.11,2,1,0,70223.22,0
6109,6110,15710105,Stirling,581,Germany,Female,26,3,105099.45,1,1,1,184520.0,1
4031,4032,15648461,Hs?eh,688,Spain,Male,37,7,138162.41,2,1,1,113926.31,0
3227,3228,15639576,Burns,691,France,Male,26,9,136623.19,1,1,0,153228.0,0


In [141]:
starter_df.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [142]:

#Drop the useless columns
starter_df = starter_df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
#Drop Nan's
starter_df.dropna(inplace=True)

In [143]:
starter_df.dtypes

CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [144]:
enc = OneHotEncoder(sparse=False, drop='if_binary')
categorical_variables = ['Geography', 'Gender']
encoded_data = enc.fit_transform(starter_df[categorical_variables])
encoded_dataframe = pd.DataFrame(encoded_data, columns = enc.get_feature_names(categorical_variables))
starter_df.drop(columns=['Geography', 'Gender'], inplace=True)
starter_df = pd.concat([starter_df, encoded_dataframe.set_axis(starter_df.index)], axis=1)
starter_df.sample(10)




Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Male
8547,689,47,2,118812.5,2,0,0,31121.42,0,0.0,1.0,0.0,1.0
7629,723,68,3,110357.0,1,0,0,141977.54,1,0.0,1.0,0.0,0.0
8467,709,62,3,0.0,2,1,1,82195.15,0,0.0,0.0,1.0,1.0
8555,522,25,1,111432.13,1,1,1,168683.57,0,0.0,1.0,0.0,1.0
6540,571,33,3,71843.15,1,1,0,26772.04,0,0.0,1.0,0.0,1.0
328,671,35,1,144848.74,1,1,1,179012.3,0,0.0,1.0,0.0,1.0
7842,734,43,7,107805.67,1,0,0,182505.68,0,1.0,0.0,0.0,1.0
2006,687,39,7,0.0,2,1,0,26848.25,0,1.0,0.0,0.0,1.0
8522,644,18,8,0.0,2,1,0,59172.42,0,0.0,0.0,1.0,1.0
2521,638,25,4,148045.45,2,1,1,114722.42,0,0.0,1.0,0.0,1.0


In [145]:
X = starter_df.drop(columns= 'Exited')
y = starter_df['Exited']

In [146]:
'''
gbc = GradientBoostingClassifier()
sfs = SequentialFeatureSelector(gbc, n_features_to_select=6, direction='forward')
'''



clf = XGBClassifier(objective='binary:logistic')
sfs = SequentialFeatureSelector(clf, n_features_to_select=6, direction='forward')
sfs.fit(X, y)
X_selection = sfs.transform(X)

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X_selection, y, random_state=1)

In [148]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [149]:
#Use SMOTE to add synthetic data and balance our target feature value count
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [150]:
#Choose a model
'''
clf = GradientBoostingClassifier(
n_estimators=500,
random_state= 2,
)
'''


clf = XGBClassifier(
objective='binary:logistic'
)


#clf = AdaBoostClassifier()
#clf = AdaBoostClassifier(n_estimators=200, random_state=2, learning_rate = 0.2)

#svc = SVC()
#clf = AdaBoostClassifier(base_estimator=svc, algorithm='SAMME')

#clf = BalancedRandomForestClassifier()

#clf = SVC()

#clf = RandomForestClassifier()

In [151]:
#Fit the model on the training data
clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [152]:
#Make predictions on the test data
test_predictions = clf.predict(X_test)

In [153]:
accuracy = accuracy_score(test_predictions, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 83.64%


In [154]:
#Generate a test matrix
test_matrix = confusion_matrix(y_test, test_predictions)
print(test_matrix)

[[1786  194]
 [ 215  305]]


In [155]:
# Create a testing classifiction report
testing_report = classification_report(y_test, test_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.89      0.90      0.90      1980
           1       0.61      0.59      0.60       520

    accuracy                           0.84      2500
   macro avg       0.75      0.74      0.75      2500
weighted avg       0.83      0.84      0.84      2500



In [156]:
results = pd.DataFrame(sfs.get_support())
features = X.columns.values
features = pd.DataFrame(features)

results_df = pd.concat([features, results.set_axis(features.index)], axis=1)
results_df = results_df[results_df != False].dropna()
results_df

Unnamed: 0,0,0.1
1,Age,True
4,NumOfProducts,True
6,IsActiveMember,True
9,Geography_Germany,True
10,Geography_Spain,True
11,Gender_Male,True
