In [385]:
# for Exited, 1 means they exited
# for gender, male is 0 and female is 1


#Consider creating a combined column for exited and active member
    #-2 if exited and active, -1 if exited and inactive, 1 if stayed and inactive, 2 if stayed and active

#clean outlier data from monthly salary (scale or remove?)

#need to do either undersampling or oversampling, about 8000 users stayed while only 2000 left

#standardscaler or create new features from the features (salary, balance, age, credit score, tenure)

# separate data by region? looks like most data is from france. Much less likely for french customers to leave.
# i.e. the leave data for all three countries is similar but the stay data is much higher in france.

#MY ASSUMPTIONS:
#The data was collected at two points in a year. First point gave balance, salary, etc., then later that year they checked if that person had left or stayed
#Balance may still exist if a person left the bank because it represents the balance they had before leaving
#User is considered an active user if they are still using the bank services in some way. its possible for an active member to leave. its possible for an inactive member to stay.

#is profitable idea: check a 1 for yes and -1 for no, based on iff the user is both an active member AND has a positive balance

In [386]:
import pandas as pd
import numpy as np
from pathlib import Path

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC


In [387]:

starter_df = pd.read_csv(Path("c://users/ajcth/documents/github/bank_churn_project/Resources/BankChurners.csv"))
starter_df.sample(10)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
8849,8850,15721433,Hixson,664,France,Female,38,4,74306.19,2,1,0,154395.56,0
7028,7029,15618410,Murray,718,Germany,Male,26,7,147527.03,1,0,0,51099.56,0
5758,5759,15610972,Crawford,681,Germany,Female,44,4,91115.76,2,0,0,24208.84,1
261,262,15673481,Morton,726,Spain,Female,48,6,99906.19,1,1,0,64323.24,0
7070,7071,15608595,Lo Duca,748,France,Female,39,3,157371.54,1,0,1,97734.3,0
2296,2297,15582714,Napolitani,749,Germany,Male,47,9,110022.74,1,0,1,135655.29,1
7973,7974,15656005,Millar,592,Germany,Male,31,7,124593.23,1,1,0,86079.67,0
1380,1381,15743067,Fuller,625,Germany,Male,26,3,130483.95,1,1,0,122810.53,0
2265,2266,15734628,Lysaght,623,France,Female,35,5,0.0,2,1,0,101192.08,0
4590,4591,15680167,Thomson,635,France,Female,78,6,47536.4,1,1,1,119400.08,0


In [388]:
starter_df.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [389]:
#Convert the values of the Gender column from "M" or "F" to "0" or "1"
starter_df = starter_df.replace({'Gender' : {'Male': 0, 'Female': 1}})
#Drop the useless columns
starter_df = starter_df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
#Drop Nan's
starter_df.dropna(inplace=True)

In [390]:
starter_df.sample(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2347,589,Germany,0,55,7,119961.48,1,1,0,65156.83,1
1041,722,France,0,30,5,0.0,2,1,0,166376.54,0
7756,621,Spain,1,36,7,116338.68,1,1,1,155743.48,0
5373,776,Spain,0,30,6,0.0,2,0,1,63908.86,0
6914,702,France,0,40,7,145536.9,1,0,1,135334.24,0
847,468,France,1,42,5,0.0,2,1,0,125305.34,0
4188,644,Germany,0,47,9,137774.11,2,1,0,151902.78,0
2215,493,France,0,36,9,0.0,2,1,1,65816.53,0
7490,654,France,1,35,2,90865.8,1,1,1,86764.46,0
8770,615,France,0,59,8,0.0,2,1,1,165576.55,0


In [391]:
#use feature creation to make a ratio between balance and salary
starter_df['BalanceSalaryRatio'] = (starter_df['Balance'] / starter_df['EstimatedSalary']).round(2)
starter_df.drop(columns=['Balance', 'EstimatedSalary'], inplace=True)

In [392]:
#use feature creation to make a ratio between age and tenure
starter_df['TenureByAge'] = (starter_df['Tenure'] / starter_df['Age']).round(2)
starter_df.drop(columns=['Tenure', 'Age'], inplace=True)

In [393]:

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Encode categorcal variables using OneHotEncoder
encoded_data = enc.fit_transform(starter_df[['Geography']])

encoded_df = pd.DataFrame(encoded_data, columns = enc.get_feature_names(['Geography']))

starter_df = pd.concat([starter_df, encoded_df], axis=1)
starter_df.drop(columns=['Geography'], inplace=True)




In [394]:
#Remove geography if we are just doing France and not encoding geography in the above cell
#starter_df = starter_df[starter_df.Geography != 'Spain']
#starter_df = starter_df[starter_df.Geography != 'Germany']
#starter_df.drop(columns=['Geography'], inplace=True)

In [395]:
X = starter_df.drop(columns= 'Exited')

In [396]:
y = starter_df['Exited']

In [397]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [398]:
#Scale all numerical/float values that don't represent categories

scaler = StandardScaler()

#Use Column Transformer to scale only the numerical/float values that don't represent categories (male/female, married/single/divorced)
#Going to try and remove all things related to Edu/age/dependents, see what happens
col_tran= ColumnTransformer([
('CreditScoreScaled', scaler, ['CreditScore']),
('NumOfProductsScaled', scaler, ['NumOfProducts']),
('BalanceSalaryRatioScaled', scaler, ['BalanceSalaryRatio']),
('TenureByAgeScaled', scaler, ['TenureByAge'])
])


X_train = col_tran.fit_transform(X_train)
X_test = col_tran.transform(X_test)


In [399]:
#Use SMOTE to add synthetic data and balance our target feature value count
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)

In [400]:
#Choose a model

#GBC STILL THE BEST
#TRY WITH BOTH EDUCATION COLUMN OPTIONS (with and without Graduate data)

clf = GradientBoostingClassifier(
n_estimators=500,    #default = 100    range = 1-inf
random_state= 2,     #default = None   range = 1-inf
#subsample= 1,     #default = 1   range = 0.-1
#min_samples_split = 2,      #default = 2   range = 2-inf
#max_depth=3,  #default = 3    range = 1-inf
#min_impurity_decrease=0,    #default = 0    range = 0 - inf
#min_samples_leaf = 1,            #default = 1   range = 1 - inf
#min_weight_fraction_leaf = 0,     #default = 0   range =0 - 0.5
#max_leaf_nodes = None,     #default = None   range = 2-inf
#learning_rate = 0.1          #default=0.1    range 0.0-inf
)


'''
clf = XGBClassifier(
#n_estimators = 50, 
#max_depth = 4, 
objective='binary:logistic'
)
'''

#clf = AdaBoostClassifier()
#clf = AdaBoostClassifier(n_estimators=200, random_state=2, learning_rate = 0.2)

#svc = SVC()
#clf = AdaBoostClassifier(base_estimator=svc, algorithm='SAMME')

#clf = BalancedRandomForestClassifier()

#clf = SVC()

#clf = RandomForestClassifier()

"\nclf = XGBClassifier(\n#n_estimators = 50, \n#max_depth = 4, \nobjective='binary:logistic'\n)\n"

In [401]:
#Fit the model on the training data
clf.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=500, random_state=2)

In [402]:
#Make predictions on the test data
test_predictions = clf.predict(X_test)

In [403]:
accuracy = accuracy_score(test_predictions, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 74.52%


In [404]:
#Create a dataframe showing predictions vs test
comparison = pd.DataFrame({"Testing Data Predictions": test_predictions,
    "Testing Data Actual Targets": y_test})
comparison = comparison.sort_index(ascending=True)
comparison.tail(5)

Unnamed: 0,Testing Data Predictions,Testing Data Actual Targets
9981,0,1
9982,1,1
9984,0,0
9994,0,0
9998,0,1


In [405]:
#Generate a test matrix
test_matrix = confusion_matrix(y_test, test_predictions)
print(test_matrix)

[[1638  342]
 [ 295  225]]


In [406]:
# Create a testing classifiction report
testing_report = classification_report(y_test, test_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.85      0.83      0.84      1980
           1       0.40      0.43      0.41       520

    accuracy                           0.75      2500
   macro avg       0.62      0.63      0.63      2500
weighted avg       0.75      0.75      0.75      2500

