### Import Statements
The libraries used are `pandas`, `pathlib`, `imblearn`, and `sklearn`. 

In [73]:
import pandas as pd
from pathlib import Path
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

### Import the data and create a dataframe
1. Use `pandas` and `pathlib` to read the `BankChurnersPrimary.csv` CSV from the `Resources` folder.
2. Sample the dataset to review the data.

In [74]:
# Locate the churn dataset. Prefer the project-relative `Resources` folder so the
# notebook runs on any checkout; fall back to the original author's absolute path
# when the notebook is launched from a different working directory.
data_file = Path("Resources") / "BankChurnersPrimary.csv"
if not data_file.exists():
    data_file = Path("c://users/ajcth/documents/github/bank_churn_project/Resources/BankChurnersPrimary.csv")

starter_df = pd.read_csv(data_file)

# Show five random rows as a quick sanity check of the loaded columns.
starter_df.sample(5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
2157,709710783,Existing Customer,35,F,0,High School,Single,Less than $40K,Blue,36,...,5931.0,0,5931.0,0.668,2730,71,0.614,0.0,0.00043,0.99957
179,710829108,Existing Customer,59,F,1,Uneducated,Married,$40K - $60K,Blue,36,...,3356.0,985,2371.0,0.35,999,28,0.556,0.294,0.000185,0.99982
1595,769751958,Existing Customer,55,M,3,Unknown,Married,$120K +,Blue,49,...,9926.0,2018,7908.0,0.618,2235,57,0.357,0.203,0.000372,0.99963
448,710038008,Existing Customer,56,F,1,Graduate,Single,$40K - $60K,Blue,47,...,6038.0,0,6038.0,0.741,1447,30,0.875,0.0,6.4e-05,0.99994
2375,709212183,Existing Customer,53,F,2,Graduate,Married,Less than $40K,Blue,36,...,3176.0,1470,1706.0,0.388,1634,53,0.472,0.463,9.3e-05,0.99991


### Remove any columns that won't be used.
The Naive Bayes columns can't be used (they are calculated from the features and the target).
Education_Level, Marital_Status, and Income_Category have many NaN/Unknown values. Drop these columns, rather than the affected rows, to preserve sample size.
CLIENTNUM is irrelevant.

In [75]:
# Columns removed before modelling: the client identifier, the three
# demographic columns listed in the notes above, and the two pre-computed
# Naive Bayes classifier columns.
unused_columns = [
    'CLIENTNUM',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]

# Rebind the name to the trimmed frame instead of mutating in place.
starter_df = starter_df.drop(columns=unused_columns)

### Evaluate the Card Type column for use as a feature
The Card Type data is processed with OneHotEncoder.

In [76]:
# One-hot encode the card category column.
# NOTE(review): `encoded_data` is not referenced by any later cell visible in
# this notebook - confirm whether it was meant to be joined into the features.
categorical_variables = ['Card_Category']
card_category_frame = starter_df[categorical_variables]

enc = OneHotEncoder()
encoded_data = enc.fit_transform(card_category_frame)

### Evaluate the Gender column for use as a feature
Gender is a binary category, so we don't need to OneHotEncode it.
The values will be converted to numeric codes and stored as datatype `int`.
1. Male = 0
2. Female = 1 


In [77]:
# Numeric codes for the two gender labels: male -> 0, female -> 1.
gender_codes = {'M': 0, 'F': 1}

# Swap the labels for their codes and store the column as integers.
starter_df = starter_df.assign(
    Gender=starter_df['Gender'].replace(gender_codes).astype(int)
)

### Evaluate the Attrition_Flag column for use as the target
Attrition_Flag is a binary category, so we don't need to OneHotEncode it.
The values will be converted to numeric codes and stored as datatype `int`.
1. Attrited Customer = 0
2. Existing Customer = 1 

In [78]:
# Numeric codes for the target labels: attrited -> 0, existing -> 1.
attrition_codes = {'Attrited Customer': 0, 'Existing Customer': 1}

# Swap the labels for their codes and store the column as integers.
starter_df = starter_df.assign(
    Attrition_Flag=starter_df['Attrition_Flag'].replace(attrition_codes).astype(int)
)

### Define our features(X) and our target(y)

In [79]:
# The encoded attrition flag is the target; every other remaining column is a
# candidate feature.
target_column = 'Attrition_Flag'

y = starter_df.loc[:, target_column]
X = starter_df.drop(target_column, axis='columns')

### Split features and target into training and testing sets

In [80]:
# Hold out a quarter of the rows for testing (scikit-learn's default test
# fraction, spelled out here); the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1)
X_train, X_test, y_train, y_test = split_parts

### StandardScaler will be used within ColumnTransformer. This allows for the scaling of specified columns within our X_train and X_test set.
StandardScaler will only be fit to X_train.

In [81]:
scaler = StandardScaler()

# Numeric columns to standardise, in the order they appear in the output.
numeric_columns = [
    'Total_Revolving_Bal',
    'Months_Inactive_12_mon',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    'Months_on_book',
    'Customer_Age',
    'Avg_Utilization_Ratio',
    'Total_Relationship_Count',
    'Contacts_Count_12_mon',
    'Total_Amt_Chng_Q4_Q1',
    'Total_Ct_Chng_Q4_Q1',
    'Dependent_count',
    'Credit_Limit',
    'Avg_Open_To_Buy',
]

# Each transformer is named after its column, except this one abbreviated name.
transformer_names = {'Avg_Utilization_Ratio': 'Avg_Util_Ratio'}

# One scaling step per column.
# NOTE(review): ColumnTransformer's default `remainder` drops every column not
# listed here, so the encoded Gender column and Card_Category do not reach the
# model - confirm this is intended.
col_tran = ColumnTransformer([
    (transformer_names.get(column, column), scaler, [column])
    for column in numeric_columns
])

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to the test split.
X_train = col_tran.fit_transform(X_train)
X_test = col_tran.transform(X_test)

### Use SMOTE to add synthetic data and balance our target/feature value count

In [82]:
# Oversample the training split with SMOTE so both target classes have the
# same number of rows; the test split is left untouched.
# SMOTE generates its synthetic rows randomly, so seed it: without a fixed
# random_state the resampled training set - and every metric reported below -
# changes from one run of the notebook to the next.
oversample = SMOTE(random_state=1)
X_train, y_train = oversample.fit_resample(X_train, y_train)

### Use GridSearch to determine the best hyperparameter values

In [83]:
# Disabled grid-search scaffold. The whole search is wrapped in a string
# literal, so none of it is executed: the cell simply evaluates to (and
# displays) this string. Inside the string every candidate entry of
# `parameters` is itself commented out, with its default and valid range
# noted alongside.
'''
parameters = {
#'n_estimators' : [],               #default = 100    range = 1-inf
#'random_state' : [],               #default = None   range = 1-inf
#'subsample' : [],                  #default = 1      range = 0.-1
#'min_samples_split' : [],          #default = 2      range = 2-inf
#'max_depth' : [],                  #default = 3      range = 1-inf
#'min_impurity_decrease' : [],      #default = 0      range = 0 - inf
#'min_samples_leaf' : [],           #default = 1      range = 1 - inf
#'min_weight_fraction_leaf' : [],   #default = 0      range =0 - 0.5
#'max_leaf_nodes' : [],             #default = None   range = 2-inf
#'learning_rate' : [],              #default=0.1      range 0.0-inf
#'init' : ('zero', None),
#'loss' : ('deviance', 'exponential')
}



gbc = GradientBoostingClassifier()
clf = GridSearchCV(gbc, parameters)
clf.fit(X_train, y_train)
clf.best_estimator_
'''


"\nparameters = {\n#'n_estimators' : [],               #default = 100    range = 1-inf\n#'random_state' : [],               #default = None   range = 1-inf\n#'subsample' : [],                  #default = 1      range = 0.-1\n#'min_samples_split' : [],          #default = 2      range = 2-inf\n#'max_depth' : [],                  #default = 3      range = 1-inf\n#'min_impurity_decrease' : [],      #default = 0      range = 0 - inf\n#'min_samples_leaf' : [],           #default = 1      range = 1 - inf\n#'min_weight_fraction_leaf' : [],   #default = 0      range =0 - 0.5\n#'max_leaf_nodes' : [],             #default = None   range = 2-inf\n#'learning_rate' : [],              #default=0.1      range 0.0-inf\n#'init' : ('zero', None),\n#'loss' : ('deviance', 'exponential')\n}\n\n\n\ngbc = GradientBoostingClassifier()\nclf = GridSearchCV(gbc, parameters)\nclf.fit(X_train, y_train)\nclf.best_estimator_\n"

### Create an instance of the GradientBoostingClassifier model

In [84]:
# Gradient boosting with 500 estimators; the fixed seed makes the fitted model
# repeatable for a given training set. All other hyperparameters keep their
# library defaults.
gbc_settings = {'n_estimators': 500, 'random_state': 2}
gbc_model = GradientBoostingClassifier(**gbc_settings)

### Fit the model on the training data

In [85]:
# Train on the scaled, SMOTE-balanced training split.
gbc_model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(n_estimators=500, random_state=2)

### Make predictions on the test data

In [86]:
# Predict the attrition flag for the held-out, scaled test rows.
gbc_test_predictions = gbc_model.predict(X=X_test)

### Evaluate the model's performance

In [87]:
# Share of test rows whose predicted flag matches the true flag.
accuracy = accuracy_score(y_true=y_test, y_pred=gbc_test_predictions)

accuracy_percent = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_percent))

Accuracy: 97.12%


### Generate a confusion matrix

In [88]:
# Rows are the true classes and columns the predicted classes, ordered
# 0 (attrited) then 1 (existing).
gbc_test_matrix = confusion_matrix(y_true=y_test, y_pred=gbc_test_predictions)
print(gbc_test_matrix)

[[ 374   31]
 [  42 2085]]


### Generate a classification report

In [89]:
# Per-class precision, recall and F1 on the test split.
gbc_testing_report = classification_report(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(gbc_testing_report)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       405
           1       0.99      0.98      0.98      2127

    accuracy                           0.97      2532
   macro avg       0.94      0.95      0.95      2532
weighted avg       0.97      0.97      0.97      2532



### Generate an imbalanced classification report

In [90]:
# imbalanced-learn's report for the same test predictions.
imb__gbc_testing_report = classification_report_imbalanced(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(imb__gbc_testing_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.92      0.98      0.91      0.95      0.90       405
          1       0.99      0.98      0.92      0.98      0.95      0.91      2127

avg / total       0.97      0.97      0.93      0.97      0.95      0.91      2532

