### Import Statements
The libraries used are `pandas`, `pathlib`, `imblearn`, and `sklearn`. 

In [22]:
import pandas as pd
from pathlib import Path
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

### Import the data and create a dataframe
1. Use `pandas` and `pathlib` to read the `BankChurnersPrimary.csv` CSV from the `Resources` folder.
2. Sample the dataset to review the data.

In [23]:
# Prefer a repository-relative location so the notebook runs from any checkout
# of the project. If that file is not found relative to the current working
# directory, fall back to the absolute path originally used by this notebook.
relative_csv_path = Path("Resources") / "BankChurnersPrimary.csv"
legacy_csv_path = Path("c://users/ajcth/documents/github/bank_churn_model/Resources/BankChurnersPrimary.csv")
csv_path = relative_csv_path if relative_csv_path.exists() else legacy_csv_path

starter_df = pd.read_csv(csv_path)

# Show a few random rows as a quick sanity check of the loaded data.
starter_df.sample(5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
1858,719916408,Existing Customer,34,M,5,Graduate,Married,$60K - $80K,Blue,27,...,2501.0,1909,592.0,0.956,1850,34,1.125,0.763,0.000118,0.99988
9167,715351983,Existing Customer,32,F,2,Unknown,Married,Less than $40K,Blue,20,...,9761.0,1470,8291.0,0.796,14334,97,0.732,0.151,0.000198,0.9998
5947,802867683,Existing Customer,53,F,2,High School,Single,$40K - $60K,Blue,47,...,1536.0,728,808.0,0.604,4133,78,0.529,0.474,0.000186,0.99981
6214,711064008,Existing Customer,48,F,2,Unknown,Single,Less than $40K,Blue,36,...,9505.0,2390,7115.0,0.727,3616,68,0.619,0.251,0.000198,0.9998
2281,714062958,Existing Customer,36,M,4,Unknown,Single,$40K - $60K,Blue,36,...,9887.0,994,8893.0,0.488,2552,64,0.455,0.101,6.3e-05,0.99994


### Remove any columns that won't be used.
- The Naive Bayes columns can't be used: they are calculated from the features and the target.
- `Education_Level`, `Marital_Status`, and `Income_Category` have many NaN/Unknown values. Drop these columns (rather than the affected rows) to preserve sample size.
- `CLIENTNUM` is irrelevant.

In [24]:
# Columns excluded from modelling: the two pre-computed Naive Bayes outputs,
# the client number, and three categorical demographic columns.
unused_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'CLIENTNUM',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
]

# Rebind the name to the reduced frame instead of mutating it in place.
starter_df = starter_df.drop(columns=unused_columns)

### Encode categorical features
Using OneHotEncoder.

In [25]:
categorical_variables = ['Card_Category', 'Gender', 'Attrition_Flag']

# Leave the encoder's sparse setting at its default and densify the result
# explicitly. The `sparse` keyword was renamed to `sparse_output` in newer
# scikit-learn releases, so passing either name fails on some versions.
# `drop='if_binary'` keeps a single indicator column for two-valued features.
enc = OneHotEncoder(drop='if_binary')
encoded_data = enc.fit_transform(starter_df[categorical_variables]).toarray()

# `get_feature_names` was superseded by `get_feature_names_out`; use whichever
# the installed scikit-learn provides. Both yield names like `Gender_M`.
if hasattr(enc, 'get_feature_names_out'):
    encoded_columns = enc.get_feature_names_out(categorical_variables)
else:
    encoded_columns = enc.get_feature_names(categorical_variables)

# Reuse the source frame's index so the concat below aligns row-for-row.
encoded_dataframe = pd.DataFrame(
    encoded_data,
    columns=encoded_columns,
    index=starter_df.index,
)

# Swap the raw categorical columns for their encoded counterparts.
starter_df = pd.concat(
    [starter_df.drop(columns=categorical_variables), encoded_dataframe],
    axis=1,
)
starter_df.sample(10)



Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver,Gender_M,Attrition_Flag_Existing Customer
1834,54,3,38,4,2,4,5122.0,1642,3480.0,0.935,1908,39,0.696,0.321,1.0,0.0,0.0,0.0,0.0,1.0
8923,47,2,35,1,6,3,5791.0,1957,3834.0,0.654,7823,93,0.722,0.338,1.0,0.0,0.0,0.0,1.0,1.0
2546,36,2,30,6,3,3,16050.0,1480,14570.0,1.326,3215,56,0.806,0.092,1.0,0.0,0.0,0.0,1.0,1.0
7284,50,1,39,6,3,3,9248.0,0,9248.0,0.613,2325,45,0.552,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9832,43,4,30,1,1,2,7386.0,656,6730.0,0.885,15691,126,0.68,0.089,1.0,0.0,0.0,0.0,1.0,1.0
3860,42,0,35,5,2,3,34516.0,1934,32582.0,0.626,4613,73,0.553,0.056,0.0,0.0,0.0,1.0,1.0,1.0
5551,65,0,55,5,2,1,6352.0,1386,4966.0,0.5,3709,77,0.604,0.218,1.0,0.0,0.0,0.0,0.0,1.0
5899,35,2,23,6,3,3,2317.0,1301,1016.0,0.925,4161,71,1.219,0.562,1.0,0.0,0.0,0.0,0.0,1.0
310,63,2,36,2,2,2,5007.0,930,4077.0,0.885,1050,28,1.154,0.186,1.0,0.0,0.0,0.0,1.0,1.0
8084,56,3,39,1,3,4,1632.0,0,1632.0,0.621,2509,47,0.424,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Define our features (X) and our target (y)

In [27]:
# The one-hot indicator for `Existing Customer` is the prediction target;
# every remaining column is used as a feature.
target_column = 'Attrition_Flag_Existing Customer'

X = starter_df.drop(columns=[target_column])
y = starter_df[target_column]

### Split features and target into training and testing sets

In [28]:
# Hold out 25% of the rows for testing (the library default, spelled out here);
# the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### StandardScaler will be used to scale the data.
StandardScaler will only be fit to X_train.

In [29]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both splits so no test information is used.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Use SMOTE to add synthetic samples and balance the target classes in the training data

In [30]:
# Seed the sampler so the synthetic rows (and therefore the fitted model and
# its metrics) are the same on every run, matching the seeded split and model.
oversample = SMOTE(random_state=1)

# Only the training split is resampled; the test split stays untouched.
X_train, y_train = oversample.fit_resample(X_train, y_train)

### Create an instance of the GradientBoostingClassifier model

In [31]:
# Hyperparameters collected in one place so they are easy to spot and tune.
gbc_params = {
    "n_estimators": 500,
    "random_state": 2,
}
gbc_model = GradientBoostingClassifier(**gbc_params)

### Fit the model on the training data

In [32]:
# Train on the scaled, oversampled training split. The fitted estimator is the
# cell's last expression, so its repr is displayed.
gbc_model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(n_estimators=500, random_state=2)

### Make predictions on the test data

In [33]:
# Predict labels for the scaled test features (the test split was not resampled).
gbc_test_predictions = gbc_model.predict(X=X_test)

### Evaluate the model's performance

In [34]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=gbc_test_predictions)

# Report as a percentage with two decimals.
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

Accuracy: 97.39%


### Generate a confusion matrix

In [35]:
# Rows are true labels and columns are predicted labels, in sorted label order.
gbc_test_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(gbc_test_matrix)

[[ 374   31]
 [  35 2092]]


### Generate a classification report

In [36]:
# Per-class precision, recall, F1 and support on the test split.
gbc_testing_report = classification_report(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(gbc_testing_report)

              precision    recall  f1-score   support

         0.0       0.91      0.92      0.92       405
         1.0       0.99      0.98      0.98      2127

    accuracy                           0.97      2532
   macro avg       0.95      0.95      0.95      2532
weighted avg       0.97      0.97      0.97      2532



### Generate an imbalanced classification report

In [37]:
# imbalanced-learn's report for the same test predictions, printed as text.
imb_gbc_testing_report = classification_report_imbalanced(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(imb_gbc_testing_report)

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.91      0.92      0.98      0.92      0.95      0.90       405
        1.0       0.99      0.98      0.92      0.98      0.95      0.91      2127

avg / total       0.97      0.97      0.93      0.97      0.95      0.91      2532

