### Import Statements
The libraries used are `pandas`, `pathlib`, `imblearn`, and `sklearn`. 

In [1]:
import pandas as pd
from pathlib import Path
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV


### Import the data and create a dataframe
1. Use `pandas` and `pathlib` to read the `BankChurnersPrimary.csv` CSV from the `Resources` folder.
2. Sample the dataset to review the data.

In [2]:
# Load the churn dataset from the project's `Resources` folder.
# The path is relative so the notebook is not tied to one user's machine.
# NOTE(review): assumes the notebook is run from the repository root (the folder
# that contains `Resources`) — confirm, or adjust `data_path` accordingly.
data_path = Path("Resources") / "BankChurnersPrimary.csv"
starter_df = pd.read_csv(data_path)
# Show a handful of random rows as a quick sanity check of the load.
starter_df.sample(5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
4055,718602708,Existing Customer,43,M,3,Graduate,Single,$80K - $120K,Blue,33,...,16351.0,1541,14810.0,0.763,3475,82,0.673,0.094,0.000187,0.99981
4795,711646908,Attrited Customer,39,M,3,Unknown,Single,$60K - $80K,Blue,23,...,7821.0,2517,5304.0,0.408,2010,43,0.344,0.322,0.99544,0.004558
1595,769751958,Existing Customer,55,M,3,Unknown,Married,$120K +,Blue,49,...,9926.0,2018,7908.0,0.618,2235,57,0.357,0.203,0.000372,0.99963
8575,779141658,Existing Customer,48,M,4,Graduate,Married,$80K - $120K,Blue,33,...,4963.0,2115,2848.0,0.767,4989,78,0.696,0.426,0.000189,0.99981
2826,787460883,Attrited Customer,41,M,3,Uneducated,Married,$40K - $60K,Blue,28,...,11091.0,0,11091.0,0.522,1234,34,0.308,0.0,0.99819,0.001812


### Remove any columns that won't be used.
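The Naive Bayes columns can't be used because they are calculated from the features and the target.
`Education_Level`, `Marital_Status`, and `Income_Category` have many NaN/Unknown values; these columns are dropped (rather than the affected rows) to preserve sample size.
`CLIENTNUM` is a client identifier and is irrelevant to the prediction.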

In [3]:
# Columns excluded from modelling: the two Naive Bayes classifier columns,
# the CLIENTNUM identifier, and three demographic columns.
unused_columns = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'CLIENTNUM',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
]
# Rebind the name to the trimmed frame instead of mutating it in place.
starter_df = starter_df.drop(columns=unused_columns)

### Evaluate the Card Type column for use as a feature
The `Card_Category` data is processed with `OneHotEncoder`.

In [4]:
# One-hot encode the card category and append the indicator columns.
#
# The encoder is left at its default (sparse) output and densified with
# `.toarray()`, and the column names are built from `enc.categories_`. This avoids
# the `sparse=` argument and `get_feature_names()`, both of which have been removed
# from recent scikit-learn releases, so the cell runs on old and new versions alike.
categorical_variables = ['Card_Category']
enc = OneHotEncoder()
encoded_data = enc.fit_transform(starter_df[categorical_variables]).toarray()

# Same "<column>_<category>" naming that the encoder itself produces.
encoded_columns = [
    "{}_{}".format(variable, category)
    for variable, categories in zip(categorical_variables, enc.categories_)
    for category in categories
]

# Reuse the source index so the concat below aligns row-for-row.
encoded_dataframe = pd.DataFrame(
    encoded_data,
    columns=encoded_columns,
    index=starter_df.index,
)

# Swap the original text column for its encoded indicator columns.
starter_df = pd.concat(
    [starter_df.drop(columns=categorical_variables), encoded_dataframe],
    axis=1,
)
starter_df.sample(10)



Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
2056,Existing Customer,54,F,0,36,4,3,1,12547.0,1378,11169.0,1.12,3360,56,0.75,0.11,0.0,0.0,0.0,1.0
3548,Existing Customer,36,M,4,36,5,3,3,28200.0,765,27435.0,0.68,2507,64,0.641,0.027,1.0,0.0,0.0,0.0
7964,Existing Customer,45,F,3,34,1,2,1,1811.0,787,1024.0,0.585,4882,80,1.162,0.435,1.0,0.0,0.0,0.0
683,Existing Customer,46,F,2,35,6,4,3,4675.0,1734,2941.0,0.413,1232,34,0.545,0.371,1.0,0.0,0.0,0.0
5499,Existing Customer,43,F,4,36,3,3,2,4902.0,2517,2385.0,0.982,4508,69,0.643,0.513,1.0,0.0,0.0,0.0
7136,Existing Customer,53,F,2,36,3,4,3,2592.0,1364,1228.0,0.679,4208,68,0.7,0.526,1.0,0.0,0.0,0.0
821,Existing Customer,37,F,4,36,5,3,1,3093.0,2517,576.0,0.855,1371,23,0.917,0.814,1.0,0.0,0.0,0.0
9727,Existing Customer,37,M,1,17,1,1,3,34516.0,0,34516.0,0.776,14127,116,0.731,0.0,1.0,0.0,0.0,0.0
5969,Existing Customer,51,F,3,45,5,2,3,9117.0,1259,7858.0,0.88,3837,75,0.705,0.138,1.0,0.0,0.0,0.0
5536,Existing Customer,47,F,3,34,4,1,3,1963.0,1524,439.0,0.879,3492,59,1.034,0.776,1.0,0.0,0.0,0.0


### Evaluate the Gender column for use as a feature
Gender has only two values in this dataset, so we don't need to OneHotEncode.
The values will be converted to numericals and specified as datatype `int`.
1. Male = 0
2. Female = 1 


In [5]:
# Encode Gender as an integer flag: 'M' -> 0, 'F' -> 1.
# Only the Gender column is touched; `assign` returns a new frame with the
# column replaced in its original position.
gender_codes = {'M': 0, 'F': 1}
starter_df = starter_df.assign(
    Gender=starter_df['Gender'].replace(gender_codes).astype(int)
)

### Evaluate the Attrition_Flag column for use as the target
Attrition_Flag has only two values, so we don't need to OneHotEncode.
The values will be converted to numericals and specified as datatype `int`.
1. Attrited Customer = 0
2. Existing Customer = 1 

In [6]:
# Encode the target as an integer flag:
# 'Attrited Customer' -> 0, 'Existing Customer' -> 1.
# Only the Attrition_Flag column is touched; `assign` returns a new frame with
# the column replaced in its original position.
attrition_codes = {'Attrited Customer': 0, 'Existing Customer': 1}
starter_df = starter_df.assign(
    Attrition_Flag=starter_df['Attrition_Flag'].replace(attrition_codes).astype(int)
)

### Define our features(X) and our target(y)

In [7]:
# Feature matrix: every column except the target, in the existing column order.
# Target vector: the encoded Attrition_Flag column.
target_column = 'Attrition_Flag'
X = starter_df.loc[:, starter_df.columns != target_column]
y = starter_df[target_column]

### Split features and target into training and testing sets

In [8]:
# Split into train and test sets using the library's default proportions;
# the fixed seed keeps the row assignment repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

### StandardScaler will be used to scale the data.
StandardScaler will only be fit to X_train.

In [9]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

### Use SMOTE to add synthetic samples and balance the target classes in the training set

In [10]:
# Oversample the training split only; the test split is left untouched so the
# evaluation reflects the real class balance.
# A fixed random_state makes the synthetic samples, and therefore the fitted
# model and its metrics, reproducible under Restart & Run All.
# NOTE: outputs recorded from an unseeded run may differ slightly from a seeded one.
oversample = SMOTE(random_state=1)
X_train, y_train = oversample.fit_resample(X_train, y_train)

### Create an instance of the GradientBoostingClassifier model

In [11]:
# Gradient-boosted tree classifier: 500 estimators, seeded for repeatable fits.
gbc_model = GradientBoostingClassifier(random_state=2, n_estimators=500)

### Fit the model on the training data

In [12]:
# Train on the scaled, resampled training data. `fit` returns the estimator,
# so the cell displays the fitted model.
gbc_model.fit(X=X_train, y=y_train)

GradientBoostingClassifier(n_estimators=500, random_state=2)

### Make predictions on the test data

In [13]:
# Predict class labels for the held-out (scaled, not resampled) test rows.
gbc_test_predictions = gbc_model.predict(X=X_test)

### Evaluate the model's performance

In [14]:
# Fraction of test rows predicted correctly, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=gbc_test_predictions)
print("Accuracy: {:.2f}%".format(100 * accuracy))

Accuracy: 97.20%


### Generate a confusion matrix

In [15]:
# Confusion matrix for the test set: rows are true labels, columns are
# predicted labels (label 0 = attrited, label 1 = existing).
gbc_test_matrix = confusion_matrix(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(gbc_test_matrix)

[[ 371   34]
 [  37 2090]]


### Generate a classification report

In [16]:
# Per-class precision, recall and F1 on the test set.
gbc_testing_report = classification_report(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(gbc_testing_report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       405
           1       0.98      0.98      0.98      2127

    accuracy                           0.97      2532
   macro avg       0.95      0.95      0.95      2532
weighted avg       0.97      0.97      0.97      2532



### Generate an imbalanced classification report

In [17]:
# Imbalance-aware report for the test set (adds specificity, geometric mean
# and index balanced accuracy to the usual precision/recall/F1).
imb_gbc_testing_report = classification_report_imbalanced(
    y_true=y_test,
    y_pred=gbc_test_predictions,
)
print(imb_gbc_testing_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.92      0.98      0.91      0.95      0.89       405
          1       0.98      0.98      0.92      0.98      0.95      0.91      2127

avg / total       0.97      0.97      0.93      0.97      0.95      0.90      2532

