In [None]:
import pandas as pd
from pathlib import Path
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from skfeature.function.similarity_based import fisher_score


### Import the data and create a dataframe
1. Use `pandas` and `pathlib` to read the `BankChurnersPrimary.csv` CSV from the `Resources` folder.
2. Sample the dataset to review the data.

In [None]:
# Locate the CSV relative to the working directory first so the notebook runs
# from any checkout of the repository. The original absolute path is kept as
# the last candidate so the existing local setup keeps working unchanged.
csv_candidates = [
    Path("Resources") / "BankChurnersPrimary.csv",
    Path("..") / "Resources" / "BankChurnersPrimary.csv",
    Path("c://users/ajcth/documents/github/bank_churn_model/Resources/BankChurnersPrimary.csv"),
]
# When no candidate exists, fall through to the last one so that read_csv
# raises its usual FileNotFoundError naming a concrete path.
csv_path = next((path for path in csv_candidates if path.exists()), csv_candidates[-1])

starter_df = pd.read_csv(csv_path)
# Show a random handful of rows as the cell output.
starter_df.sample(5)

### Remove any columns that won't be used.
The two Naive Bayes columns can't be used as features (they are calculated from the features and the target).
`Education_Level`, `Marital_Status`, and `Income_Category` have many NaN/Unknown values. These columns are dropped, rather than the affected rows, to preserve sample size.
`CLIENTNUM` is irrelevant.

In [None]:
# Columns removed before modelling: the two precomputed Naive Bayes columns,
# CLIENTNUM, and the three categorical columns listed in the notes above.
columns_to_drop = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'CLIENTNUM',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
]

# The drop mutates starter_df in place, so running this cell a second time
# raises KeyError because the columns are already gone.
starter_df.drop(columns=columns_to_drop, inplace=True)


### Evaluate the Card Type (`Card_Category`) column for use as a feature
The `Card_Category` data is one-hot encoded with `OneHotEncoder`.

In [None]:

# One-hot encode Card_Category and swap the original column for the encoded ones.
#
# The encoder is created with its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in newer
# scikit-learn releases, so passing neither keeps this cell working on both.
enc = OneHotEncoder()
categorical_variables = ['Card_Category']
encoded_data = enc.fit_transform(starter_df[categorical_variables]).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; prefer the new
# method and fall back to the old one only when the installed version lacks it.
if hasattr(enc, "get_feature_names_out"):
    encoded_columns = enc.get_feature_names_out(categorical_variables)
else:
    encoded_columns = enc.get_feature_names(categorical_variables)

# Build the encoded frame directly on starter_df's index so the column-wise
# concat below aligns row for row.
encoded_dataframe = pd.DataFrame(
    encoded_data,
    columns=encoded_columns,
    index=starter_df.index,
)

starter_df = pd.concat(
    [starter_df.drop(columns=categorical_variables), encoded_dataframe],
    axis=1,
)
# Show a random sample of the result as the cell output.
starter_df.sample(10)

### Evaluate the Gender column for use as a feature
Gender has only two values, so we don't need to OneHotEncode it.
The values will be converted to numeric codes and the column cast to datatype `int`.
1. Male = 0
2. Female = 1 


In [None]:
# Integer codes for the two gender labels: M -> 0, F -> 1.
gender_codes = {'M': 0, 'F': 1}

# Replace the labels only within the Gender column, then make the dtype explicit.
starter_df = starter_df.replace({'Gender': gender_codes})
starter_df['Gender'] = starter_df['Gender'].astype(int)

### Evaluate the Attrition_Flag column for use as the target
Attrition_Flag has only two values, so we don't need to OneHotEncode it.
The values will be converted to numeric codes and the column cast to datatype `int`.
1. Attrited Customer = 0
2. Existing Customer = 1 

In [None]:
# Integer codes for the target labels: Attrited Customer -> 0, Existing Customer -> 1.
attrition_codes = {'Attrited Customer': 0, 'Existing Customer': 1}

# Replace the labels only within the Attrition_Flag column, then make the dtype explicit.
starter_df = starter_df.replace({'Attrition_Flag': attrition_codes})
starter_df['Attrition_Flag'] = starter_df['Attrition_Flag'].astype(int)

### Create a heatmap to show correlation between features and target

In [None]:
# Pairwise correlation between every column of the prepared frame.
cor = starter_df.corr()

# Create the 10x6 figure and its axes explicitly, then draw the annotated
# correlation matrix onto those axes.
corr_fig, corr_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cor, annot=True, ax=corr_ax)