In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot

import plotly.express as ex
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.offline as pyo

import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score as f1
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
import scikitplot as skplt

from category_encoders import OrdinalEncoder

from imblearn.over_sampling import SMOTE


## Data Exploration 

In [None]:
# Read the customer table and keep every column except the final two.
data = pd.read_csv('bankdata.csv').iloc[:, :-2]

# Preview the first rows of the trimmed table.
data.head()

In [None]:
# Show the dtype pandas assigned to each column of `data`.
data.dtypes

### Boxplots

In [None]:
# (column, trace name) pairs; one horizontal box plot is drawn per pair,
# stacked top to bottom in this order.
boxplot_specs = [
    ('Customer_Age', "Age Boxplot"),
    ('Dependent_count', "Dependents Boxplot"),
    ('Months_on_book', "Months on Book Boxplot"),
    ('Total_Relationship_Count', "Total Products Boxplot"),
    ('Months_Inactive_12_mon', "Number of Months Inactive Boxplot"),
    ('Credit_Limit', "Credit Limit Boxplot"),
]

# One subplot row per spec, so adding a column only means adding a pair above.
fig = make_subplots(rows=len(boxplot_specs), cols=1)
for row_number, (column, trace_name) in enumerate(boxplot_specs, start=1):
    # boxmean=True also marks the mean on each box.
    box_trace = go.Box(x=data[column], name=trace_name, boxmean=True)
    fig.add_trace(box_trace, row=row_number, col=1)

fig.show()

### Histograms

In [None]:
# Columns to draw as a grid of histograms (10 bins each, translucent black).
hist_columns = [
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Credit_Limit',
]
data[hist_columns].hist(color='k', alpha=0.5, bins=10)

### Pie Charts

In [None]:
# Education levels: donut chart of how many rows fall in each category.
ex.pie(data, names='Education_Level', title='Proportion Of Education Levels', hole=0.33)


In [None]:
# Marital status: donut chart of how many rows fall in each category.
ex.pie(data, names='Marital_Status', title='Proportion Of Different Marriage Statuses', hole=0.33)

In [None]:
# Income levels: donut chart of how many rows fall in each category.
ex.pie(data, names='Income_Category', title='Proportion Of Different Income Levels', hole=0.33)

In [None]:
# Card categories: donut chart of how many rows fall in each category.
ex.pie(data, names='Card_Category', title='Proportion Of Different Card Categories', hole=0.33)

In [None]:
# Customer status: donut chart of the Attrition_Flag categories.
ex.pie(
    data,
    names='Attrition_Flag',
    title='Proportion of Churn vs Not-Churn Customers',
    hole=0.33,
)

## Data Preprocessing 

In [None]:
# One-hot encode the multi-level categoricals; drop_first removes one dummy
# per category so the remaining dummies are not perfectly collinear.
data_he = pd.get_dummies(
    data,
    columns=['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category'],
    drop_first=True,
)

# Binary-encode gender and the churn flag. The explicit cast keeps both columns
# integer-typed regardless of how `replace` infers the result dtype, and fails
# loudly if an unexpected label was left unreplaced.
data_he['Gender'] = data_he['Gender'].replace({'F': 1, 'M': 0}).astype(int)
data_he['Attrition_Flag'] = (
    data_he['Attrition_Flag']
    .replace({'Attrited Customer': 1, 'Existing Customer': 0})
    .astype(int)
)

# `drop` returns a new frame; the result must be assigned, otherwise the
# CLIENTNUM identifier column silently stays in the table.
data_he = data_he.drop(columns='CLIENTNUM')
data_he.head(5)

## Data Upsampling using SMOTE

In [None]:
# Seeded so the synthetic minority rows are identical on every run
# (42 matches the seed used by the models and the train/test split).
oversample = SMOTE(random_state=42)

# Pick the target by name rather than by column position, and keep the
# CLIENTNUM identifier (if it is still present) out of the feature matrix.
smote_features = data_he.drop(columns=['Attrition_Flag', 'CLIENTNUM'], errors='ignore')
X, y = oversample.fit_resample(smote_features, data_he['Attrition_Flag'])
data_sampled = X.assign(Churn=y)

# Move the columns from position 15 up to (not including) the trailing Churn
# column into their own frame for PCA, and remove them from data_sampled.
# NOTE(review): assumes the first 15 feature columns are the non-dummy
# attributes and the rest are the one-hot dummies — TODO confirm against the CSV.
data_s = data_sampled[data_sampled.columns[15:-1]].copy()
data_sampled = data_sampled.drop(columns=data_sampled.columns[15:-1])

In [None]:
N_COMPONENTS = 4

# Seeded so the projection is reproducible if a randomized SVD solver is used.
pca_model = PCA(n_components=N_COMPONENTS, random_state=42)

# Project the columns held in data_s onto N_COMPONENTS principal components.
pc_matrix = pca_model.fit_transform(data_s)

evr = pca_model.explained_variance_ratio_
total_var = evr.sum() * 100      # percentage of variance kept, used in a later title
cumsum_evr = np.cumsum(evr)

trace1 = {
    "name": "individual explained variance",
    "type": "bar",
    'y': evr}
trace2 = {
    "name": "cumulative explained variance",
    "type": "scatter",
    'y': cumsum_evr}
# Named `traces`, not `data`: rebinding `data` here would replace the customer
# DataFrame that earlier cells read, breaking any re-run of them.
traces = [trace1, trace2]
layout = {
    "xaxis": {"title": "Principal components"},
    "yaxis": {"title": "Explained variance ratio"},
  }
fig = go.Figure(data=traces, layout=layout)
fig.update_layout(title='Explained Variance Using {} Dimensions'.format(N_COMPONENTS))
fig.show()


In [None]:
# Append the principal-component scores as columns PC-0 .. PC-(N_COMPONENTS-1)
# next to the columns kept in data_sampled.
pc_column_names = [f'PC-{i}' for i in range(N_COMPONENTS)]
pc_frame = pd.DataFrame(pc_matrix, columns=pc_column_names)
unsampled_data = pd.concat([data_sampled, pc_frame], axis=1)

In [None]:
# Pairwise scatter plots of the principal components, coloured by credit limit.
# The PC columns live on `unsampled_data`; the previously referenced
# `data_sampled_2` is never defined and raised a NameError.
pc_columns = ['PC-{}'.format(i) for i in range(0, N_COMPONENTS)]

fig = ex.scatter_matrix(
    # Passed as a bare array, so the dimensions are addressed by position 0..N-1
    # and `labels` maps those positions back to readable names.
    unsampled_data[pc_columns].values,
    color=unsampled_data.Credit_Limit,
    dimensions=range(N_COMPONENTS),
    labels={str(i): 'PC-{}'.format(i) for i in range(0, N_COMPONENTS)},
    title=f'Total Explained Variance: {total_var:.2f}%')

# Hide the diagonal panels (each component plotted against itself).
fig.update_traces(diagonal_visible=False)
fig.update_layout(
    coloraxis_colorbar=dict(
        title="Credit_Limit",
    ),
)
fig.show()

## Model Selection

In [None]:
# Model inputs: three original attributes plus the four principal components.
X_features = [
    'Total_Trans_Ct',
    'PC-3',
    'PC-1',
    'PC-0',
    'PC-2',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Relationship_Count',
]

X = unsampled_data.loc[:, X_features]
y = unsampled_data.loc[:, 'Churn']

# Hold out one third of the rows for testing; seeded for a repeatable split.
# NOTE(review): SMOTE and PCA were fitted on the full table in earlier cells,
# i.e. before this split — consider splitting first to avoid leakage.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)

## Cross Validation

In [None]:
# Three candidate pipelines, each standardising the features before the model.
# The estimator step is named "RF" in all three, including AdaBoost and SVM.
rf_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ("RF", RandomForestClassifier(random_state=42)),
])
ada_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ("RF", AdaBoostClassifier(random_state=42, learning_rate=0.7)),
])
svm_pipe = Pipeline(steps=[
    ('scale', StandardScaler()),
    ("RF", SVC(random_state=42, kernel='rbf')),
])

# 2-fold cross-validated F1 on the training split for each candidate.
f1_cross_val_scores = cross_val_score(
    rf_pipe, train_x, train_y, cv=2, scoring='f1'
)
ada_f1_cross_val_scores = cross_val_score(
    ada_pipe, train_x, train_y, cv=2, scoring='f1'
)
svm_f1_cross_val_scores = cross_val_score(
    svm_pipe, train_x, train_y, cv=2, scoring='f1'
)

## Model Evaluation

In [None]:
# Fit each pipeline on the training split, then predict the held-out split.
# `fit` returns the fitted pipeline, so the two steps are chained per model.
rf_prediction = rf_pipe.fit(train_x, train_y).predict(test_x)
ada_prediction = ada_pipe.fit(train_x, train_y).predict(test_x)
svm_prediction = svm_pipe.fit(train_x, train_y).predict(test_x)

In [None]:
# Test-set F1 per model, rounded to two decimals. f1_score expects
# (y_true, y_pred), so the held-out labels go first.
model_names = ['<b>Random Forest</b>', '<b>AdaBoost</b>', '<b>SVM</b>']
test_f1_scores = [
    np.round(f1(test_y, prediction), 2)
    for prediction in (rf_prediction, ada_prediction, svm_prediction)
]

# Two-column summary table: model name and its F1 score. The original call
# never closed `go.Table(`, the `data=[` list, or `go.Figure(`, which made the
# whole cell a SyntaxError.
fig = go.Figure(data=[go.Table(
    header=dict(
        values=['<b>Model</b>', '<b>F1 Score On Test Data</b>'],
        line_color='darkslategray',
        fill_color='whitesmoke',
        align=['center', 'center'],
        font=dict(color='black', size=18),
        height=40,
    ),
    cells=dict(values=[model_names, test_f1_scores]),
)])

fig.update_layout(title='Model Results On Test Data')
fig.show()

In [None]:
# NOTE(review): `original_df_with_pcs` and `unsampled_data_prediction_RF` were
# referenced here but never defined anywhere in the notebook. They are rebuilt
# below from the already-fitted PCA and Random Forest pipeline, following the
# figure title ("Prediction On Original Data") — confirm this matches the
# originally intended computation.

# Project the pre-SMOTE rows with the same PCA, using the columns it was fitted on.
original_pcs = pd.DataFrame(
    pca_model.transform(data_he[data_s.columns]),
    columns=['PC-{}'.format(i) for i in range(0, N_COMPONENTS)],
    index=data_he.index,  # keep row alignment with data_he for the concat
)
original_df_with_pcs = pd.concat([data_he, original_pcs], axis=1)

# Score the original (non-resampled) customers with the fitted RF pipeline.
unsampled_data_prediction_RF = rf_pipe.predict(original_df_with_pcs[X_features])

# Predictions are passed first, so rows index the predicted class and columns
# the actual class, matching the y / x labels below.
z = confusion_matrix(unsampled_data_prediction_RF, original_df_with_pcs['Attrition_Flag'])
fig = ff.create_annotated_heatmap(
    z,
    x=['Not Churn', 'Churn'],
    y=['Predicted Not Churn', 'Predicted Churn'],
    colorscale='Fall',
    xgap=3,
    ygap=3,
)
fig['data'][0]['showscale'] = True
fig.update_layout(title='Prediction On Original Data With Random Forest Model Confusion Matrix')
fig.show()