In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, r2_score, accuracy_score

from sklearn.ensemble import GradientBoostingClassifier

## plot size
plt.rcParams.update({'figure.figsize':(15,6)})
plt.rcParams.update({'font.size':18})

In [3]:
# importing the dataset
df = pd.read_csv("Churn_Modelling.csv")

# trimming the Age varibale
df = df[df["Age"]<=75].reset_index()

# creating dummy variables
dum = pd.get_dummies(df[['Geography','Gender']],drop_first=True)
df['Geography_Germany'] = dum['Geography_Germany']
df['Geography_Spain'] = dum['Geography_Spain']
df['Gender_Male'] = dum['Gender_Male']

# using the KNN Imputer for the Balance variable
df['Balance_nan'] = df['Balance'].replace(0.00, np.nan)
knn_df = df[['CreditScore','Age', 'Tenure', 'Balance_nan','EstimatedSalary','Gender_Male']]
imputer = KNNImputer(n_neighbors=10, weights="distance")
knn_df = imputer.fit_transform(knn_df)
knn_df = pd.DataFrame(knn_df,columns =['CreditScore','Age', 'Tenure', 'Balance_nan','EstimatedSalary','Gender_Male'])

df_keep = df[[ 'Exited','CreditScore','Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male']].reset_index()
df_keep = df_keep.drop("index",axis=1)
df_with_knn = df_keep.copy()
df_with_knn['Balance'] = knn_df['Balance_nan']

In [4]:
df_with_knn

Unnamed: 0,Exited,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
0,1,619,42,2,137971.483371,1,1,1,101348.88,0,0,0
1,0,608,41,1,83807.860000,1,0,1,112542.58,0,1,0
2,1,502,42,8,159660.800000,3,1,0,113931.57,0,0,0
3,0,699,39,1,133585.075460,2,0,0,93826.63,0,0,0
4,0,850,43,2,125510.820000,1,1,1,79084.10,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9950,0,771,39,5,144541.517907,2,1,0,96270.64,0,0,1
9951,0,516,35,10,57369.610000,1,1,1,101699.77,0,0,1
9952,1,709,36,7,132552.368184,1,0,1,42085.58,0,0,0
9953,1,772,42,3,75075.310000,2,1,0,92888.52,1,0,1


In [5]:
# dataframe with using knn on the balance
X_knn = df_with_knn.drop("Exited",axis=1)
y_knn = df_with_knn["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X_knn, y_knn, test_size=0.25, stratify=y_knn)

In [6]:
scale=StandardScaler()
train=scale.fit_transform(X_train)
test=scale.transform(X_test)

### Best Model

In [7]:
# XG-Boost Model
xgb = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, max_features = 'sqrt', n_estimators = 200).fit(train, y_train)
y_pred = xgb.predict(test)
print(f'The classification rate is {round(accuracy_score(y_test, y_pred),4)}')

The classification rate is 0.8642
