In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint

In [6]:
df = pd.read_csv('data/Churn_Modelling.csv')
df.head(3)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [8]:
cols_to_drop = ['RowNumber', 'Surname']
focus_df = df.drop(cols_to_drop, axis=1)
focus_df.head(3)

Unnamed: 0,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [10]:
df.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [11]:
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [13]:
str_cols = [
    col for col in focus_df.columns if focus_df[col].dtype == 'object'
]

str_cols

['Geography', 'Gender']

In [21]:
focus_df_encoded = pd.get_dummies(
    focus_df, columns=str_cols
).set_index('CustomerId')

focus_df_encoded.head(3)

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
15634602,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
15647311,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
15619304,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0


In [27]:
target = 'Exited'

X = focus_df_encoded.drop(target, axis=1)
y = focus_df[target]

pprint(f"Features: {', '.join(X.columns)}")

('Features: CreditScore, Age, Tenure, Balance, NumOfProducts, HasCrCard, '
 'IsActiveMember, EstimatedSalary, Geography_France, Geography_Germany, '
 'Geography_Spain, Gender_Female, Gender_Male')


In [30]:
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [28]:
from sklearn.model_selection import train_test_split

seed = 8

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=seed
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7500, 13), (2500, 13), (7500,), (2500,))

In [38]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=seed)
X_train_resamp, y_train_resamp = smote.fit_resample(X_train, y_train)

X_train_resamp.shape, X_test.shape, y_train_resamp.shape, y_test.shape

((11920, 13), (2500, 13), (11920,), (2500,))

In [51]:
y_train_resamp.value_counts()

0    5960
1    5960
Name: Exited, dtype: int64

In [63]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [62]:
from sklearn.linear_model import LogisticRegression

scoring = 'f1'

lr = LogisticRegression(random_state=seed)
scaler = StandardScaler()
lr_pipe = make_pipeline(scaler, lr)

lr_cv = np.mean(
    cross_val_score(
        lr_pipe, 
        X_train_resamp, 
        y_train_resamp,
        cv=5
    )
)

lr_cv

0.8308724832214764

In [65]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=seed)
rf_pipe = make_pipeline(scaler, rf)

rf_cv = np.mean(
    cross_val_score(
        rf_pipe,
        X_train_resamp,
        y_train_resamp,
        cv=5,
        scoring='f1'
    )
)

rf_cv

0.8622702244402773

In [66]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb_pipe = make_pipeline(scaler, nb)

nb_cv = np.mean(
    cross_val_score(
        nb_pipe,
        X_train_resamp,
        y_train_resamp,
        cv=5,
        scoring='f1'
    )
)

nb_cv

0.7741194473629169

In [69]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[
        ('lr', lr_pipe),
        ('rf', rf),
        ('nb', nb_pipe)
    ],
    final_estimator=RandomForestClassifier(random_state=seed)
)

stacking_cv_f1 = np.mean(
    cross_val_score(
        stacking_clf,
        X_train_resamp,
        y_train_resamp,
        cv=5,
        scoring='f1'
    )
)

stacking_cv_f1

0.8584245381334721