# Imports

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,\
                                    cross_validate,\
                                    GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import LabelEncoder,\
                                  StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Load data

In [25]:
dataset = pd.read_csv("Churn Modeling.csv")
dataset.drop(["RowNumber", "CustomerId", "Surname"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [26]:
y = dataset["Exited"]
X = dataset.loc[:, :"EstimatedSalary"]

In [27]:
col_to_encode = ["Geography", "Gender"]
encoder = LabelEncoder()
for col in col_to_encode:
    X[col] = encoder.fit_transform(X[col])

In [28]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.8,3,1,0,113931.57
3,699,0,0,39,1,0.0,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.1


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Build pipeline

In [30]:
pipe = Pipeline([('scaler', StandardScaler()),
                 ("ada_clf", AdaBoostClassifier())])

pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.86

In [31]:
prediction = pipe.predict(X_test)
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      2379
           1       0.73      0.52      0.60       621

    accuracy                           0.86      3000
   macro avg       0.81      0.73      0.76      3000
weighted avg       0.85      0.86      0.85      3000



# Hyperparameter tuning

In [43]:
parameters = {
#     "base_estimator": [LogisticRegression(), DecisionTreeClassifier(), SVC()],
    "n_estimators": np.logspace(0, 2, 100).astype(int),
    "learning_rate": np.logspace(0, -1, 10),
    "algorithm": ["SAMME", "SAMME.R"]
}

clf_optim = GridSearchCV(estimator=AdaBoostClassifier(random_state=0),
            param_grid=parameters)

In [44]:
clf_optim.fit(X_train, y_train)

GridSearchCV(estimator=AdaBoostClassifier(random_state=0),
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': array([1.        , 0.77426368, 0.59948425, 0.46415888, 0.35938137,
       0.27825594, 0.21544347, 0.16681005, 0.12915497, 0.1       ]),
                         'n_estimators': array([  1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   2,   2,   2,   2,   2,   2,   2,   2,   2,   3,   3,
         3,   3,   3,   3,   4,   4,   4,   4,   4,   5,   5,   5,   5,
         6,   6,   6,   7,   7,   7,   8,   8,   8,   9,   9,  10,  10,
        11,  11,  12,  12,  13,  14,  14,  15,  16,  17,  17,  18,  19,
        20,  21,  22,  23,  24,  25,  27,  28,  29,  31,  32,  34,  35,
        37,  39,  41,  43,  45,  47,  49,  52,  54,  57,  59,  62,  65,
        68,  72,  75,  79,  83,  86,  91,  95, 100])})

In [45]:
clf_optim.best_params_

{'algorithm': 'SAMME.R',
 'learning_rate': 0.2782559402207124,
 'n_estimators': 86}

In [46]:
pipe2 = Pipeline([('scaler', StandardScaler()),
                 ("ada_clf", AdaBoostClassifier(algorithm="SAMME.R", learning_rate=0.2783, n_estimators=86))])

In [47]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('ada_clf',
                 AdaBoostClassifier(learning_rate=0.2783, n_estimators=86))])

In [48]:
y_pred = pipe2.predict(X_test)
pipe2.score(X_test, y_test)

0.863

In [None]:
pipe2.