In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import CatBoostEncoder
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
)
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.creation import CombineWithReferenceFeature
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    plot_roc_curve,
)
import optuna
import warnings

# Config
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Load both code-formatter extensions.
# NOTE(review): nb_black and lab_black look like the classic-Notebook and
# JupyterLab variants of the same Black auto-formatter -- presumably only the
# one matching the front end in use is needed; confirm before removing either.
%load_ext nb_black
%load_ext lab_black
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)
# Silence all Python warnings for the rest of the session. This also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


<IPython.core.display.Javascript object>

In [2]:
# Read the raw train and test CSV files into one DataFrame each.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

<IPython.core.display.Javascript object>

In [3]:
# Encode the "Churn" target: "Yes" -> 1, "No" -> 0.
#
# The mapping is guarded so that re-running this cell is a no-op. Calling
# .map() a second time on the already encoded 1/0 column would turn every
# label into NaN (1 and 0 are not keys of the mapping), and the dropna() cell
# further down would then silently discard every row.
CHURN_CODES = {"Yes": 1, "No": 0}
for split in (X_train, X_test):
    # Only encode while the column still holds at least one raw label.
    if split["Churn"].isin(list(CHURN_CODES)).any():
        split["Churn"] = split["Churn"].map(CHURN_CODES)

<IPython.core.display.Javascript object>

In [4]:
def casting_numerical(dataframe, numerical_feature):
    """Parse a text column that may use a decimal comma into ``float64``.

    Every value is converted to its string form, ``","`` is replaced by
    ``"."`` and the result is parsed with ``pd.to_numeric(errors="coerce")``,
    so anything that is not a valid number (blank strings included) becomes
    ``NaN``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column. The column is overwritten in place.
    numerical_feature : str
        Name of the single column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    # Vectorised string handling instead of a per-element Python lambda.
    as_text = (
        dataframe[numerical_feature].astype(str).str.replace(",", ".", regex=False)
    )
    # errors="coerce" already maps unparsable text to NaN, so no further
    # clean-up of empty strings is needed once the column is float64.
    dataframe[numerical_feature] = pd.to_numeric(as_text, errors="coerce").astype(
        "float64"
    )
    return dataframe


def casting_categorical(dataframe, categorical_feature):
    """Re-type one column as pandas ``object`` dtype.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the column. The column is overwritten in place.
    categorical_feature : str
        Name of the column to re-type.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    recast = dataframe[categorical_feature].astype("object")
    dataframe[categorical_feature] = recast
    return dataframe

<IPython.core.display.Javascript object>

In [5]:
# Fix column dtypes on both splits: parse "TotalCharges" as a float and
# re-type "SeniorCitizen" as an object column.
X_train, X_test = [
    casting_categorical(casting_numerical(split, "TotalCharges"), "SeniorCitizen")
    for split in (X_train, X_test)
]

<IPython.core.display.Javascript object>

In [6]:
# Drop every row that contains at least one missing value, in each split.
X_train = X_train.dropna()
X_test = X_test.dropna()

<IPython.core.display.Javascript object>

In [7]:
# Separate the "Churn" target from the predictor columns in both splits.
y_train, y_test = X_train["Churn"], X_test["Churn"]
X_train, X_test = X_train.drop(columns="Churn"), X_test.drop(columns="Churn")

<IPython.core.display.Javascript object>

In [8]:
def objective(trial):
    """Optuna objective: mean cross-validated F1 of the XGBoost pipeline.

    Samples one set of XGBoost hyper-parameters from ``trial``, builds the
    feature-engineering + encoding + classifier pipeline, and scores it on the
    module-level ``X_train`` / ``y_train`` with 10-fold stratified CV repeated
    3 times (30 fits per trial).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters. Parameter names keep the
        ``classifier__`` prefix, so ``study.best_params`` holds Pipeline
        parameter paths.

    Returns
    -------
    float
        Mean of the per-fold F1 scores.
    """
    # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
    # suggest_loguniform / suggest_uniform; the sampled ranges are unchanged.
    params = {
        "classifier__n_estimators": trial.suggest_int(
            "classifier__n_estimators", 100, 1000
        ),
        "classifier__max_depth": trial.suggest_int("classifier__max_depth", 3, 10),
        "classifier__learning_rate": trial.suggest_float(
            "classifier__learning_rate", 1e-3, 1.0, log=True
        ),
        "classifier__subsample": trial.suggest_float(
            "classifier__subsample", 0.2, 1.0
        ),
        "classifier__colsample_bytree": trial.suggest_float(
            "classifier__colsample_bytree", 0.2, 1.0
        ),
        "classifier__reg_alpha": trial.suggest_float(
            "classifier__reg_alpha", 1e-3, 10.0, log=True
        ),
        "classifier__reg_lambda": trial.suggest_float(
            "classifier__reg_lambda", 1e-3, 10.0, log=True
        ),
    }

    pipeline = Pipeline(
        [
            ("drop_vars", DropFeatures(["customerID"])),
            (
                "tenure_combine",
                CombineWithReferenceFeature(
                    variables_to_combine=["MonthlyCharges", "TotalCharges"],
                    reference_variables=["tenure"],
                    operations=["div"],
                    new_variables_names=["tenureMonthlyRate", "tenureTotalRate"],
                ),
            ),
            (
                "totalcharges_combine",
                CombineWithReferenceFeature(
                    variables_to_combine=["TotalCharges"],
                    reference_variables=["MonthlyCharges"],
                    operations=["div"],
                    new_variables_names=["RateCharge"],
                ),
            ),
            (
                "preprocessor",
                ColumnTransformer(
                    [
                        (
                            "num",
                            make_pipeline(
                                SimpleImputer(strategy="median"),
                            ),
                            make_column_selector(dtype_include=np.number),
                        ),
                        (
                            "cat",
                            make_pipeline(
                                SimpleImputer(strategy="most_frequent"),
                                TargetEncoder(),
                            ),
                            make_column_selector(dtype_include=["object"]),
                        ),
                    ]
                ),
            ),
            ("classifier", XGBClassifier(random_state=42)),
        ]
    )

    # The sampled keys are Pipeline parameter paths ("classifier__<name>"), so
    # they must go through set_params to reach the XGBClassifier step.
    # Unpacking them into XGBClassifier(...) would hand XGBoost keyword
    # arguments it does not recognise, and every trial would train the same
    # default model.
    pipeline.set_params(**params)

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="f1")

    return scores.mean()

<IPython.core.display.Javascript object>

# Search the XGBoost hyper-parameters, maximising the value returned by
# objective(). The sampler is seeded so the study proposes the same trials on
# every fresh run, in line with the random_state=42 used elsewhere.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)

# Print the optimisation results
print("The Bests Parameters are: ", study.best_params)
print("The Best Score Value is: ", study.best_value)
print("The Best Trial is: ", study.best_trial)

optuna.visualization.plot_optimization_history(study)

# Rebuild the pipeline, apply the best hyper-parameters found by the study,
# fit it on the training split and report metrics on the test split.
pipeline = Pipeline(
    [
        ("drop_vars", DropFeatures(["customerID"])),
        (
            "tenure_combine",
            CombineWithReferenceFeature(
                variables_to_combine=["MonthlyCharges", "TotalCharges"],
                reference_variables=["tenure"],
                operations=["div"],
                new_variables_names=["tenureMonthlyRate", "tenureTotalRate"],
            ),
        ),
        (
            "totalcharges_combine",
            CombineWithReferenceFeature(
                variables_to_combine=["TotalCharges"],
                reference_variables=["MonthlyCharges"],
                operations=["div"],
                new_variables_names=["RateCharge"],
            ),
        ),
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "num",
                        make_pipeline(
                            SimpleImputer(strategy="median"),
                        ),
                        make_column_selector(dtype_include=np.number),
                    ),
                    (
                        "cat",
                        make_pipeline(
                            SimpleImputer(strategy="most_frequent"),
                            TargetEncoder(),
                        ),
                        make_column_selector(dtype_include=["object"]),
                    ),
                ]
            ),
        ),
        ("classifier", XGBClassifier(random_state=42)),
    ]
)

# study.best_params is keyed by Pipeline parameter paths
# ("classifier__<name>"). Route them through set_params so they land on the
# XGBClassifier step; unpacking them into XGBClassifier(...) would pass keyword
# arguments XGBoost does not recognise and leave the model at its defaults.
pipeline.set_params(**study.best_params)

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))