In [1]:
import os
import hydra
from hydra.utils import instantiate
from sklearn.model_selection import train_test_split
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn import datasets
import pandas as pd


import optuna
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
from sklearn.metrics import accuracy_score
from pipeline import create_pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the Titanic CSV that every trial trains on.
TITANIC_CSV_URL = r"https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Kernel-wide cache so the CSV is fetched once instead of once per trial.
_TITANIC_CACHE = {}


def _load_titanic():
    """Return the Titanic DataFrame, downloading it only on first use."""
    if "df" not in _TITANIC_CACHE:
        _TITANIC_CACHE["df"] = pd.read_csv(TITANIC_CSV_URL)
    return _TITANIC_CACHE["df"]


def objective(trial):
    """Optuna objective: hold-out accuracy of a gradient-boosting pipeline.

    Samples ``n_estimators`` (int in [10, 1000]) and ``learning_rate``
    (float in [0.01, 1]) from ``trial``, fits the pipeline on a fixed
    67/33 train/test split of the Titanic data, and scores it on the
    held-out part.

    Args:
        trial: The ``optuna`` trial used to suggest hyperparameters.

    Returns:
        Accuracy of the fitted pipeline on the test split.
    """
    # The cached frame is never modified: drop() returns a new frame and
    # train_test_split returns new train/test objects.
    df = _load_titanic()
    X = df.drop("Survived", axis=1)
    y = df["Survived"]

    param = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
    }

    # create_pipeline comes from the local `pipeline` module; presumably it
    # wires the imputers/scaler/encoder to the listed columns ahead of the
    # estimator -- confirm against that module.
    model = create_pipeline(
        # Seed the estimator so that a given parameter set always yields the
        # same score; the split and the sampler are already seeded.
        estimator=GradientBoostingClassifier(random_state=42, **param),
        numerical_imputer=SimpleImputer(strategy='median'),
        numerical_scaler=StandardScaler(),
        categorical_imputer=SimpleImputer(strategy='most_frequent'),
        categorical_encoder=OneHotEncoder(handle_unknown='ignore'),
        numerical_features=["Age", 'Fare'],
        categorical_features=["Pclass", 'Sex', 'SibSp', 'Parch', 'Embarked'],
    )

    # Fixed seed: every trial is evaluated on the identical split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42,
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [3]:
# Build the study that maximizes the objective. The TPE sampler is seeded so
# the sequence of suggested parameters is repeatable; the median pruner is
# configured with 10 warm-up steps.
tpe_sampler = optuna.samplers.TPESampler(seed=12345)
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(
    sampler=tpe_sampler,
    pruner=median_pruner,
    direction="maximize",
)

[I 2022-05-12 10:29:05,971] A new study created in memory with name: no-name-d022e6c6-6b32-4042-988a-56e010bfce91


In [4]:
# Only show warnings and above, so the per-trial log lines do not flood the output.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Run the search: stop after 100 trials or 600 seconds, whichever comes first.
study.optimize(
    func=objective,
    n_trials=100,
    timeout=600,
)

In [5]:
# Bare last expression: displays the Study object's repr as the cell output.
study

<optuna.study.study.Study at 0x17feb4fa0>

In [6]:
# Report the best hyperparameters found, then render the diagnostic figures.
print(study.best_params)
plot_optimization_history(study).show()
plot_parallel_coordinate(study).show()
# This call previously lacked .show(); because it is not the cell's last
# expression, the figure was built but never displayed.
plot_param_importances(study).show()
plot_contour(study).show()
# plot_intermediate_values(study) is only useful once the objective reports
# intermediate values (i.e. when pruning is actually exercised).

{'n_estimators': 34, 'learning_rate': 0.3068461847302287}
