In [1]:
# LIGHTGBM WITH OPTUNA TUTORIAL
# Medium: https://towardsdatascience.com/how-to-make-your-model-awesome-with-optuna-b56d490368af
# Github: https://github.com/PiotrekGa/optuna_article/blob/master/Example.ipynb

In [2]:
import os
import warnings

import joblib
import numpy as np
import optuna
from lightgbm import LGBMClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Silence all warnings: the environment variable is set for the process
# environment (presumably so that worker processes spawned later inherit the
# setting -- TODO confirm), and the filter covers the current interpreter.
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings(action="ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the 20-newsgroups corpus with the loader's default arguments and keep
# only the first 5000 documents and their labels.
data = fetch_20newsgroups()

X, y = data["data"][:5000], data["target"][:5000]

In [4]:
# Text-classification pipeline tuned by Optuna: TF-IDF features feeding a
# LightGBM classifier. The step names ("tfidf", "lgbc") are the prefixes of the
# `<step>__<param>` keys that are later passed to `model.set_params`.
model = Pipeline(
    [
        ("tfidf", TfidfVectorizer(stop_words="english")),
        (
            "lgbc",
            # The objective was previously "f1_score", which is not a LightGBM
            # objective (F1 is an evaluation metric, not a training loss).
            # NOTE(review): the scikit-learn wrapper appears to substitute a
            # multiclass objective for multi-class targets, which would explain
            # why the old value did not fail; spelling it out removes the
            # reliance on that fallback -- confirm against the installed version.
            # n_jobs=1: cross-validation in `objective` is already run with
            # n_jobs=-1, so the booster itself is kept single-threaded.
            LGBMClassifier(
                objective="multiclass", class_weight="balanced", n_jobs=1
            ),
        ),
    ]
)

In [5]:
def objective(trial):
    """Optuna objective: score one sampled pipeline configuration.

    Samples TF-IDF and LightGBM hyperparameters from ``trial``, applies them
    in place to the module-level ``model`` pipeline, and evaluates it with
    8-fold cross-validation on the module-level ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample hyperparameter values.

    Returns
    -------
    float
        The negated mean cross-validation score. No ``scoring`` argument is
        passed to ``cross_val_score``, so the estimator's default scorer is
        used. The value is negated so that minimising it maximises the score.
    """
    # Keys follow the Pipeline `<step>__<param>` convention so the dict can be
    # passed straight to `set_params`. The sampling order is kept stable so a
    # seeded sampler draws the parameters in the same sequence as before.
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform` (same ranges; `log=True` gives log-uniform sampling).
    params = {
        "tfidf__analyzer": trial.suggest_categorical(
            "tfidf__analyzer", ["word", "char", "char_wb"]
        ),
        "tfidf__lowercase": trial.suggest_categorical(
            "tfidf__lowercase", [False, True]
        ),
        "tfidf__max_features": trial.suggest_int(
            "tfidf__max_features", 500, 10_000
        ),
        "lgbc__num_leaves": trial.suggest_int("lgbc__num_leaves", 2, 150),
        "lgbc__max_depth": trial.suggest_int("lgbc__max_depth", 2, 100),
        "lgbc__n_estimators": trial.suggest_int("lgbc__n_estimators", 10, 200),
        "lgbc__subsample_for_bin": trial.suggest_int(
            "lgbc__subsample_for_bin", 2000, 300_000
        ),
        "lgbc__min_child_samples": trial.suggest_int(
            "lgbc__min_child_samples", 20, 500
        ),
        "lgbc__reg_alpha": trial.suggest_float("lgbc__reg_alpha", 0.0, 1.0),
        "lgbc__colsample_bytree": trial.suggest_float(
            "lgbc__colsample_bytree", 0.6, 1.0
        ),
        "lgbc__learning_rate": trial.suggest_float(
            "lgbc__learning_rate", 1e-3, 1e-1, log=True
        ),
    }

    model.set_params(**params)

    scores = cross_val_score(model, X, y, cv=8, n_jobs=-1)
    return -1 * np.mean(scores)

In [6]:
# Single cache location used for both the existence check and the dump.
# Previously the check looked for "optuna_study.pkl" while the dump wrote
# "study.pkl", so the cache was never hit and the search reran every time.
STUDY_PATH = "study.pkl"

if os.path.isfile(STUDY_PATH):
    # NOTE: joblib.load unpickles arbitrary objects -- only load a cache file
    # that this notebook itself produced.
    study = joblib.load(STUDY_PATH)
else:
    # create_study() is called with its defaults; `objective` returns the
    # negated CV score, so the lowest value corresponds to the best score.
    study = optuna.create_study()
    # Stop after 20 trials or 3600 seconds, whichever comes first.
    study.optimize(objective, n_trials=20, timeout=3600)
    joblib.dump(study, STUDY_PATH)

[I 2023-11-05 14:50:37,237] A new study created in memory with name: no-name-57860e42-77ce-4da9-a7d7-e05dcf7e9a71
[I 2023-11-05 14:50:44,753] Trial 0 finished with value: -0.6422 and parameters: {'tfidf__analyzer': 'word', 'tfidf__lowercase': True, 'tfidf__max_features': 5687, 'lgbc__num_leaves': 18, 'lgbc__max_depth': 68, 'lgbc__n_estimators': 17, 'lgbc__subsample_for_bin': 161774, 'lgbc__min_child_samples': 42, 'lgbc__reg_alpha': 0.6782263437420829, 'lgbc__colsample_bytree': 0.7160039924526378, 'lgbc__learning_rate': 0.0011808572133990506}. Best is trial 0 with value: -0.6422.
[I 2023-11-05 14:50:58,905] Trial 1 finished with value: -0.2862 and parameters: {'tfidf__analyzer': 'char_wb', 'tfidf__lowercase': False, 'tfidf__max_features': 4449, 'lgbc__num_leaves': 117, 'lgbc__max_depth': 36, 'lgbc__n_estimators': 135, 'lgbc__subsample_for_bin': 129298, 'lgbc__min_child_samples': 132, 'lgbc__reg_alpha': 0.7765518329150993, 'lgbc__colsample_bytree': 0.9931820171585545, 'lgbc__learning_rat

['study.pkl']

In [7]:
# Report the best objective value found by the study (the objective is the
# negated mean cross-validation score, hence the negative number).
print(f"Best_value: {study.best_value!s}")

Best_value: -0.7596


In [8]:
# Apply the best hyperparameters found by the study (keys are already in the
# pipeline's `<step>__<param>` form) and refit the pipeline on X and y.
model.set_params(**study.best_params).fit(X, y)