### prerequisite: pip install kaleido optuna

In [1]:
import catboost as cb
import category_encoders as ce
import json
import numpy as np
import optuna
import os
import pandas as pd

from sklearn import preprocessing
from lib.clean import clean
from lib.data_prepare import data_prepare
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import precision_recall_curve
from lib.plot import plot
from lib.stats import stats
from sklearn.metrics import classification_report
from pathlib import Path
from sklearn.model_selection import StratifiedKFold, KFold
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Datasets already read from disk, keyed by file path, so that successive
# trials reuse the same DataFrame instead of unpickling it again.
_DATASET_CACHE: dict = {}


def _load_dataset(file_path: str) -> pd.DataFrame:
    """Return the DataFrame pickled at *file_path*, reading it at most once."""
    if file_path not in _DATASET_CACHE:
        # NOTE(security): pd.read_pickle can execute arbitrary code while
        # unpickling; only point it at files from a trusted source.
        _DATASET_CACHE[file_path] = pd.read_pickle(file_path)
    return _DATASET_CACHE[file_path]


def objective(trial: optuna.Trial) -> float:
    """Score one sampled CatBoost configuration with 5-fold cross-validation.

    The hyper-parameters (``objective``, ``depth``, ``learning_rate``) are
    drawn from ``trial``. For every stratified fold the non-test part is split
    again into a training and a validation set; the validation set is only
    used as ``eval_set`` for early stopping, and accuracy is measured on the
    held-out test fold.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean test accuracy over the folds.
    """
    file_path = "/global/cfs/cdirs/kbase/KE-Catboost/ziming/InterProScan/common_samples/data/go/go_aggregate_4.1_rm30_corr0.9.pkl"
    # The dataset does not depend on the trial, so it is loaded once and reused.
    df = _load_dataset(file_path)
    # Features are every column after the first; the label is the 'biome'
    # column (presumably the first column -- TODO confirm against the data).
    X = df.iloc[:, 1:]
    y = df['biome']

    param = {
        "objective": trial.suggest_categorical("objective", ["MultiClassOneVsAll", "MultiClass"]),
        "depth": trial.suggest_int("depth", 4, 6, step=2), #Maximum tree depth is 16
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.2, step=0.05),
        # Further candidates, currently not part of the search space:
        # "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 5, step=1),
        # "random_strength": trial.suggest_int("random_strength", 1, 5, step=1),
        # "bagging_temperature": trial.suggest_int("bagging_temperature", 0, 5, step=1),
        # "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"])
    }

    fold = 5
    # random_state is deliberately not passed: shuffle is left at its default
    # (False), and scikit-learn reports that random_state has no effect then.
    skf = StratifiedKFold(n_splits=fold)
    acc_test = []
    for train_val, test in skf.split(X, y):
        X_train_val, y_train_val = X.iloc[train_val], y.iloc[train_val]
        X_test, y_test = X.iloc[test], y.iloc[test]
        # Passing stratify keeps the class proportions of y_train_val in both
        # the training and the validation part.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val,
            y_train_val,
            random_state=42,
            test_size=0.22,
            stratify=y_train_val,
        )

        # 1. Wrap the training and validation data in CatBoost pools.
        train_pool = Pool(data=X_train, label=y_train)
        val_pool = Pool(data=X_val, label=y_val)

        # 2. Init the model with the sampled hyper-parameters.
        gbm = cb.CatBoostClassifier(
            custom_metric='Accuracy',
            random_seed=42,
            task_type="GPU",
            **param)

        # 3. Train; the validation pool drives early stopping (100 rounds).
        gbm.fit(train_pool, eval_set=val_pool, verbose=0, early_stopping_rounds=100)

        # 4. Predict on the held-out fold and record its accuracy.
        preds = gbm.predict(X_test)
        acc_test.append(accuracy_score(y_test, preds))

    return sum(acc_test) / fold

  from pandas import Int64Index as NumericIndex


In [None]:
# Identifier of the study; it also names the SQLite file the study is stored in.
study_name = "go_optuna_cv"
study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///{}.db".format(study_name),
    direction="maximize",
    load_if_exists=True,  # reuse the stored study when one with this name exists
)
# Run five trials of the cross-validated objective.
study.optimize(objective, n_trials=5)

[I 2022-09-30 14:38:22,330] Using an existing study with name 'go_optuna_cv' instead of creating a new one.
[I 2022-09-30 14:41:06,723] Trial 1 finished with value: 0.7775360920567158 and parameters: {'objective': 'MultiClassOneVsAll', 'depth': 6, 'learning_rate': 0.1}. Best is trial 1 with value: 0.7775360920567158.
[I 2022-09-30 14:42:27,538] Trial 2 finished with value: 0.7769265816683284 and parameters: {'objective': 'MultiClassOneVsAll', 'depth': 4, 'learning_rate': 0.2}. Best is trial 1 with value: 0.7775360920567158.
[I 2022-09-30 14:45:12,118] Trial 3 finished with value: 0.775650685536083 and parameters: {'objective': 'MultiClassOneVsAll', 'depth': 6, 'learning_rate': 0.2}. Best is trial 1 with value: 0.7775360920567158.
[I 2022-09-30 14:48:13,169] Trial 4 finished with value: 0.7725480236005862 and parameters: {'objective': 'MultiClass', 'depth': 6, 'learning_rate': 0.1}. Best is trial 1 with value: 0.7775360920567158.

In [None]:
# Write the best hyper-parameters of the study to <output_dir>/best_params.json,
# creating the output directory first if it does not exist yet.
output_dir = "{}_output/".format(study_name)
best_params_path = Path(output_dir) / 'best_params.json'
best_params_path.parent.mkdir(parents=True, exist_ok=True)
with best_params_path.open('w') as fp:
    json.dump(study.best_params, fp)

In [None]:
# Build the parallel-coordinate plot of the study, write it to a PNG in the
# output directory, then display it.
parallel_plot = plot_parallel_coordinate(study)
parallel_plot_file = "{}_parallel_plot.png".format(study_name)
parallel_plot.write_image(os.path.join(output_dir, parallel_plot_file))
parallel_plot.show()

In [None]:
# Build the contour plot of the study, write it to a PNG in the output
# directory, then display it.
contour_plot = plot_contour(study)
contour_plot_file = "{}_contour_plot.png".format(study_name)
contour_plot.write_image(os.path.join(output_dir, contour_plot_file))
contour_plot.show()