<a href="https://colab.research.google.com/github/stepthom/869_course/blob/main/optuna_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# My First Optuna Slides

[Optuna](https://optuna.org/) is great for advanced hyperparameter tuning.

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime
print(datetime.datetime.now())

2022-11-13 17:02:31.467412


In [2]:
pip install optuna

Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
     -------------------------------------- 348.5/348.5 kB 5.5 MB/s eta 0:00:00
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
     ------------------------------------- 209.8/209.8 kB 13.3 MB/s eta 0:00:00
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting cliff
  Downloading cliff-4.0.0-py3-none-any.whl (80 kB)
     -------------------------------------- 81.0/81.0 kB 906.6 kB/s eta 0:00:00
Collecting scipy<1.9.0,>=1.7.0
  Downloading scipy-1.8.1-cp39-cp39-win_amd64.whl (36.9 MB)
     --------------------------------------- 36.9/36.9 MB 15.2 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.3-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.7/78.7 kB ? eta 0:00:00
Collecting cmd2>=1.0.0
  Downloading cmd2-2.4.2-py3-none-any.whl (147 kB)
    

In [3]:
import pandas as pd
import optuna
import numpy as np

# Load and Prep Data

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/869_course/main/data/GermanCredit.csv")
df.head()

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,Good,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,Bad,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,Good,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,Good,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,Bad,...,0,0,1,0,0,1,0,0,1,0


In [5]:
target_name = 'Class'
X = df.drop([target_name], axis=1)
y = df[target_name].to_numpy()

# Create and Run an Optuna Study

First we must create an objective function. This is a function that builds a model (given a particular values for all hyperparameters) and returns a score.

Read more in the tutorials: https://optuna.readthedocs.io/en/stable/tutorial/index.html

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score


def objective_dt(trial, X, y):

  # Now, define all the hyperparams we want to vary, and what values they are allowed
  # to take.
  #
  # Each trial, optuna will automatically choose values for each hyperparam.
  hyper_params = {
        
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),

        "max_depth": trial.suggest_int("max_depth", 5, 100, step=5),

        "min_samples_split": trial.suggest_int("min_samples_split", 5, 100, step=5),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 5, 100, step=5),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 1010, step=100),

        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),

        "random_state": 77,
  }
    

  # Use the hyperparams that optuna has chosen for this trial to create a DecisionTreeClassifier
  clf = DecisionTreeClassifier(**hyper_params)

  # Run CV to see how well these hyper_params do
  cv_scores = cross_val_score(clf, X, y, cv=10, scoring="f1_macro")
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [7]:
# Create and run an Optuna study

# More options for creating the optuna study can be found at their webpage:
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.create_study.html
#
# The default sampler is called TPESampler and is very good, but there are others.

study = optuna.create_study(direction="maximize")


[32m[I 2022-11-13 17:05:36,693][0m A new study created in memory with name: no-name-1e24de8b-a501-4b57-8a59-9842021ec4ad[0m


In [8]:
# More options for optimizing the hyperparms can be found:
# https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize

study.optimize(lambda trial: objective_dt(trial, X, y), n_trials=100,  gc_after_trial=True)

[32m[I 2022-11-13 17:05:52,528][0m Trial 0 finished with value: 0.6337857831520763 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 80, 'min_samples_split': 30, 'min_samples_leaf': 100, 'max_leaf_nodes': 10, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6337857831520763.[0m
[32m[I 2022-11-13 17:05:52,818][0m Trial 1 finished with value: 0.6494559803481313 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 25, 'min_samples_split': 85, 'min_samples_leaf': 5, 'max_leaf_nodes': 1010, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6494559803481313.[0m
[32m[I 2022-11-13 17:05:53,100][0m Trial 2 finished with value: 0.6401232747569949 and parameters: {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 60, 'min_samples_split': 40, 'min_samples_leaf': 50, 'max_leaf_nodes': 710, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.6494559803481313.[0m
[32m[I 2022-11-13 17:05:53,495][0m Trial 3 finish

# (Optional) Inspect the Results of the Study

In [9]:
# What were the best params?
study.best_params

{'criterion': 'entropy',
 'splitter': 'best',
 'max_depth': 40,
 'min_samples_split': 65,
 'min_samples_leaf': 5,
 'max_leaf_nodes': 810,
 'class_weight': 'balanced'}

In [10]:
# What was the best value?
study.best_value

0.6644413489111798

In [11]:
# All the details of the best trial
study.best_trial

FrozenTrial(number=53, values=[0.6644413489111798], datetime_start=datetime.datetime(2022, 11, 13, 17, 6, 6, 607980), datetime_complete=datetime.datetime(2022, 11, 13, 17, 6, 6, 757719), params={'criterion': 'entropy', 'splitter': 'best', 'max_depth': 40, 'min_samples_split': 65, 'min_samples_leaf': 5, 'max_leaf_nodes': 810, 'class_weight': 'balanced'}, distributions={'criterion': CategoricalDistribution(choices=('gini', 'entropy')), 'splitter': CategoricalDistribution(choices=('best', 'random')), 'max_depth': IntDistribution(high=100, log=False, low=5, step=5), 'min_samples_split': IntDistribution(high=100, log=False, low=5, step=5), 'min_samples_leaf': IntDistribution(high=100, log=False, low=5, step=5), 'max_leaf_nodes': IntDistribution(high=1010, log=False, low=10, step=100), 'class_weight': CategoricalDistribution(choices=('balanced', None))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=53, state=TrialState.COMPLETE, value=None)

In [12]:
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [13]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()

# Build the Final Model (on all the data)

In [14]:
clf = DecisionTreeClassifier(**study.best_params)
clf.fit(X,y)

DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=40, max_leaf_nodes=810, min_samples_leaf=5,
                       min_samples_split=65)

In [15]:
 # ... and now you can make predictions on new (competition, or real) data, deploy, etc...