<a href="https://colab.research.google.com/github/antbartash/australian_rain/blob/main/HyperBand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup on Google Colab

In [1]:
# install scikit-hyperband (debugged version)
!git clone https://github.com/PacktPublishing/Hyperparameter-Tuning-with-Python.git
%cp -r /content/Hyperparameter-Tuning-with-Python/hyperband /content

# install catboost
!pip install catboost

Cloning into 'Hyperparameter-Tuning-with-Python'...
remote: Enumerating objects: 259, done.[K
remote: Counting objects: 100% (259/259), done.[K
remote: Compressing objects: 100% (149/149), done.[K
remote: Total 259 (delta 90), reused 212 (delta 62), pack-reused 0[K
Receiving objects: 100% (259/259), 5.47 MiB | 9.97 MiB/s, done.
Resolving deltas: 100% (90/90), done.
Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate
from catboost import CatBoostClassifier
from scipy.stats import uniform, randint
from hyperband import HyperbandSearchCV

In [3]:
# Read data_transformed.csv straight from the project repository; the first
# CSV column is used as the DataFrame index.
DATA_URL = 'https://raw.githubusercontent.com/antbartash/australian_rain/main/data/data_transformed.csv'
data = pd.read_csv(DATA_URL, index_col=0)

# Quick sanity check: dimensions first, then the first few rows.
print(data.shape)
data.head()

(142193, 23)


Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,2.0,13.0,13.0,14.0,13.4,22.9,0.6,,,44.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0,12.0
1,2.0,14.0,6.0,15.0,7.4,25.1,0.0,,,44.0,...,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0,12.0
2,2.0,15.0,13.0,15.0,12.9,25.7,0.0,,,46.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0,12.0
3,2.0,4.0,9.0,0.0,9.2,28.0,0.0,,,24.0,...,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0,12.0
4,2.0,13.0,1.0,7.0,17.5,32.3,1.0,,,41.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0,12.0


In [4]:
# Target is RainTomorrow; both RainTomorrow and RainToday are excluded from
# the feature matrix.
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow', 'RainToday'])

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
def get_catboost_dataset(
    data,
    cat_columns=('Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'),
):
    """Return a copy of ``data`` with the categorical columns encoded as strings.

    Each column in ``cat_columns`` is cast to float32, missing values are
    replaced by -1, and every value is converted to its string form
    (e.g. ``2.0`` -> ``'2.0'``, ``NaN`` -> ``'-1.0'``). In this notebook the
    resulting columns are the ones passed as ``cat_features`` to
    ``CatBoostClassifier``.

    The input frame is NOT modified: the transformation is applied to a copy.
    This keeps the function idempotent across cell re-runs and avoids writing
    into a frame that may be a slice of another one (pandas'
    SettingWithCopyWarning).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``cat_columns``.
    cat_columns : iterable of str, optional
        Names of the columns to encode. Defaults to the four categorical
        columns used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``data``; the ``cat_columns``
        hold strings, all other columns are unchanged.

    Raises
    ------
    KeyError
        If a column named in ``cat_columns`` is missing from ``data``.
    """
    prepared = data.copy()
    for column in cat_columns:
        prepared[column] = (
            prepared[column]
            .astype(np.float32)
            .fillna(-1)   # sentinel category for missing values
            .map(str)
        )
    return prepared

# Encode the categorical columns of both splits as strings.
X_train = get_catboost_dataset(X_train)
X_test = get_catboost_dataset(X_test)

# Confirm the categorical columns are now non-numeric.
X_train.dtypes

Location          object
WindGustDir       object
WindDir9am        object
WindDir3pm        object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
Evaporation      float64
Sunshine         float64
WindGustSpeed    float64
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
Month            float64
dtype: object

# Baseline model

In [6]:
# Untuned reference model: 100 trees, all other hyperparameters left at
# their defaults, trained on GPU.
baseline_model = CatBoostClassifier(
    n_estimators=100,
    cat_features=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
    random_state=42,
    verbose=False,
    task_type='GPU',
)

# Cross-validated ROC AUC on the training split, keeping train scores too so
# the train/test gap is visible.
cv_result = cross_validate(
    baseline_model,
    X_train,
    y_train,
    scoring='roc_auc',
    return_train_score=True,
)

# Gini is reported as 2 * AUC - 1, using the mean AUC across folds.
mean_train_auc = cv_result['train_score'].mean()
mean_test_auc = cv_result['test_score'].mean()
mean_fit_time = cv_result['fit_time'].mean()

print(f"Mean train Gini: {mean_train_auc * 2 - 1}")
print(f"Mean test Gini: {mean_test_auc * 2 - 1}")
print(f"Mean fit time: {np.round(mean_fit_time, 2)} s")

Mean train Gini: 0.796968780534191
Mean test Gini: 0.7746998696201008
Mean fit time: 2.09 s


In [None]:
# Search space: scipy distributions are sampled, lists are chosen from.
hyperparameter_space = {
    'depth': randint(low=1, high=8),  # integers 1..7 (upper bound exclusive)
    'l2_leaf_reg': uniform(loc=0.0, scale=100.0),
    # 'random_strength': uniform(loc=0.01, scale=50), # CPU only
    'bagging_temperature': uniform(loc=0.0, scale=100.0),
    'grow_policy': ['SymmetricTree', 'Depthwise'],
    'scale_pos_weight': [1, 3, 3.5, 4],
}

# Same estimator setup as the baseline; n_estimators is the resource that
# HyperBand varies (see resource_param below).
model = CatBoostClassifier(
    n_estimators=100,
    cat_features=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
    random_state=42,
    verbose=False,
    task_type='GPU',
)

clf = HyperbandSearchCV(
    model,
    hyperparameter_space,
    resource_param='n_estimators',
    eta=3,
    # the minimum and maximum resources for all brackets
    min_iter=100,
    max_iter=10000,
    scoring='roc_auc',
    n_jobs=-1,
    cv=3,
    refit=False,
    random_state=42,
    verbose=3,
)
clf.fit(X_train, y_train)

Starting bracket 1 (out of 5) of hyperband
Starting successive halving iteration 1 out of 5. Fitting 81 configurations, with resource_param n_estimators set to 123, and keeping the best 27 configurations.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
