<a href="https://colab.research.google.com/github/antbartash/australian_rain/blob/main/Grid_Random_HalvingSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from lightgbm import LGBMClassifier

In [2]:
data = pd.read_csv(
    'https://raw.githubusercontent.com/antbartash/australian_rain/main/data/data_transformed.csv',
    index_col=0
  )

print(data.shape)
data.head()

(142193, 23)


Unnamed: 0,Location,WindGustDir,WindDir9am,WindDir3pm,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,2.0,13.0,13.0,14.0,13.4,22.9,0.6,,,44.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0.0,0.0,12.0
1,2.0,14.0,6.0,15.0,7.4,25.1,0.0,,,44.0,...,25.0,1010.6,1007.8,,,17.2,24.3,0.0,0.0,12.0
2,2.0,15.0,13.0,15.0,12.9,25.7,0.0,,,46.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0.0,0.0,12.0
3,2.0,4.0,9.0,0.0,9.2,28.0,0.0,,,24.0,...,16.0,1017.6,1012.8,,,18.1,26.5,0.0,0.0,12.0
4,2.0,13.0,1.0,7.0,17.5,32.3,1.0,,,41.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0.0,0.0,12.0


In [3]:
X, y = data.drop(columns=['RainTomorrow', 'RainToday']), data['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)


# Baseline model

In [4]:
baseline_model = LGBMClassifier(random_state=42, verbose=0)
cv_result = cross_validate(baseline_model, X, y, scoring='roc_auc',
                           return_train_score=True)

print(f"Mean train Gini: {cv_result['train_score'].mean() * 2 - 1}")
print(f"Mean test Gini: {cv_result['test_score'].mean() * 2 - 1}")

Mean train Gini: 0.8193700395010979
Mean test Gini: 0.7007491769077299


# GridSearchCV

In [None]:
hyperparameter_space = {
    'n_estimators': [100, 250, 500, 1000],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],
    'num_leaves': [30],
    'max_depth': [1, 2, 6, -1],
    'class_weight': [None],
    'colsample_bytree': [0.75, 1],
    'reg_alpha': [0, 1, 5],
    'reg_lambda': [0, 1, 5]
}

model = LGBMClassifier(random_state=42, verbose=0)
clf = GridSearchCV(model, hyperparameter_space, cv=3,
                   scoring='roc_auc', verbose=1)
clf.fit(X_train, y_train)

print(f"Best Gini: {clf.best_score_ * 2 - 1}")
print(f"Best params: {clf.best_params_}")

Fitting 3 folds for each of 1440 candidates, totalling 4320 fits
