In [11]:
pip install matplotlib pandas numpy plotly seaborn optuna lightgbm scikit-learn


Collecting seaborn
  Obtaining dependency information for seaborn from https://files.pythonhosted.org/packages/7b/e5/83fcd7e9db036c179e0352bfcd20f81d728197a16f883e7b90307a88e65e/seaborn-0.13.0-py3-none-any.whl.metadata
  Using cached seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/b3/f8/ee33e36194eb03a76eccf3adac3fba51f0e56fbd20609bb531659d48d3cb/lightgbm-4.1.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.1.0-py3-none-win_amd64.whl.metadata (19 kB)
Using cached seaborn-0.13.0-py3-none-any.whl (294 kB)
Downloading lightgbm-4.1.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.3 MB 2.0 MB/s eta 0:00:01
   ---- ----------------------------------- 0.2/1.3 MB 1.8 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.3 MB 3.6 MB/s eta 0:00:01
   -------

In [12]:
%matplotlib inline
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import optuna
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import sklearn

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)

  from .autonotebook import tqdm as notebook_tqdm


**We are reading the train, validation and test splits.**

In [13]:
train = pd.read_csv(r"C:\Users\User\Videos\UTS\ꞮꞮꞮ.Semester\iLab 2\Countries\Greece\Wildfire Prediction\Greece_wildfire_prediction\data\Train, valid and test\train.csv" )
valid = pd.read_csv(r"C:\Users\User\Videos\UTS\ꞮꞮꞮ.Semester\iLab 2\Countries\Greece\Wildfire Prediction\Greece_wildfire_prediction\data\Train, valid and test\valid.csv" )
test = pd.read_csv(r"C:\Users\User\Videos\UTS\ꞮꞮꞮ.Semester\iLab 2\Countries\Greece\Wildfire Prediction\Greece_wildfire_prediction\data\Train, valid and test\test.csv")

# We are setting our target feature columns.

In [14]:
features = [
    'latitude', 'longitude', 'month',
    'fire_cnt_before', 'fire_before',
    'fire_cnt_last_year', 'fire_last_year',
    'fire_cnt_last_year_same_month', 'fire_last_year_same_month',
    'temperature_min', 'temperature_max', 'temperature_avg'
]

**We are defining our objective function to optimize. We want to maximize our ROC-AUC score for our classifier.**

In [15]:
def objective(trial):

    train_data = lgb.Dataset(train[features], label=train.fire)

    param = {
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02,0.05]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,50,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 256),
        'cat_l2' : trial.suggest_int('cat_smooth', 1, 256),
    }


    gbm = lgb.train(param, train_data)
    preds = gbm.predict(valid[features])
    pred_labels = np.rint(preds)
    accuracy = roc_auc_score(valid.fire, pred_labels)
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2023-10-17 17:33:25,356] A new study created in memory with name: no-name-71b11176-32ce-4cc4-8c78-f3e05b78b6ce
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
  'feature

Collecting plotly.express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Collecting plotly>=4.1.0 (from plotly.express)
  Obtaining dependency information for plotly>=4.1.0 from https://files.pythonhosted.org/packages/df/79/c80174d711ee26ee5da55a9cc3e248f1ec7a0188b5e4d6bbbbcd09b974b0/plotly-5.17.0-py2.py3-none-any.whl.metadata
  Downloading plotly-5.17.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting statsmodels>=0.9.0 (from plotly.express)
  Downloading statsmodels-0.14.0-cp39-cp39-win_amd64.whl (9.4 MB)
     ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
     ---------------------------------------- 0.1/9.4 MB 3.3 MB/s eta 0:00:03
     - -------------------------------------- 0.3/9.4 MB 3.5 MB/s eta 0:00:03
     - -------------------------------------- 0.3/9.4 MB 2.5 MB/s eta 0:00:04
     -- ------------------------------------- 0.6/9.4 MB 3.3 MB/s eta 0:00:03
     ---- ----------------------------------- 1.0/9.4 MB 4.8 MB/s eta 0:00:02
     --

  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
[I 2023-10-17 17:33:46,474] Trial 32 finished with value: 0.5 and parameters: {'lambda_l1': 8.080041745399432e-07, 'lambda_l2': 0.0003985074628047148, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.05, 'max_depth': 10, 'num_leaves': 384, 'feature_fraction': 0.5343813659552685, 'bagging_fraction': 0.5820267825199297, 'bagging_freq': 5, 'min_child_samples': 246, 'cat_smooth': 4}. Best is trial 3 with value: 0.5.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
[I 2023

In [16]:
study.best_params

{'lambda_l1': 9.08226931862397e-06,
 'lambda_l2': 3.064704281084003e-07,
 'colsample_bytree': 0.7,
 'subsample': 0.6,
 'learning_rate': 0.006,
 'max_depth': 100,
 'num_leaves': 897,
 'feature_fraction': 0.9091937267473535,
 'bagging_fraction': 0.2057801454225861,
 'bagging_freq': 3,
 'min_child_samples': 233,
 'cat_smooth': 11}