# Library

In [13]:
# Native library
import copy
import collections
import multiprocessing as mp
from typing import Union

import warnings
warnings.filterwarnings('ignore')

import os
import sys
path = os.path.join(os.pardir, os.pardir) # '../../src/'
sys.path.append(path)

# Save object
import joblib

from tqdm import tqdm

# Data management
import numpy as np
import pandas as pd
import xarray as xr

import plotly.express as px

from constants import TARGET, TARGET_TEST, FOLDER, S_COLUMNS, G_COLUMNS, M_COLUMNS

# Data prepocessing
from src.data.datascaler import DatasetScaler
# from sklearn.preprocessing import MinMaxScaler

# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Hyperoptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import wandb

# Regressor models
from xgboost import XGBRegressor
from lce import LCERegressor

# Model evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Constant

In [5]:
# Target variable
# TARGET = "Rice Yield (kg/ha)"
# TARGET_TEST = 'Predicted Rice Yield (kg/ha)'
# S_COLUMNS = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']
# G_COLUMNS = ['Field size (ha)', 'Rice Crop Intensity(D=Double, T=Triple)']
# M_COLUMNS = ['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'windspeed', 'winddir', 
#              'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex', 'moonphase', 'solarexposure']
# FOLDER = 'augment_10_5'

MODEL_PATH = os.path.join('model', FOLDER, 'XGBoost', 'Aggregate')
DATA_PATH = os.path.join(os.pardir, os.pardir, 'data', 'processed', FOLDER)
os.makedirs(MODEL_PATH, exist_ok=True)

# Train

## Import Data

In [6]:
xdf = xr.open_dataset(os.path.join(DATA_PATH, 'train_processed.nc'))
xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS], xdf[[TARGET]]])
xdf = xdf.drop(['name', 'datetime'])
xdf

## Compute aggregation

In [7]:
xdf = xr.concat([xdf.mean(dim='state_dev'), xdf.max(dim='state_dev'), xdf.min(dim='state_dev')], dim='agg')
xdf['agg'] = ['mean', 'max', 'min'] 
xdf

## Format Data

In [8]:
df = xdf.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'] + G_COLUMNS + [TARGET], drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'], drop=True, inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",Rice Yield (kg/ha),tempmax_max,tempmax_mean,tempmax_min,tempmin_max,tempmin_mean,tempmin_min,temp_max,...,osavi_min,rdvi_max,rdvi_mean,rdvi_min,mtvi1_max,mtvi1_mean,mtvi1_min,lswi_max,lswi_mean,lswi_min
ts_id,ts_obs,ts_aug,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,0,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.674072,0.934289,-0.082817,-1.597402,1.083855,-0.016945,-1.432723,0.701071,-0.267268,-1.028830
1,0,1,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.939070,1.127370,-0.058626,-1.827706,1.183513,-0.000760,-1.644201,0.946509,-0.254700,-1.322344
2,0,2,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.774103,1.378619,0.075912,-1.705366,1.509188,0.165716,-1.564741,1.112695,-0.182137,-1.263092
3,0,3,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.643810,1.177675,-0.058706,-1.523833,1.304718,-0.001834,-1.354105,0.977083,-0.181882,-1.134358
4,0,4,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.873486,1.346554,0.032197,-1.799793,1.421819,0.095357,-1.658284,1.152990,-0.212619,-1.562033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,556,5,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.884258,1.222347,-0.131362,-1.759351,1.266699,-0.177623,-1.656060,1.159994,0.060452,-1.632104
5566,556,6,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.912984,0.816759,-0.186636,-1.773445,0.821349,-0.257327,-1.659888,0.651309,-0.262772,-1.554542
5567,556,7,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.305538,1.048638,-0.078343,-1.433728,1.089599,-0.153113,-1.435511,0.984078,0.021034,-1.665391
5568,556,8,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-2.139328,1.242285,-0.147745,-1.967255,1.211020,-0.193899,-1.866153,1.031263,-0.099591,-1.732658


In [9]:
X, y = df.drop(columns=TARGET), df[[TARGET]]

## Split Data

In [10]:
# Split dataset into Train/Test subdataset equitably distributed according to TARGET
# Repeat the operation to create Train/Val/Test subdataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
train_idx, test_idx = train_test_split(X.index.levels[1], test_size=.2, random_state=0)
X_train, X_test, y_train, y_test = X.loc[:, train_idx, :], X.loc[:, test_idx, :], y.loc[:, train_idx, :], y.loc[:, test_idx, :] 

## Kfold

In [11]:
class ObsKFold(KFold):
    def __init__(self, n_splits=5, *, shuffle=False, random_state=None):
        super().__init__(n_splits, shuffle=shuffle, random_state=random_state)


    def split(self, X: pd.DataFrame, y=None, groups=None):

        if not isinstance(X, pd.DataFrame):
            raise TypeError("X expected pd.DataFrame but received %s." % (type(X)))
            
        full_idx = X.index.get_level_values('ts_obs')
        uniq_idx = full_idx.unique()
        for train, test in super().split(X=uniq_idx, y=None, groups=None):
            yield full_idx.isin(train), full_idx.isin(test)
    
kfold = ObsKFold()
for train, test in kfold.split(X_train):
    print(train)

[False False False ...  True  True  True]
[False False False ...  True  True  True]
[False False False ... False False False]
[False False False ...  True  True  True]
[False False False ...  True  True  True]


In [12]:
class WandbCallback():
    def __init__(self, project: str, tags: list[str], estimator_name: str) -> None:
        self.project = project
        self.tags = tags
        self.estimator_name = estimator_name
        
    def save(self, wandb_run, parameters, train_scores, test_scores, num_split)->None:
        run_name = "{}-{}".format(wandb_run['sweep_run_name'], num_split)
        run = wandb.init(
            project=self.project,
            tags=self.tags,
            group=wandb_run['sweep_id'],
            name=run_name,
            config=parameters,
            job_type=wandb_run['sweep_run_name'],
        )

        run.log(dict(test_scores=test_scores, train_scores=train_scores))
    
        wandb.join()

In [33]:
from sklearn.model_selection._validation import _fit_and_score

def _fit_score_callback(
        estimator,
        X,
        y,
        scorer,
        train,
        test,
        verbose,
        parameters,
        wandb_run,
        callback,
        fit_params,
        return_train_score=None,
        return_parameters=False,
        return_n_test_samples=False,
        return_times=False,
        return_estimator=False,
        split_progress=None,
        candidate_progress=None,
        error_score=np.nan,
):
    
    result =  _fit_and_score(
        estimator,
        X,
        y,
        scorer,
        train,
        test,
        verbose,
        parameters,
        fit_params,
        return_train_score=True,
        return_parameters=return_parameters,
        return_n_test_samples=return_n_test_samples,
        return_times=return_times,
        return_estimator=True,
        split_progress=split_progress,
        candidate_progress=candidate_progress,
        error_score=error_score,
    )
    
    callback.save(
        wandb_run, 
        parameters=result['estimator'].get_params(),
        train_scores=result['train_scores'],
        test_scores=result['test_scores'],
        num_split=split_progress[0]
        )

    return result

In [30]:
from collections import defaultdict
from itertools import product
import time

import numpy as np

from sklearn.base import is_classifier, clone

from sklearn.model_selection._split import check_cv
from sklearn.model_selection._validation import _insert_error_scores
from sklearn.model_selection._validation import _warn_or_raise_about_fit_failures
from joblib import Parallel
from sklearn.utils.validation import indexable, _check_fit_params
from sklearn.utils.fixes import delayed
from sklearn.metrics._scorer import _check_multimetric_scoring
from sklearn.metrics import check_scoring


class WandBGridSearchCV(GridSearchCV):
    def __init__(self, estimator, param_grid, callback, *, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch="2*n_jobs", error_score=np.nan, return_train_score=False):
        super().__init__(estimator, param_grid, scoring=scoring, n_jobs=n_jobs, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch, error_score=error_score, return_train_score=return_train_score)
        self.callback = callback

    def _wandb_sweep_run(self, n_candidates):
        for _ in range(n_candidates):
            sweep_run = wandb.init()
            sweep_id = sweep_run.id
            sweep_run_name = sweep_run.name
            # sweep_run.save()
            yield dict(sweep_id=sweep_id, sweep_run_name=sweep_run_name)

    def fit(self, X, y=None, *, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples, n_output) \
            or (n_samples,), default=None
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

        **fit_params : dict of str -> object
            Parameters passed to the `fit` method of the estimator.

            If a fit parameter is an array-like whose length is equal to
            `num_samples` then it will be split across CV groups along with `X`
            and `y`. For example, the :term:`sample_weight` parameter is split
            because `len(sample_weights) = len(X)`.

        Returns
        -------
        self : object
            Instance of fitted estimator.
        """
        
        # return super().fit(X=X, y=y, groups=groups, **fit_params)
        estimator = self.estimator
        refit_metric = "score"

        if callable(self.scoring):
            scorers = self.scoring
        elif self.scoring is None or isinstance(self.scoring, str):
            scorers = check_scoring(self.estimator, self.scoring)
        else:
            scorers = _check_multimetric_scoring(self.estimator, self.scoring)
            self._check_refit_for_multimetric(scorers)
            refit_metric = self.refit

        X, y, groups = indexable(X, y, groups)
        fit_params = _check_fit_params(X, fit_params)

        cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
        n_splits = cv_orig.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(
            scorer=scorers,
            fit_params=fit_params,
            return_train_score=self.return_train_score,
            return_n_test_samples=True,
            return_times=True,
            return_parameters=False,
            error_score=self.error_score,
            verbose=self.verbose,
        )
        results = {}
        with parallel:
            all_candidate_params = []
            all_out = []
            all_more_results = defaultdict(list)

            def evaluate_candidates(candidate_params, cv=None, more_results=None):
                cv = cv or cv_orig
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                wandb_runs = self._wandb_sweep_run(n_candidates)

                if self.verbose > 0:
                    print(
                        "Fitting {0} folds for each of {1} candidates,"
                        " totalling {2} fits".format(
                            n_splits, n_candidates, n_candidates * n_splits
                        )
                    )

                out = parallel(
                    delayed(_fit_score_callback)(
                        clone(base_estimator),
                        X,
                        y,
                        train=train,
                        test=test,
                        parameters=parameters,
                        split_progress=(split_idx, n_splits),
                        candidate_progress=(cand_idx, n_candidates),
                        wandb_run=wandb_run,
                        callback=self.callback,
                        **fit_and_score_kwargs,
                    )
                    for (cand_idx,(wandb_run, parameters)), (split_idx, (train, test)) in product(
                        enumerate(zip(wandb_runs, candidate_params)), enumerate(cv.split(X, y, groups))
                    )
                )

                if len(out) < 1:
                    raise ValueError(
                        "No fits were performed. "
                        "Was the CV iterator empty? "
                        "Were there no candidates?"
                    )
                elif len(out) != n_candidates * n_splits:
                    raise ValueError(
                        "cv.split and cv.get_n_splits returned "
                        "inconsistent results. Expected {} "
                        "splits, got {}".format(n_splits, len(out) // n_candidates)
                    )

                _warn_or_raise_about_fit_failures(out, self.error_score)

                # For callable self.scoring, the return type is only know after
                # calling. If the return type is a dictionary, the error scores
                # can now be inserted with the correct key. The type checking
                # of out will be done in `_insert_error_scores`.
                if callable(self.scoring):
                    _insert_error_scores(out, self.error_score)

                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                if more_results is not None:
                    for key, value in more_results.items():
                        all_more_results[key].extend(value)

                nonlocal results
                results = self._format_results(
                    all_candidate_params, n_splits, all_out, all_more_results
                )

                return results

            self._run_search(evaluate_candidates)

            # multimetric is determined here because in the case of a callable
            # self.scoring the return type is only known after calling
            first_test_score = all_out[0]["test_scores"]
            self.multimetric_ = isinstance(first_test_score, dict)

            # check refit_metric now for a callabe scorer that is multimetric
            if callable(self.scoring) and self.multimetric_:
                self._check_refit_for_multimetric(first_test_score)
                refit_metric = self.refit

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = self._select_best_index(
                self.refit, refit_metric, results
            )
            if not callable(self.refit):
                # With a non-custom callable, we can select the best score
                # based on the best index
                self.best_score_ = results[f"mean_test_{refit_metric}"][
                    self.best_index_
                ]
            self.best_params_ = results["params"][self.best_index_]

        if self.refit:
            # we clone again after setting params in case some
            # of the params are estimators as well.
            self.best_estimator_ = clone(
                clone(base_estimator).set_params(**self.best_params_)
            )
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

            if hasattr(self.best_estimator_, "feature_names_in_"):
                self.feature_names_in_ = self.best_estimator_.feature_names_in_

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self

In [32]:
param_grid = { 
    'max_depth' : [2, 3],
    'n_estimators': [100],
    'learning_rate': [0.01],
    # 'colsample_bytree': [2],
    # 'subsample': np.linspace(0.6, 1, 5, dtype=float),
}

xgr = XGBRegressor()

callback = WandbCallback('winged-bull', tags=['XGBoost', 'GridSearchCV', 'Aggregate'], estimator_name='XGBRegressor')
gcv = WandBGridSearchCV(xgr, param_grid, callback=callback, cv=ObsKFold(5), n_jobs=-1)
gcv.fit(X_train, y_train)
wandb.finish()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016675276400000407, max=1.0…

{'fit_error': None, 'test_scores': -0.44706947010204634, 'train_scores': 0.6002997785182085, 'n_test_samples': 720, 'fit_time': 0.6849620342254639, 'score_time': 0.005615711212158203, 'estimator': XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=2, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)}
{'fit_error': None, 'test_scores': 0.2987674885124695, 'train_scores': 0.6224659153111778, 'n_test_samples': 700, 'fit_t

wandb: Currently logged in as: urgellbapt (winged-bull). Use `wandb login --relogin` to force relogin
wandb: Currently logged in as: urgellbapt (winged-bull). Use `wandb login --relogin` to force relogin
wandb: Currently logged in as: urgellbapt (winged-bull). Use `wandb login --relogin` to force relogin
wandb: wandb version 0.14.0 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
wandb: Tracking run with wandb version 0.13.11
wandb: Run data is saved locally in /Users/titou/Documents.nosync/crop-forecasting/notebooks/baseline/wandb/run-20230316_202703-3s0lq750
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run dulcet-salad-6-2
wandb: ⭐️ View project at https://wandb.ai/winged-bull/winged-bull
wandb: 🚀 View run at https://wandb.ai/winged-bull/winged-bull/runs/3s0lq750
wandb: Waiting for W&B process to finish... (success).
wandb: wandb version 0.14.0 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade
wandb: Tracking ru

{'fit_error': None, 'test_scores': -0.33019595592573925, 'train_scores': 0.5949555283295972, 'n_test_samples': 700, 'fit_time': 0.8338677883148193, 'score_time': 0.0052831172943115234, 'estimator': XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)}
{'fit_error': None, 'test_scores': 0.5666068983943033, 'train_scores': 0.6098488952646368, 'n_test_samples': 700, 'fit_

wandb: \ Waiting for wandb.init()...
wandb: Run history:
wandb:  test_scores ▁
wandb: train_scores ▁
wandb: 
wandb: Run summary:
wandb:  test_scores 0.57639
wandb: train_scores 0.59121
wandb: 
wandb: 🚀 View run dulcet-salad-6-4 at: https://wandb.ai/winged-bull/winged-bull/runs/p18zt0uq
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
wandb: Find logs at: ./wandb/run-20230316_202705-p18zt0uq/logs
wandb: 
wandb: Run history:
wandb:  test_scores ▁
wandb: train_scores ▁
wandb: 
wandb: Run summary:
wandb:  test_scores 0.36368
wandb: train_scores 0.62525
wandb: 
wandb: 
wandb: Run history:
wandb:  test_scores ▁
wandb: train_scores ▁
wandb: 
wandb: Run summary:
wandb:  test_scores 0.28529
wandb: train_scores 0.64312
wandb: 
wandb: 🚀 View run dulcet-salad-6-0 at: https://wandb.ai/winged-bull/winged-bull/runs/xryyvcv2
wandb: Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
wandb: Find logs at: ./wandb/run-20230316_202705-xryyvcv2/

## Machine Learning Regression

### WandB

In [None]:
Worker = collections.namedtuple("Worker", ("queue", "process"))
WorkerInitData = collections.namedtuple(
    "WorkerInitData", ("num", "sweep_id", "sweep_run_name", "config")
)
WorkerDoneData = collections.namedtuple("WorkerDoneData", ("val_accuracy"))

In [None]:
def reset_wandb_env():
    exclude = {
        "WANDB_PROJECT",
        "WANDB_ENTITY",
        "WANDB_API_KEY",
    }
    for k, v in os.environ.items():
        if k.startswith("WANDB_") and k not in exclude:
            del os.environ[k]

### Train

In [None]:
def train(sweep_q, worker_q, estimator):
    reset_wandb_env()
    worker_data = worker_q.get()
    run_name = "{}-{}".format(worker_data.sweep_run_name, worker_data.num)
    config = worker_data.config
    run = wandb.init(
        group=worker_data.sweep_id,
        job_type=worker_data.sweep_run_name,
        name=run_name,
        config=config,
    )
    estimator(**wandb.)
    val_accuracy = estimator.fit(**wandb.config).score()
    run.log(dict(val_accuracy=val_accuracy))
    wandb.join()
    sweep_q.put(WorkerDoneData(val_accuracy=val_accuracy))

In [None]:
num_folds = 10

sweep_q = mp.Queue()
workers = []

for num in range(num_folds):
    q = mp.Queue()
    p = mp.Process(
        target=train, kwargs=dict(sweep_q=sweep_q, worker_q=q)
    )
    p.start()
    workers.append(Worker(queue=q, process=p))

sweep_run = wandb.init()
sweep_id = sweep_run.sweep_id or "unknown"
sweep_url = sweep_run.get_sweep_url()
project_url = sweep_run.get_project_url()
sweep_group_url = "{}/groups/{}".format(project_url, sweep_id)
sweep_run.notes = sweep_group_url
sweep_run.save()
sweep_run_name = sweep_run.name or sweep_run.id or "unknown"

metrics = []
for num in range(num_folds):
    worker = workers[num]
    # start worker
    worker.queue.put(
        WorkerInitData(
            sweep_id=sweep_id,
            num=num,
            sweep_run_name=sweep_run_name,
            config=dict(sweep_run.config),
        )
    )
    # get metric from worker
    result = sweep_q.get()
    # wait for worker to finish
    worker.process.join()
    # log metric to sweep_run
    metrics.append(result.val_accuracy)

sweep_run.log(dict(val_accuracy=sum(metrics) / len(metrics)))
wandb.join()


In [None]:
wandb.init(
        project='winged-bull',
        group=
        tags=['XGBoost', 'Aggregation', 'Augmented', 'Machine Learning']
    )

### XGBoost

In [None]:
xgbr = XGBRegressor(random_state=0, callbacks=[WandbCallback()])

param_grid = { 
    'max_depth' : np.linspace(2, 10, 5, dtype=int),
    'n_estimators': np.linspace(100, 1000, 5, dtype=int),
    'learning_rate': np.linspace(0.01, 0.3, 5, dtype=float),
    'colsample_bytree': np.linspace(0.5, 1, 5, dtype=float),
    # 'subsample': np.linspace(0.6, 1, 5, dtype=float),
}

# Perform a grid search
cv_xgbr = GridSearchCV(xgbr, param_grid, n_jobs=-1, verbose=10)
cv_xgbr.fit(X_train, y_train)

Fitting 5 folds for each of 625 candidates, totalling 3125 fits
[CV 2/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
[CV 3/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
[CV 1/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
[CV 4/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
[CV 5/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
[CV 2/5; 2/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325
[CV 1/5; 2/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325
[CV 3/5; 2/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325
[CV 3/5; 1/625] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100;, score=0.608 total time=   0.5s
[CV 1/5; 1/625] END colsample_bytree=0.5, learning_rate=0.0

In [None]:
cv_xgbr.best_params_

{'colsample_bytree': 0.625,
 'learning_rate': 0.01,
 'max_depth': 10,
 'n_estimators': 1000}

#### Evaluate

In [None]:
scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = cv_xgbr.predict(X_test)
y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
y_true = scaler.scaler_t.inverse_transform(y_test).reshape(-1)

rmse = mean_squared_error(y_true, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f'RMSE: {rmse:.0f} | MAPE: {100*mape:.1f}% | R2 score: {r2:.3f}')

RMSE: 252 | MAPE: 2.8% | R2 score: 0.897


In [None]:
# Plot features importance
df_fi = pd.DataFrame(columns=['Feature', 'Importance'])
df_fi['Feature'] = X.columns
df_fi['Importance'] = cv_xgbr.best_estimator_.feature_importances_
df_fi.sort_values('Importance', inplace=True, ascending=False)

fig = px.bar(df_fi.head(10), x='Feature', y='Importance', title="Feature importance")
fig.show()

### Save the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'cv_xgboost.save')
joblib.dump(cv_xgbr, model_filename)

### Train Final Model

In [None]:
param = copy.deepcopy(cv_xgbr.best_params_)
param["random_state"] = 0
param["tree_method"] = 'gpu_hist'

xgbr = XGBRegressor(**param)

xgbr.fit(X, y)

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
joblib.dump(xgbr, model_filename)

# Test

## Import Data

In [None]:
xdf = xr.open_dataset(os.path.join(DATA_PATH, 'test_processed.nc'))
xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS]])
xdf = xdf.drop(['name', 'datetime'])
xdf

## Compute aggregation

In [None]:
xdf = xr.concat([xdf.mean(dim='state_dev'), xdf.max(dim='state_dev'), xdf.min(dim='state_dev')], dim='agg')
xdf['agg'] = ['mean', 'max', 'min'] 
xdf

## Format Data

In [None]:
df = xdf.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'] + G_COLUMNS, drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'], drop=True, inplace=True)
X = df
X

## Load the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
model: XGBRegressor = joblib.load(model_filename)

## Predict data

In [None]:
scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = model.predict(X_test)
y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

In [None]:
s_pred = pd.Series(y_pred, index=df.index, name=TARGET_TEST)
s_pred.reset_index(['ts_id', 'ts_aug'], drop=True, inplace=True)
s_pred.reset_index(inplace=True)
s_pred = s_pred.groupby('ts_obs').mean()

## Create Submissions file

In [None]:
df_sub = pd.read_csv(os.path.join(os.pardir, os.pardir, 'data', 'raw', 'test.csv'))
df_sub.drop(columns=TARGET_TEST, inplace=True)
df_sub = pd.merge(df_sub, s_pred, right_index=True, left_index=True)

In [None]:
sub_filename = os.path.join(MODEL_PATH, 'submission.csv')
df_sub.to_csv(sub_filename, index=False, header=True)