# Library

In [28]:
# Native library
import os, sys

path = os.path.join(".")
sys.path.insert(1, path)

from utils import ROOT_DIR

# Data management
import numpy as np
import xarray as xr

from src.constants import TARGET, FOLDER, S_COLUMNS

# Data prepocessing
from src.data.preprocessing import Smoother, Convertor, Filler, Sorter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Regressor models
from xgboost import XGBRegressor

# Training
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

In [29]:
steps_pipeline = [
    ("filler", Filler()),
    ("smoother", Smoother()),
    ("convertor", Convertor(agg=True)),
    ("sorter", Sorter()),
]

# steps_pipeline.append(("estimator", XGBRegressor()))
pipeline = Pipeline(steps_pipeline)

In [30]:
def preprocess_y(xds: xr.Dataset) -> np.ndarray:
    df = xds[[TARGET] + S_COLUMNS].to_dataframe()
    y = df[[TARGET]].groupby(["ts_obs", "ts_aug"]).first()
    return y.reorder_levels(["ts_obs", "ts_aug"]).sort_index()

data_path = os.path.join(ROOT_DIR, "data", "interim", FOLDER, "train.nc")
xds = xr.open_dataset(data_path, engine="scipy")

y_train = preprocess_y(xds)

In [33]:
obs_idx = xds["ts_obs"].values
obs_idx = obs_idx.reshape(-1, 1)
n_splits = 10
xds_train = None
xds_test = None
y_train = None
y_test = None
for index_train, index_test in KFold(n_splits=n_splits).split(obs_idx):
    xds_train = xds.sel(ts_obs=obs_idx[index_train].reshape(-1))
    xds_test = xds.sel(ts_obs=obs_idx[index_test].reshape(-1))
    
    train_df = pipeline.fit_transform(xds_train)
    test_df = pipeline.fit_transform(xds_test)
    
    y_train = preprocess_y(xds_train)
    y_test = preprocess_y(xds_test)
    
    break

In [38]:
(train_df.index == y_train.index).all() & (test_df.index == y_test.index).all()

True

In [27]:
XGBRegressor(pipeline.fit_transform(xds, y_train), y_train).

Unnamed: 0_level_0,Unnamed: 1_level_0,Rice Yield (kg/ha)
ts_obs,ts_aug,Unnamed: 2_level_1
0,0,5500
0,1,5500
0,2,5500
0,3,5500
0,4,5500
...,...,...
556,95,7200
556,96,7200
556,97,7200
556,98,7200


In [15]:
pipeline['estimator'].feature_importances_

array([0.5049667 , 0.0100252 , 0.01040307, 0.01911996, 0.00649522,
       0.01092529, 0.0138432 , 0.01213815, 0.01221612, 0.01495591,
       0.01046163, 0.00944952, 0.01718526, 0.01087538, 0.01007912,
       0.01153454, 0.00555495, 0.01048146, 0.01895331, 0.01081887,
       0.00893562, 0.00704036, 0.008929  , 0.00737329, 0.0082045 ,
       0.00763884, 0.00505139, 0.0106377 , 0.01274521, 0.00798748,
       0.01018606, 0.00776447, 0.00934054, 0.00847016, 0.0070497 ,
       0.01198343, 0.00515981, 0.00467342, 0.00909384, 0.01395343,
       0.00615744, 0.00923261, 0.00908891, 0.00998733, 0.01257749,
       0.00954159, 0.00270748, 0.01156283, 0.01079056, 0.01565266],
      dtype=float32)