# Library

In [57]:
# Native library
import os, sys

path = os.path.join(os.pardir, os.pardir)
sys.path.insert(1, path)

from utils import ROOT_DIR

# Data management
import numpy as np
import pandas as pd
import xarray as xr

from src.constants import TARGET, FOLDER, S_COLUMNS

# Data preprocessing
from src.data.preprocessing import Smoother, Convertor, Filler, Sorter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Regressor models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Training
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

# Visualization
import plotly.express as px

In [58]:
# Preprocessing chain, listed in execution order: fill -> smooth -> convert
# (with aggregation enabled) -> sort. Each entry is a (step name, transformer)
# pair as expected by sklearn's Pipeline.
steps_pipeline = list(
    zip(
        ("filler", "smoother", "convertor", "sorter"),
        (Filler(), Smoother(), Convertor(agg=True), Sorter()),
    )
)

# No estimator step is appended: this pipeline only transforms the data.
pipeline = Pipeline(steps=steps_pipeline)

In [59]:
def preprocess_y(xds: xr.Dataset) -> pd.DataFrame:
    """Build the target table from ``xds``, one row per ``(ts_obs, ts_aug)`` pair.

    The dataset is restricted to ``TARGET`` and the ``S_COLUMNS`` variables and
    converted to a DataFrame. Only the ``TARGET`` column is kept, and each
    ``(ts_obs, ts_aug)`` group is collapsed to its first non-null value.

    Parameters
    ----------
    xds : xr.Dataset
        Dataset holding ``TARGET`` and every variable listed in ``S_COLUMNS``.
        ``ts_obs`` and ``ts_aug`` must be index levels of the converted frame.

    Returns
    -------
    pd.DataFrame
        Single-column frame (``TARGET``) indexed by ``(ts_obs, ts_aug)`` and
        sorted by that index.
    """
    df = xds[[TARGET] + S_COLUMNS].to_dataframe()
    # Grouping on the two keys already yields a MultiIndex with the levels in
    # this order, so no level reordering is needed afterwards.
    y = df[[TARGET]].groupby(["ts_obs", "ts_aug"]).first()
    return y.sort_index()

# Location of the interim training set: <ROOT_DIR>/data/interim/<FOLDER>/train.nc
data_path = os.path.join(
    os.path.join(ROOT_DIR, "data", "interim", FOLDER),
    "train.nc",
)

# Open the NetCDF file through the scipy backend.
xds = xr.open_dataset(data_path, engine="scipy")

# Target table for the whole training set.
y_train = preprocess_y(xds)

In [60]:
# K-fold cross-validation of a default XGBRegressor.
# Folds are drawn over the distinct `ts_obs` coordinate values, so every row
# sharing a `ts_obs` value lands on the same side of the split.
obs_idx = xds["ts_obs"].values
n_splits = 10

# One score per fold, kept so the spread across folds can be inspected.
fold_scores = []
for index_train, index_test in KFold(n_splits=n_splits).split(obs_idx):
    xds_train = xds.sel(ts_obs=obs_idx[index_train])
    xds_test = xds.sel(ts_obs=obs_idx[index_test])

    # The preprocessing is fitted on the training fold only and then applied
    # unchanged to the held-out fold.
    X_train = pipeline.fit_transform(xds_train)
    X_test = pipeline.transform(xds_test)

    y_train = preprocess_y(xds_train)
    y_test = preprocess_y(xds_test)

    # A fresh model per fold; `score` is the regressor's default metric
    # (R^2 for scikit-learn style regressors).
    val_split_score = XGBRegressor().fit(X_train, y_train).score(X_test, y_test)
    fold_scores.append(val_split_score)

# `val_score` stays the sum over folds; the cell displays the mean fold score.
val_score = sum(fold_scores)
val_score / n_splits

-0.24561670424404988

In [61]:
# Plot feature importance
# The cross-validation loop does not keep any of its fold models, so a model
# is fitted here on the full training set. Note that this refits `pipeline`
# on all of `xds`.
X_full = pipeline.fit_transform(xds)
y_full = preprocess_y(xds)
xgb_model = XGBRegressor().fit(X_full, y_full)

# NOTE(review): assumes the pipeline output is a DataFrame exposing `.columns`
# — confirm against the Sorter/Convertor implementation.
df_fi = pd.DataFrame(
    {
        "Feature": X_full.columns,
        "Importance": xgb_model.feature_importances_,
    }
).sort_values("Importance", ascending=False)

# Show only the ten most important features.
fig = px.bar(df_fi.head(10), x="Feature", y="Importance", title="Feature importance")
fig.show()