# Library

In [1]:
# Native library
import copy
import warnings
import pathlib
import os
import sys
warnings.filterwarnings('ignore')
path = os.path.join(os.pardir, os.pardir, 'src') # '../../src/'
sys.path.append(path)

# Save object
import joblib

# Data management
import numpy as np
import pandas as pd
import xarray as xr

# Data prepocessing
from datascaler import DatasetScaler
from sklearn.preprocessing import MinMaxScaler

# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hyperoptimization
from sklearn.model_selection import GridSearchCV

# Regressor models
from xgboost import XGBRegressor

# Model evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [2]:
# Target variable
TARGET = "Rice Yield (kg/ha)"
TARGET_TEST = 'Predicted Rice Yield (kg/ha)'
S_COLUMNS = ['ndvi', 'savi', 'evi', 'rep', 'osavi', 'rdvi', 'mtvi1', 'lswi']
G_COLUMNS = ['Field size (ha)', 'Rice Crop Intensity(D=Double, T=Triple)']
M_COLUMNS = ['tempmax', 'tempmin', 'temp', 'dew', 'humidity', 'precip', 'precipprob', 'precipcover', 'windspeed', 'winddir', 
             'sealevelpressure', 'cloudcover', 'solarradiation', 'solarenergy', 'uvindex', 'moonphase', 'solarexposure']
FOLDER = 'augment_10_5'

MODEL_PATH = os.path.join('model', FOLDER, 'XGBoost', 'Aggregate')
DATA_PATH = os.path.join(os.pardir, os.pardir, 'data', 'processed', FOLDER)
os.makedirs(MODEL_PATH, exist_ok=True)

# Train

## Import Data

In [3]:
xdf = xr.open_dataset(os.path.join(DATA_PATH, 'train_processed.nc'))
xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS], xdf[[TARGET]]])
xdf = xdf.drop(['name', 'datetime'])
xdf

## Compute aggregation

In [4]:
xdf = xr.concat([xdf.mean(dim='state_dev'), xdf.max(dim='state_dev'), xdf.min(dim='state_dev')], dim='agg')
xdf['agg'] = ['mean', 'max', 'min'] 
xdf

## Format Data

In [5]:
df = xdf.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'] + G_COLUMNS + [TARGET], drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'], drop=True, inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",Rice Yield (kg/ha),tempmax_max,tempmax_mean,tempmax_min,tempmin_max,tempmin_mean,tempmin_min,temp_max,...,osavi_min,rdvi_max,rdvi_mean,rdvi_min,mtvi1_max,mtvi1_mean,mtvi1_min,lswi_max,lswi_mean,lswi_min
ts_id,ts_obs,ts_aug,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0,0,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.674072,0.934289,-0.082817,-1.597402,1.083855,-0.016945,-1.432723,0.701071,-0.267268,-1.028830
1,0,1,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.939070,1.127370,-0.058626,-1.827706,1.183513,-0.000760,-1.644201,0.946509,-0.254700,-1.322344
2,0,2,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.774103,1.378619,0.075912,-1.705366,1.509188,0.165716,-1.564741,1.112695,-0.182137,-1.263092
3,0,3,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.643810,1.177675,-0.058706,-1.523833,1.304718,-0.001834,-1.354105,0.977083,-0.181882,-1.134358
4,0,4,0.588832,0.43024,0.107143,1.605007,0.403939,-1.110033,1.966868,0.492688,-0.666903,2.161660,...,-1.873486,1.346554,0.032197,-1.799793,1.421819,0.095357,-1.658284,1.152990,-0.212619,-1.562033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5565,556,5,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.884258,1.222347,-0.131362,-1.759351,1.266699,-0.177623,-1.656060,1.159994,0.060452,-1.632104
5566,556,6,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.912984,0.816759,-0.186636,-1.773445,0.821349,-0.257327,-1.659888,0.651309,-0.262772,-1.554542
5567,556,7,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-1.305538,1.048638,-0.078343,-1.433728,1.089599,-0.153113,-1.435511,0.984078,0.021034,-1.665391
5568,556,8,-0.294495,0.43024,0.714286,1.605007,0.095849,-1.283334,1.088945,-0.502292,-2.422750,1.893658,...,-2.139328,1.242285,-0.147745,-1.967255,1.211020,-0.193899,-1.866153,1.031263,-0.099591,-1.732658


In [6]:
X, y = df.drop(columns=TARGET), df[[TARGET]]

## Split Data

In [7]:
# Split dataset into Train/Test subdataset equitably distributed according to TARGET
# Repeat the operation to create Train/Val/Test subdataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Machine Learning Regression

### XGBoost

#### Train

In [None]:
xgbr = XGBRegressor(random_state=0, tree_method='gpu_hist')

param_grid = { 
    'max_depth' : np.linspace(2, 10, 5, dtype=int),
    'n_estimators': np.linspace(100, 1000, 5, dtype=int),
    'learning_rate': np.linspace(0.01, 0.3, 5, dtype=float),
    'colsample_bytree': np.linspace(0.5, 1, 5, dtype=float),
    # 'subsample': np.linspace(0.6, 1, 5, dtype=float),
}

# Perform a grid search
cv_xgbr = GridSearchCV(xgbr, param_grid, n_jobs=-1, verbose=10)
cv_xgbr.fit(X_train, y_train)

Fitting 5 folds for each of 625 candidates, totalling 3125 fits
[CV 4/5; 1/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100
Parameters: { "verbose" } are not used.

[CV 4/5; 1/625] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=100;, score=0.600 total time=   3.1s
[CV 2/5; 2/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325
Parameters: { "verbose" } are not used.

[CV 2/5; 2/625] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325;, score=0.702 total time=   4.8s
[CV 5/5; 2/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325
Parameters: { "verbose" } are not used.

[CV 5/5; 2/625] END colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=325;, score=0.690 total time=   4.0s
[CV 4/5; 3/625] START colsample_bytree=0.5, learning_rate=0.01, max_depth=2, n_estimators=550
Parameters: { "verbose" } are not used.

[CV 4/5; 3/625] END c

In [None]:
cv_xgbr.best_params_

#### Evaluate

In [None]:
scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = cv_xgbr.predict(X_test)
y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
y_true = scaler.scaler_t.inverse_transform(X_test.reshape(-1, 1)).reshape(-1)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.0f} | MAPE: {100*mape:.1f}% | R2 score: {r2:.3f}')

In [None]:
# Plot features importance
df_fi = pd.DataFrame(columns=['Feature', 'Importance'])
df_fi['Feature'] = X.columns
df_fi['Importance'] = cv_xgbr.best_estimator_.feature_importances_
df_fi.sort_values('Importance', inplace=True, ascending=False)

fig = px.bar(df_fi.head(10), x='Feature', y='Importance', title="Feature importance")
fig.show()

### Save the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'cv_xgboost.save')
joblib.dump(cv_xgbr, model_filename)

### Train Final Model

In [None]:
param = copy.deepcopy(cv_xgbr.best_params_)
param["random_state"] = 0
param["tree_method"] = 'gpu_hist'

xgbr = XGBRegressor(**param)

xgbr.fit(X, y)

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
joblib.dump(xgbr, model_filename)

# Test

## Import Data

In [None]:
xdf = xr.open_dataset(os.path.join(DATA_PATH, 'test_processed.nc'))
xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS]])
xdf = xdf.drop(['name', 'datetime'])
xdf

## Compute aggregation

In [None]:
xdf = xr.concat([xdf.mean(dim='state_dev'), xdf.max(dim='state_dev'), xdf.min(dim='state_dev')], dim='agg')
xdf['agg'] = ['mean', 'max', 'min'] 
xdf

## Format Data

In [None]:
df = xdf.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'] + G_COLUMNS, drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'], drop=True, inplace=True)
X = df
X

## Load the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
model: XGBRegressor = joblib.load(model_filename)

## Predict data

In [None]:
scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = model.predict(X_test)
y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

In [None]:
s_pred = pd.Series(y_pred, index=df.index, name=TARGET_TEST)
s_pred.reset_index(['ts_id', 'ts_aug'], drop=True, inplace=True)
s_pred.reset_index(inplace=True)
s_pred = s_pred.groupby('ts_obs').mean()

## Create Submissions file

In [None]:
df_sub = pd.read_csv(os.path.join(os.pardir, os.pardir, 'data', 'raw', 'test.csv'))
df_sub.drop(columns=TARGET_TEST, inplace=True)
df_sub = pd.merge(df_sub, s_pred, right_index=True, left_index=True)

In [None]:
sub_filename = os.path.join(MODEL_PATH, 'submission.csv')
df_sub.to_csv(sub_filename, index=False, header=True)