# Library

In [31]:
# Native library
import copy
import collections
import multiprocessing as mp
from typing import Union

import warnings
warnings.filterwarnings('ignore')

import os
import sys
path = os.path.join(os.pardir, os.pardir)
sys.path.append(path)

# Save object
import joblib

from tqdm import tqdm

# Data management
import numpy as np
import pandas as pd
import xarray as xr

import plotly.express as px

from src.constants import TARGET, TARGET_TEST, FOLDER, S_COLUMNS, G_COLUMNS, M_COLUMNS
from src.data.preprocessing import Smoother, Convertor, Filler, Sorter

# Data prepocessing
from src.data.datascaler import DatasetScaler
# from src.data.process_data import statedev_fill
# from sklearn.preprocessing import MinMaxScaler

# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Hyperoptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import wandb

# Regressor models
from xgboost import XGBRegressor


# Model evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Constant

In [32]:
DATA_PATH = os.path.join(os.pardir, os.pardir, 'data', 'interim', FOLDER)

## Import Data

In [33]:
xds = xr.open_dataset(os.path.join(DATA_PATH, 'train.nc'), engine='scipy')
xds = xds.sel(ts_obs=np.arange(555, step=2))
# xds = xr.merge([xds[G_COLUMNS], xds[M_COLUMNS].sel(datetime=xds['time'], name=xds['District']), xds[S_COLUMNS], xds[[TARGET]]])
# xds = xds.drop(['name', 'datetime'])
xds

# Fill Nan values

In [34]:
def statedev_fill(xds: xr.Dataset) -> xr.Dataset:
    # Fill missing vegetable indice and replace abnormal values

    def replaceinf(arr: np.ndarray) -> np.ndarray:
        if np.issubdtype(arr.dtype, np.number):
            arr[np.isinf(arr)] = np.nan
        return arr

    # replace ± infinite value by na
    xr.apply_ufunc(replaceinf, xds[S_COLUMNS])
    # compute mean of all stage of developpement and all obsevation
    xds_mean = xds[S_COLUMNS].mean(dim="ts_aug", skipna=True)
    # fill na value with computed mean
    xds[S_COLUMNS] = xds[S_COLUMNS].fillna(xds_mean)
    # compute mean of all stage of developpement of rice field to complete last na values
    xds_mean = xds_mean.mean(dim="ts_obs", skipna=True)
    # fill na value with computed mean
    xds[S_COLUMNS] = xds[S_COLUMNS].fillna(xds_mean)

    return xds, xds_mean

xds, xds_mean = statedev_fill(xds)

In [35]:
filler = Filler()
xds_filler = filler.fit_transform(xr.open_dataset(os.path.join(DATA_PATH, 'train.nc'), engine='scipy'))

In [36]:
(xds[S_COLUMNS + M_COLUMNS + G_COLUMNS] == xds_filler[S_COLUMNS + M_COLUMNS + G_COLUMNS]).all()

# Smooth data

In [38]:
xds = Smoother().fit_transform(xds)

## Compute aggregation

In [39]:
xds = xr.merge([xds[G_COLUMNS], xds[M_COLUMNS].sel(datetime=xds['time'], name=xds['District']), xds[S_COLUMNS], xds[[TARGET]]])
xds = xds.drop(['name', 'datetime'])
xds = xr.concat([xds.mean(dim='state_dev'), xds.max(dim='state_dev'), xds.min(dim='state_dev')], dim='agg')
xds['agg'] = ['mean', 'max', 'min'] 
xds

## Format Data

In [40]:
df = xds.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_obs', 'ts_aug'] + G_COLUMNS + [TARGET], drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_obs', 'ts_aug'], drop=True, inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Other Rice Yield (kg/ha),Field size (ha),"Rice Crop Intensity(D=Double, T=Triple)",Rice Yield (kg/ha),tempmax_max,tempmax_mean,tempmax_min,tempmin_max,tempmin_mean,tempmin_min,...,osavi_min,rdvi_max,rdvi_mean,rdvi_min,mtvi1_max,mtvi1_mean,mtvi1_min,lswi_max,lswi_mean,lswi_min
ts_obs,ts_aug,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,6800,3.4,3,5500,34.0,31.920833,29.3,26.0,24.320833,23.0,...,0.089422,45.355713,28.114283,5.428384,5788.039062,3418.952393,483.139404,0.284386,0.176018,0.082123
0,1,6800,3.4,3,5500,34.0,31.920833,29.3,26.0,24.320833,23.0,...,0.065481,44.305420,27.110739,3.591432,5726.416016,3321.700439,199.339951,0.275647,0.161572,0.043390
0,2,6800,3.4,3,5500,34.0,31.920833,29.3,26.0,24.320833,23.0,...,0.043535,45.443909,27.238119,2.286913,5853.989258,3334.708984,125.650253,0.283569,0.166662,0.058301
0,3,6800,3.4,3,5500,34.0,31.920833,29.3,26.0,24.320833,23.0,...,0.098437,43.147346,27.324083,6.108664,5612.366699,3311.447021,571.652893,0.274889,0.170156,0.083576
0,4,6800,3.4,3,5500,34.0,31.920833,29.3,26.0,24.320833,23.0,...,0.076256,42.094658,26.541466,4.678495,5532.002441,3220.694092,418.133423,0.262922,0.163260,0.085781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,95,5600,6.2,3,7200,34.0,31.387500,29.0,25.0,23.187500,21.0,...,0.114631,42.548187,26.800919,5.357734,5446.602051,3073.440186,451.448608,0.271350,0.175831,0.054570
554,96,5600,6.2,3,7200,34.0,31.387500,29.0,25.0,23.187500,21.0,...,0.086139,45.441803,27.604746,4.614069,6033.647461,3244.229248,266.417389,0.298123,0.174809,0.008207
554,97,5600,6.2,3,7200,34.0,31.387500,29.0,25.0,23.187500,21.0,...,0.113291,45.933376,27.783472,6.120148,6156.487305,3259.412109,470.283661,0.310665,0.196981,0.059249
554,98,5600,6.2,3,7200,34.0,31.387500,29.0,25.0,23.187500,21.0,...,0.151546,41.287273,26.968645,7.400481,5275.067383,3065.578125,636.744629,0.257201,0.170862,0.053449


In [42]:
from src.data.preprocessing import Smoother, Convertor, Filler, Sorter
from sklearn.pipeline import Pipeline

steps_pipeline = [
    ("filler", Filler()),
    ("smoother", Smoother()),
    ("convertor", Convertor(agg=True)),
    ("sorter", Sorter()),
    # ("estimator", XGBRegressor())
]

xds_pipe = xr.open_dataset(os.path.join(DATA_PATH, 'train.nc'), engine='scipy')
# steps_pipeline.append(("estimator", XGBRegressor()))
pipeline = Pipeline(steps_pipeline)

def preprocess_y(xds: xr.Dataset):
    df = xds[[TARGET] + S_COLUMNS].to_dataframe()
    y = df[[TARGET]].groupby(["ts_obs", "ts_aug"]).first()
    return y.reorder_levels(["ts_obs", "ts_aug"]).sort_index()


y = preprocess_y(xds_pipe)

pipe_df: pd.DataFrame = pipeline.fit_transform(xds_pipe.sel(ts_obs=np.arange(555, step=2)))
# pipe_df = pd.merge(pipe_df, y, left_index=True, right_index=True)

In [43]:
df.columns[~df.columns.isin(pipe_df.columns)]

Index(['Rice Yield (kg/ha)'], dtype='object')

In [44]:
(pipe_df == df.drop(columns=TARGET)).value_counts()

Other Rice Yield (kg/ha)  Field size (ha)  Rice Crop Intensity(D=Double, T=Triple)  tempmax_max  tempmax_mean  tempmax_min  tempmin_max  tempmin_mean  tempmin_min  temp_max  temp_mean  temp_min  dew_max  dew_mean  dew_min  humidity_max  humidity_mean  humidity_min  precip_max  precip_mean  precip_min  precipprob_max  precipprob_mean  precipprob_min  precipcover_max  precipcover_mean  precipcover_min  windspeed_max  windspeed_mean  windspeed_min  winddir_max  winddir_mean  winddir_min  sealevelpressure_max  sealevelpressure_mean  sealevelpressure_min  cloudcover_max  cloudcover_mean  cloudcover_min  solarradiation_max  solarradiation_mean  solarradiation_min  solarenergy_max  solarenergy_mean  solarenergy_min  uvindex_max  uvindex_mean  uvindex_min  moonphase_max  moonphase_mean  moonphase_min  solarexposure_max  solarexposure_mean  solarexposure_min  ndvi_max  ndvi_mean  ndvi_min  savi_max  savi_mean  savi_min  evi_max  evi_mean  evi_min  rep_max  rep_mean  rep_min  osavi_max  osavi_me

In [39]:
X, y = df.drop(columns=TARGET), df[[TARGET]]

## Split Data

In [40]:
# Split dataset into Train/Test subdataset equitably distributed according to TARGET
# Repeat the operation to create Train/Val/Test subdataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
train_idx, test_idx = train_test_split(X.index.levels[1], test_size=.2, random_state=0)
X_train, X_test, y_train, y_test = X.loc[:, train_idx, :], X.loc[:, test_idx, :], y.loc[:, train_idx, :], y.loc[:, test_idx, :] 

## Machine Learning Regression

### XGBoost

In [41]:
steps_pipeline = [
    ("filler", Filler()),
    ("smoother", Smoother()),
    ("convertor", Convertor(agg=True)),
    ("sorter", Sorter()),
    ("estimator", XGBRegressor())
]

xgbr = Pipeline(steps_pipeline)

xgbr.fit(X_train, y_train)

In [None]:
cv_xgbr.best_params_

{'colsample_bytree': 0.625,
 'learning_rate': 0.01,
 'max_depth': 10,
 'n_estimators': 1000}

#### Evaluate

In [42]:
# scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = xgbr.predict(X_test)
# y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
# y_true = scaler.scaler_t.inverse_transform(y_test).reshape(-1)
y_true = y_test
rmse = mean_squared_error(y_true, y_pred, squared=False)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f'RMSE: {rmse:.0f} | MAPE: {100*mape:.1f}% | R2 score: {r2:.3f}')

RMSE: 145 | MAPE: 1.5% | R2 score: 0.967


In [43]:
# Plot features importance
df_fi = pd.DataFrame(columns=['Feature', 'Importance'])
df_fi['Feature'] = X.columns
df_fi['Importance'] = xgbr.feature_importances_
df_fi.sort_values('Importance', inplace=True, ascending=False)

fig = px.bar(df_fi.head(10), x='Feature', y='Importance', title="Feature importance")
fig.show()

### Save the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'cv_xgboost.save')
joblib.dump(cv_xgbr, model_filename)

### Train Final Model

In [None]:
param = copy.deepcopy(cv_xgbr.best_params_)
param["random_state"] = 0
param["tree_method"] = 'gpu_hist'

xgbr = XGBRegressor(**param)

xgbr.fit(X, y)

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
joblib.dump(xgbr, model_filename)

# Test

## Import Data

In [None]:
xdf = xr.open_dataset(os.path.join(DATA_PATH, 'test_processed.nc'))
xdf = xr.merge([xdf[G_COLUMNS], xdf[M_COLUMNS].sel(datetime=xdf['time'], name=xdf['District']), xdf[S_COLUMNS]])
xdf = xdf.drop(['name', 'datetime'])
xdf

## Compute aggregation

In [None]:
xdf = xr.concat([xdf.mean(dim='state_dev'), xdf.max(dim='state_dev'), xdf.min(dim='state_dev')], dim='agg')
xdf['agg'] = ['mean', 'max', 'min'] 
xdf

## Format Data

In [None]:
df = xdf.to_dataframe()
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'] + G_COLUMNS, drop=True, inplace=True)
df = df.pivot(columns='agg')
df.columns = df.columns.map('_'.join).str.strip('_')
df.reset_index(inplace=True)
df.set_index(['ts_id', 'ts_obs', 'ts_aug'], drop=True, inplace=True)
X = df
X

## Load the model

In [None]:
model_filename = os.path.join(MODEL_PATH, 'xgboost.save')
model: XGBRegressor = joblib.load(model_filename)

## Predict data

In [None]:
scaler: DatasetScaler = joblib.load(os.path.join(DATA_PATH, 'scaler_dataset.joblib'))

y_pred = model.predict(X_test)
y_pred = scaler.scaler_t.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)

In [None]:
s_pred = pd.Series(y_pred, index=df.index, name=TARGET_TEST)
s_pred.reset_index(['ts_id', 'ts_aug'], drop=True, inplace=True)
s_pred.reset_index(inplace=True)
s_pred = s_pred.groupby('ts_obs').mean()

## Create Submissions file

In [None]:
df_sub = pd.read_csv(os.path.join(os.pardir, os.pardir, 'data', 'raw', 'test.csv'))
df_sub.drop(columns=TARGET_TEST, inplace=True)
df_sub = pd.merge(df_sub, s_pred, right_index=True, left_index=True)

In [None]:
sub_filename = os.path.join(MODEL_PATH, 'submission.csv')
df_sub.to_csv(sub_filename, index=False, header=True)