# Driver

## Imports

In [None]:
import modin.pandas as pd
from transformer.DataAggregator import DataAggregator
from transformer.Dataformator import DataFormator
from transformer.ImputeMean import ImputeMean
from transformer.TrainTestSplit import TrainTestSplit
import utils
import numpy as np
import json
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.metrics import mean_absolute_error
from sklearn.multioutput import MultiOutputRegressor
from configs.space import space

## Data Loading

In [None]:
# Read the two input CSV files from the local ./data folder.
# `pd` is modin.pandas here (see the imports cell).
sales = pd.read_csv(
    "./data/sales_train_validation.csv",
)
calender = pd.read_csv(
    "./data/calendar.csv",
)

## Data Transformation

In [None]:
# Step 1: aggregate the sales table over ("store_id", "dept_id") with "sum".
data_aggregator = DataAggregator(sales)
aggregated_data = data_aggregator.aggregate(
    ["store_id", "dept_id"],
    "sum",
)

# Step 2: combine the aggregated sales with the calendar table.
# NOTE(review): the resulting layout is decided by DataFormator; the cells
# below iterate over `data.columns` and fit one model per column.
data_formator = DataFormator(aggregated_data, calender)
data = data_formator.format_data('store_id', 'dept_id')

# Step 3: presumably swaps zero entries for a mean value (going by the
# method name) -- confirm against transformer/ImputeMean.
impute_mean = ImputeMean(data)
data = impute_mean.replace_zero_with_mean()

# Step 4: split into train/test parts; shuffling is disabled.
tts = TrainTestSplit(
    data,
    test_size=0.3,
    random_state=0,
    shuffle=False,
)
X_train_, X_test_, y_train_, y_test_ = tts.split_data()

## Parameter Space for Xgboost data preparation
The data-preparation approach used for this model follows the papers linked at the end of this notebook.

In [None]:
# Settings passed to utils.prepare_data_for_xgb in the tuning and training
# cells below.
target_sequence_length = 1  # the forecasting horizon
test_size = 0.30

# Windowing settings used when preparing the data for XGBoost.
hyperparameters = dict(
    # Original note: "= target_sequence_length, forecasting horizon length".
    # TODO confirm the exact meaning against utils.prepare_data_for_xgb.
    in_length=1,
    # Original note: "window size".
    step_size=4,
)

## Hyperparameter Tuning: Searching for the Best Parameters

### Model Tuning

In [None]:
# Tune one XGBoost model per column of `data` with Hyperopt's TPE algorithm
# and collect the best hyperparameters found for each column in `best`
# (keys: "best_param_<column>").
from hyperopt import space_eval  # only this cell needs it


def _make_objective(x_train, y_train, x_test, y_test):
    """Build a Hyperopt objective that scores `params` by test-set MAE.

    The data is bound through arguments so the objective does not depend on
    whatever the loop variables happen to hold when it is called.
    """

    def objective(params):
        model = MultiOutputRegressor(xgb.XGBRegressor(**params)).fit(x_train, y_train)
        test_forecasts = model.predict(x_test)
        mae = mean_absolute_error(y_test, test_forecasts)
        return {'loss': mae, 'status': STATUS_OK}

    return objective


best = {}

for i in data.columns:
    # Single-column frames for the current series.
    # Assumes y_train_ / y_test_ are DataFrames sharing the columns of `data`.
    y_train_data_ = y_train_[[i]]
    y_test_data_ = y_test_[[i]]

    x_train, y_train, x_test, y_test = utils.prepare_data_for_xgb(
        y_train_data_, y_test_data_,
        hyperparameters["in_length"], hyperparameters["step_size"],
        target_sequence_length,
    )

    objective = _make_objective(x_train, y_train, x_test, y_test)

    # optimize hyperparameters using Hyperopt's Tree-structured Parzen Estimator (TPE) algorithm
    print(f"Training .......\n model_{i} ")
    raw_best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=20)

    # fmin reports its result per hyperparameter label: for hp.choice
    # dimensions that is the *index* of the chosen option, and any wrapper
    # applied in the space (e.g. an int conversion) is not reflected.
    # space_eval maps the result back through `space`, yielding the same
    # parameter dict the objective actually received.
    best["best_param_{0}".format(i)] = space_eval(space, raw_best)

    print(best["best_param_{0}".format(i)])

### Save Best parameters

In [None]:
# Persist the per-column best hyperparameters through the project helper
# (name 'xgb_best_params', mode 'w').
utils.save_to_json(
    'xgb_best_params',
    best,
    'w',
)

## Model Training with the Best Parameters

### Reading Best Parameters

In [None]:
# Reload the saved hyperparameters so training can run without re-tuning.
best = utils.read_from_json(
    'xgb_best_params',
    mode="r",
)
# Bare expression: the notebook displays the available keys.
best.keys()

### Model Training

In [None]:
# Fit one final model per column of `data` using the tuned hyperparameters in
# `best`, store it in `trained_model` under "model_<column>", and print its
# MAE on the test split.
trained_model = {}

for i in data.columns:
    # Single-column frames for the current series.
    # Assumes y_train_ / y_test_ are DataFrames sharing the columns of `data`.
    y_train_data_ = y_train_[[i]]
    y_test_data_ = y_test_[[i]]

    x_train, y_train, x_test, y_test = utils.prepare_data_for_xgb(
        y_train_data_, y_test_data_,
        hyperparameters["in_length"], hyperparameters["step_size"],
        target_sequence_length,
    )

    # Look the tuned parameters up once instead of once per argument.
    column_params = best[f"best_param_{i}"]

    model = xgb.XGBRegressor(
        gamma=column_params['gamma'],
        learning_rate=column_params['learning_rate'],
        # max_depth and n_estimators are integer-valued settings; the tuned
        # values can arrive as floats (e.g. from a quantized Hyperopt
        # distribution), so cast them explicitly.
        max_depth=int(column_params['max_depth']),
        n_estimators=int(column_params['n_estimators']),
        reg_alpha=column_params['reg_alpha'],
        reg_lambda=column_params['reg_lambda'],
        subsample=column_params['subsample'],
    )

    fitted_model = MultiOutputRegressor(model).fit(x_train, y_train)
    trained_model[f"model_{i}"] = fitted_model
    print(f"trained_model   :{i}")

    test_forecasts = fitted_model.predict(x_test)
    test_mae = mean_absolute_error(y_test, test_forecasts)
    print(f'Test MAE: { test_mae}')

In [None]:
# Fetch the fitted model stored under the key 'model_CA_1_FOODS_1'.
# Note: this rebinds `model`, which the training loop also assigns.
model = trained_model["model_CA_1_FOODS_1"]
# Bare expression: the notebook displays the estimator.
model

## Generating forecast

#### References: papers for the approach used
https://arxiv.org/abs/1603.02754  &    
https://arxiv.org/abs/2101.02118
#### Reference: site describing the data-preparation steps
https://towardsdatascience.com/multi-step-time-series-forecasting-with-xgboost-65d6820bec39