# General Residual Error Prediction Object Scratch

# In [15]:
import argparse
import os

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import ray
from darts import TimeSeries
from darts.dataprocessing.transformers import Scaler
from darts.metrics import smape
from darts.models import BlockRNNModel
from darts.models import GaussianProcessFilter
from darts.models import TCNModel
from darts.utils.likelihood_models import QuantileRegression
from optuna.integration import PyTorchLightningPruningCallback
from sklearn.gaussian_process.kernels import RBF

class ResidualForecaster():
    """Forecast the residual (observed minus predicted) of a target variable.

    Constructing an instance runs the whole pipeline: the input CSV is read,
    every series is gap-filled with Gaussian-process filters, the residual
    series is split into training/validation parts at
    ``validation_split_date``, a TCN is fitted on the training part and its
    forecast is written to ``output_csv_name``.

    Attributes set during construction:
        df: the raw input data frame.
        inputs: the residual series (stitched target minus stitched
            prediction).
        covariates: the stitched covariates stacked along the component
            axis, or ``None`` when ``covariates_names`` is empty.
        training_set, validation_set: ``inputs`` split before the
            validation date.

    ``tune`` additionally sets ``hyperparams``.
    """

    def __init__(self,
                 input_csv_name: str,
                 target_variable_column_name: str,
                 predicted_target_column_name: str,
                 datetime_column_name: str,
                 covariates_names: list,
                 output_csv_name: str,
                 validation_split_date: str, #YYYY-MM-DD
                 ):
        """Store the configuration, load the CSV and run the pipeline.

        Args:
            input_csv_name: path of the CSV file to read.
            target_variable_column_name: column holding the observed target.
            predicted_target_column_name: column holding the existing
                prediction of the target.
            datetime_column_name: column holding the timestamps.
            covariates_names: columns used as past covariates.
            output_csv_name: path the residual forecast is written to.
            validation_split_date: date (``YYYY-MM-DD``) at which the
                residual series is split into training and validation sets.
        """
        # The configuration must be stored before anything reads it back.
        self.input_csv_name = input_csv_name
        self.target_variable_column_name = target_variable_column_name
        self.predicted_target_column_name = predicted_target_column_name
        self.datetime_column_name = datetime_column_name
        self.covariates_names = covariates_names
        self.output_csv_name = output_csv_name
        self.validation_split_date = validation_split_date

        self.df = pd.read_csv(self.input_csv_name)

        self._preprocess_data()
        self.make_residual_forecasts()

    @staticmethod
    def _nan_to_inf(score):
        """Return ``score``, or ``float("inf")`` when ``score`` is NaN."""
        # NaN never compares equal to anything (itself included), so an
        # equality test against np.nan cannot detect it; use np.isnan.
        return float("inf") if np.isnan(score) else score

    def _split_timestamp(self):
        """Return ``validation_split_date`` as a ``pd.Timestamp``."""
        return pd.Timestamp(self.validation_split_date)

    def make_stitched_series(self, variable_tseries, big_gap_size: int = 7):
        """Fill the gaps of a series with Gaussian-process filters.

        Two filters are fitted to the series: a tight one (``alpha=0.001``)
        and a looser one (``alpha=2``). The result takes the tight filter's
        output everywhere except inside gaps longer than ``big_gap_size``,
        where the looser filter's output is used instead.

        Args:
            variable_tseries: the darts ``TimeSeries`` to fill.
            big_gap_size: gaps with a ``gap_size`` strictly greater than
                this use the looser filter.

        Returns:
            The stitched ``TimeSeries`` (one component, 500 samples per
            filter call), or ``None`` if either filter fails.
        """
        kernel = RBF()

        gpf_missing = GaussianProcessFilter(kernel=kernel,
                                            alpha=0.001,
                                            n_restarts_optimizer=100)

        gpf_missing_big_gaps = GaussianProcessFilter(kernel=kernel,
                                                     alpha=2,
                                                     n_restarts_optimizer=10)

        # Filtering the TimeSeries. A failure is reported to the caller as
        # None; only ordinary exceptions are caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            filtered = gpf_missing.filter(variable_tseries, num_samples=500)
            filtered_big_gaps = gpf_missing_big_gaps.filter(variable_tseries,
                                                            num_samples=500)
        except Exception:
            return None

        gap_series = variable_tseries.gaps()
        stitched_df = filtered.pd_dataframe()
        replacement_df = filtered_big_gaps.pd_dataframe()

        # Inside every big gap, overwrite the tight filter's values with the
        # looser filter's values (label slices include both end points).
        for _, gap in gap_series.iterrows():
            if gap["gap_size"] > big_gap_size:
                span = slice(gap["gap_start"], gap["gap_end"])
                stitched_df.loc[span] = replacement_df.loc[span]

        # Rebuild a (time, 1 component, samples) series from the sample
        # columns of the data frame.
        return TimeSeries.from_times_and_values(
            stitched_df.index,
            stitched_df.values.reshape(len(stitched_df), 1, -1))

    def _preprocess_data(self):
        """Build the residual series, the covariates and the data split.

        Sets ``inputs``, ``covariates``, ``training_set`` and
        ``validation_set``.

        Raises:
            ValueError: if gap-filling fails for any required column.
        """
        times = pd.DatetimeIndex(
            pd.to_datetime(self.df[self.datetime_column_name]))
        variable_list = list(self.covariates_names) + [
            self.target_variable_column_name,
            self.predicted_target_column_name,
        ]

        var_series_dict = {
            var: TimeSeries.from_times_and_values(times,
                                                  self.df[var].to_numpy(),
                                                  fill_missing_dates=True,
                                                  freq="D")
            for var in variable_list
        }

        stitched_series_dict = {
            var: self.make_stitched_series(series)
            for var, series in var_series_dict.items()
        }

        # make_stitched_series signals failure with None; stop here with a
        # clear message instead of failing later on arithmetic with None.
        failed = [var for var, series in stitched_series_dict.items()
                  if series is None]
        if failed:
            raise ValueError(
                f"Gaussian-process filtering failed for columns: {failed}")

        # The forecasting target is the error of the existing prediction.
        self.inputs = (stitched_series_dict[self.target_variable_column_name]
                       - stitched_series_dict[
                           self.predicted_target_column_name])

        # Stack all covariates along the component axis. concatenate()
        # returns a new series, so the result has to be re-assigned.
        self.covariates = None
        for cov_var in self.covariates_names:
            cov_series = stitched_series_dict[cov_var]
            if self.covariates is None:
                self.covariates = cov_series
            else:
                self.covariates = self.covariates.concatenate(
                    cov_series, axis=1, ignore_time_axis=True)

        # Should add an if statement here for tuning
        self.training_set, self.validation_set = self.inputs.split_before(
            self._split_timestamp())

    def tune(self, n_trials: int = 10):
        """Search TCN hyperparameters with optuna.

        Each trial fits a TCN on the training set and is scored by the
        sMAPE of its forecast over the first 30 steps of the validation
        set. The parameters of the best trial are stored in
        ``self.hyperparams``.

        Args:
            n_trials: number of optuna trials to run.
        """
        horizon = 30  # forecast length used both for the model and scoring

        def objective(trial):
            # NOTE: trial pruning is not wired up here; fit() below is given
            # no validation series, so there is no "val_loss" to monitor.

            # Hyperparameters
            # NOTE(review): 356 looks like it may be meant as 365 - confirm.
            input_chunk_length = trial.suggest_categorical(
                "input_chunk_length", [31, 60, 180, 356])
            kernel_size = trial.suggest_categorical("kernel_size", [2, 3, 5])
            num_filters = trial.suggest_categorical("num_filters", [1, 3, 5])
            num_layers = trial.suggest_categorical("num_layers",
                                                   [None, 1, 2, 3])
            dilation_base = trial.suggest_categorical("dilation_base",
                                                      [1, 2, 3])
            dropout = trial.suggest_categorical("dropout", [0.1, 0.2, 0.3])

            # Every suggested value is handed to the model so that the
            # reported best parameters are the ones actually evaluated.
            tcn_model = TCNModel(
                input_chunk_length=input_chunk_length,
                output_chunk_length=horizon,
                kernel_size=kernel_size,
                num_filters=num_filters,
                num_layers=num_layers,
                dilation_base=dilation_base,
                dropout=dropout,
                likelihood=QuantileRegression([0.05, 0.1, 0.5, 0.9, 0.95]))

            tcn_model.fit(self.training_set,
                          past_covariates=self.covariates,
                          epochs=400,
                          verbose=False)

            validation_window = self.validation_set[:horizon]
            predictions = tcn_model.predict(n=len(validation_window),
                                            past_covariates=self.covariates,
                                            num_samples=50)
            smapes = smape(validation_window, predictions,
                           n_jobs=-1, verbose=False)

            # A NaN score must not be allowed to win the minimisation.
            return self._nan_to_inf(np.mean(smapes))

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=n_trials)

        self.hyperparams = study.best_trial.params

    def make_residual_forecasts(self):
        """Fit a TCN on the training residuals and write its forecast.

        The model forecasts 34 steps past the end of the training set with
        500 samples; the forecast data frame is written to
        ``self.output_csv_name``.
        """
        tcn = TCNModel(input_chunk_length=35,
                       output_chunk_length=34,
                       likelihood=QuantileRegression(
                           [0.05, 0.1, 0.5, 0.9, 0.95]),
                       random_state=0)

        tcn.fit(self.training_set,
                past_covariates=self.covariates,
                epochs=500,
                verbose=False)

        predictions = tcn.predict(n=34,
                                  past_covariates=self.covariates,
                                  num_samples=500)

        predictions.pd_dataframe().to_csv(self.output_csv_name)
    