## Import Libraries

In [None]:
import time
import pandas as pd
import os

# added for progress bar
from tqdm.auto import tqdm

import numpy as np
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
# import math
from pandas import DataFrame
from pandas import concat
from numpy import asarray
# from sklearn.metrics import mean_absolute_error
from matplotlib import pyplot

## Transform TLM Data function

In [None]:
def _transform_data(df_resampled: pd.DataFrame):
    """Build a cumulative-consumption feature frame from resampled readings.

    The transformation is applied column-wise:
    (1) Calculate the difference in volume between successive rows (days).
    (2) Set differences greater than 2 (gallons) to NaN.
    (3) Calculate the median of the remaining differences per column.
    (4) Fill the NaN values with that median.
    (5) Compute the negated cumulative sum of the differences.
    (6) Apply a cumulative maximum so the result never decreases.

    NOTE: no business-day column is appended by this function, although
    earlier documentation of it said so.

    :param df_resampled: A dataframe of resampled raw measurement data
    :type df_resampled: pd.DataFrame
    :returns: A dataframe of consumption features with the same index and
        columns as ``df_resampled``
    """
    # diff() already returns a new frame, so the input is never mutated.
    df_differences = df_resampled.diff()
    df_differences[df_differences > 2] = np.nan
    median_consumption = df_differences.median()
    # fillna() is not in-place by default: the result has to be kept,
    # otherwise step (4) is silently skipped and NaNs leak into the output.
    df_differences = df_differences.fillna(median_consumption)
    df_features = (-df_differences.cumsum()).cummax()
    return df_features

## Function to create TLM consumption

In [None]:
def _create_tlm_consumption(tlm_data: pd.DataFrame):
    """Derive per-tank daily consumption from tall TLM measurement data.

    The input is pivoted so that ``MeasurementDate`` becomes the index and
    ``TankID`` the columns, resampled to daily means with gaps interpolated,
    passed through ``_transform_data`` and differenced row to row. Remaining
    NaN values are replaced by the per-column median, and the outermost
    column level produced by the pivot is dropped.

    params: tlm_data : A dataframe containing the TLM measurement readings
    returns dataframe with consumption values for the TLM base data
    """
    wide = tlm_data.pivot(index="MeasurementDate", columns="TankID")
    wide.index = wide.index.astype("M8[ns]")
    daily = wide.resample("D").mean().interpolate()
    # Row-to-row change of the cumulative feature is the daily consumption.
    consumption = _transform_data(daily).diff()
    consumption = consumption.fillna(consumption.median())
    consumption.columns = consumption.columns.droplevel(0)
    return consumption

## Function to convert TLM consumption from a time series dataset to a supervised learning dataset

In [None]:
def _series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
            data: Sequence of observations: a list, a 1-D or 2-D NumPy
                  array, or anything else ``pandas.DataFrame`` accepts.
            n_in: Number of lag observations as input (X).
            n_out: Number of observations as output (y).
            dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
            Pandas DataFrame of series framed for supervised learning.
            Columns run from var(t-n_in) ... var(t-1) (lagged inputs),
            followed by var(t), var(t+1) ... var(t+n_out-1) (outputs).
            With several variables each time step contributes one column
            per variable, named var1, var2, ...
    """
    df = DataFrame(data)
    # Take the variable count from the frame itself so that 1-D arrays
    # (which have no shape[1]) and lists of rows are labelled correctly.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names.extend('var%d(t-%d)' % (j + 1, lag) for j in range(n_vars))
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names.extend('var%d(t)' % (j + 1) for j in range(n_vars))
        else:
            names.extend('var%d(t+%d)' % (j + 1, step) for j in range(n_vars))
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # shifting introduces NaN at the edges; drop those incomplete rows
    if dropnan:
        agg.dropna(inplace=True)
    return agg

## fit an xgboost model and make a one step prediction

In [None]:
def xgboost_forecast(train, testX):
    """Fit an XGBoost regressor on ``train`` and predict one value for ``testX``.

    ``train`` is converted to a 2-D array; its last column is used as the
    target and every other column as a feature. ``testX`` is a single
    feature row. Returns the first (only) element of the prediction.
    """
    samples = asarray(train)
    features = samples[:, :-1]
    target = samples[:, -1]

    regressor = XGBRegressor(
        objective='reg:squarederror', n_estimators=1000, n_jobs=-1)
    regressor.fit(features, target)

    # predict() expects a 2-D input, hence the single row is wrapped.
    query = asarray([testX])
    return regressor.predict(query)[0]

## Walkforward validation for the XGboost model (Train and test the model)

In [None]:
def walk_forward_validation(data, n_test):
    """Evaluate the XGBoost forecaster with walk-forward validation.

    The last ``n_test`` rows of ``data`` are held out. For every held-out
    row a model is fitted on all rows seen so far, a one-step prediction
    is made, and the true row is then added to the history.

    Returns a tuple ``(mae, actuals, predictions)`` where ``mae`` is the
    mean absolute error over the held-out rows.
    """
    train, test = train_test_split(data, n_test)
    # the history starts as the training rows and grows by one per step
    history = list(train)
    predictions = []
    for test_row in test:
        # last column is the target, the rest are the inputs
        testX, testy = test_row[:-1], test_row[-1]
        yhat = xgboost_forecast(history, testX)
        predictions.append(yhat)
        history.append(test_row)
        print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
    actuals = test[:, -1]
    error = mean_absolute_error(actuals, predictions)
    return error, actuals, predictions


## split a univariate dataset into train/test sets

In [None]:
def train_test_split(data, n_test):
    """Split a DataFrame into train and test arrays, keeping row order.

    The last ``n_test`` rows become the test set and everything before
    them the train set. ``n_test=0`` yields an empty test set (negative
    slicing with ``-0`` would otherwise put every row into the test set).
    If ``n_test`` exceeds the number of rows, all rows go to the test set.

    :param data: DataFrame to split (positional ``iloc`` slicing is used)
    :param n_test: number of trailing rows to hold out; must be >= 0
    :returns: tuple ``(train_values, test_values)`` of NumPy arrays
    :raises ValueError: if ``n_test`` is negative
    """
    if n_test < 0:
        raise ValueError("n_test must be non-negative")
    # An explicit split position avoids the `-0` slicing pitfall.
    split = max(len(data) - n_test, 0)
    return data.iloc[:split, :].values, data.iloc[split:, :].values

In [None]:
## Generate Forecasts using XGBoost model

In [None]:
def supervised_learning_model(tlm_consumption: pd.DataFrame,
                              n_in: int,
                              horizon_window: list = [42, 28],
                              n_out: int = 1,
                              ):
    """Forecast every column of ``tlm_consumption`` with its own XGBoost model.

    For each column the series is framed with ``_series_to_supervised``,
    a regressor is fitted with the last framed column as the target, and
    ``n_in`` daily values are forecast recursively: each prediction is
    appended to the input window and the oldest value is dropped.

    Parameters
    ----------
    tlm_consumption : pd.DataFrame
        Dataframe with consumption, one column per tank, indexed by date.
    n_in : int
        Number of lag observations used as model inputs. It is also the
        number of daily steps that are forecast.
    horizon_window : list, optional
        Not used by the body of this function.
    n_out : int, optional
        Number of output observations passed to ``_series_to_supervised``.
        The default is 1.

    Returns A dataframe containing the future forecast: a
    ``Forecast_Date`` column followed by one column per tank.
    """
    last_observed = max(tlm_consumption.index)
    # periods is n_in + 1 because the first date (the last observed day)
    # is dropped again by the [1:] slice.
    forecast_dates = pd.date_range(
        start=last_observed, periods=(n_in + 1), freq="D")[1:]
    supervised_forecast = pd.DataFrame(forecast_dates)
    supervised_forecast.columns = ['Forecast_Date']

    for column in tlm_consumption.columns:
        tank_id = str(column)
        series = tlm_consumption[tank_id]

        # Train the model on the lag-framed history of this tank
        framed = _series_to_supervised(
            series.to_list(), n_in=n_in, n_out=n_out)
        samples = asarray(framed)
        model = XGBRegressor(
            objective='reg:squarederror', n_estimators=1000, n_jobs=-1)
        model.fit(samples[:, :-1], samples[:, -1])

        # Recursive multi-step forecast seeded with the latest n_in values
        window = series[-n_in:].values.flatten()
        forecasts = []
        for step in range(n_in):
            if step > 0:
                # slide the window: drop the oldest value, add the last forecast
                window = np.append(window[1:], prediction)
            prediction = model.predict(asarray([window]))
            forecasts.append(prediction[0])

        tank_forecast = pd.DataFrame(
            {'Forecast_Date': forecast_dates, tank_id: forecasts})
        supervised_forecast = pd.merge(
            supervised_forecast,
            tank_forecast,
            how='left',
            left_on='Forecast_Date',
            right_on='Forecast_Date',
        )
    return supervised_forecast


## Get Dates for test and train period

In [None]:
def _get_training_dates(
    t0: str = "today",
    num_results: int = 1,
    forecast_horizon: int = 42,
    train_periods: int = 180,
    skip: int = 1,
    offset: int = 0,
):
    """Generate start and end dates for training and test periods.

    Args:
      t0: The end date of collected data. In production, this should be
      "today". In model training, this can be any datetime string accepted
      by pandas, e.g. "2020-08-31".

      num_results: The number of training/test periods to generate dates
      for.

      forecast_horizon: The forecast horizon in days. The test period runs
      from the checkpoint to the checkpoint plus this many days, both ends
      included.

      train_periods: The necessary training periods in days. Training
      starts ``train_periods - 1`` days before the checkpoint and ends the
      day before it.

      skip: Spacing in days between consecutive checkpoints.

      offset: Number of days by which ``t0`` is shifted into the past.

    Returns:
      A list of dictionaries, ordered from the earliest to the latest
      checkpoint, with the keys ``start_train``, ``end_train``,
      ``start_test`` and ``end_test``. All values are day-resolution
      ``np.datetime64`` objects.

    """

    t0 = pd.Timestamp(t0).normalize() - np.timedelta64(offset, "D")
    # The latest checkpoint is placed one horizon before t0 so that its
    # test window ends exactly at t0.
    t0 = t0 - pd.DateOffset(days=forecast_horizon)

    # The former `closed="right"` argument is intentionally omitted: it was
    # removed in pandas 2.0 (TypeError) and had no effect here, since it
    # only drops a point equal to `start`, which is not given.
    check_points = pd.date_range(
        end=t0, freq=str(skip) + "D", periods=num_results
    )

    ts_cv_dates = [
        {
            "start_train": np.datetime64(
                point - pd.DateOffset(days=train_periods - 1), "D"
            ),
            "end_train": np.datetime64(point - pd.DateOffset(days=1), "D"),
            "start_test": np.datetime64(point - pd.DateOffset(days=0), "D"),
            "end_test": np.datetime64(
                point + pd.DateOffset(days=forecast_horizon), "D"
            ),
        }
        for point in check_points
    ]

    return ts_cv_dates

## Manual Run

In [None]:
# Import Data
# Update path to files appropriately
tlm_base = pd.read_parquet('tlm_base.parquet')
tank_context = pd.read_csv('tank_context.csv')
hom_base = pd.read_parquet('hom_base.parquet')

# Check Data types
tlm_base.dtypes
hom_base.dtypes
tank_context.dtypes

# Change Data Types for Tank ID to Str so both frames merge on equal keys
tlm_base.TankID = tlm_base.TankID.astype(str)
tank_context.TankID = tank_context.TankID.astype(str)

# Filter TLM base data for tanks in tank context
tlm_base_tall = pd.merge(tlm_base,
                         tank_context[['UID', 'TankID']].drop_duplicates(),
                         left_on='TankID',
                         right_on='TankID',
                         how='inner')

# Compute Consumption from TLM base data
tlm_consumption = _create_tlm_consumption(tlm_data=tlm_base_tall)

# Choose a tank for verifying function output
supervised_forecasting = _series_to_supervised(
    tlm_consumption['10017'].tolist(), n_in=42, n_out=42)

# Train and test the model
mae, y, yhat = walk_forward_validation(supervised_forecasting, 42)
# Print the MAE score of the forecast
print('MAE: %.3f' % mae)

# Testing for code output in prediction
train = asarray(supervised_forecasting)
trainX, trainy = train[:, :-1], train[:, -1]
# fit model
model = XGBRegressor(objective='reg:squarederror', n_estimators=1000,n_jobs=-1)
model.fit(trainX, trainy)
# construct an input for a new prediction; the row must carry exactly as
# many values as the model was trained on (trainX.shape[1] features),
# otherwise predict() rejects it for a feature-count mismatch
row = tlm_consumption['10017'][-trainX.shape[1]:].values.flatten()
prediction = model.predict(asarray([row]))


## Iterate through dates and generate forecasts for validation. This is to be converted into a function

In [None]:
def generate_forecast(date,num_results,forecast_horizon,offset):
    """Generate and save XGBoost validation forecasts for a series of dates.

    Training/test windows are produced by ``_get_training_dates``. For each
    window the consumption data is cut to the training period, a forecast
    is produced with ``supervised_learning_model`` (``n_in=42``), and both
    the forecast and the actual consumption of the test period are written
    as CSV files to
    ``<main_dir>/XgBoost Model Validation Forecasts/<year>/<month>/<day>/``,
    where the date parts are taken from the last training day.

    :param date: end date of the collected data, passed on as ``t0``
    :param num_results: number of training/test windows to generate
    :param forecast_horizon: forecast horizon in days
    :param offset: number of days by which ``date`` is shifted back
    :returns: None; the results are written to disk
    """
    main_dir = 'Onedrive/doc/....'
    output_root = os.path.join(main_dir, 'XgBoost Model Validation Forecasts')

    date_function_output = _get_training_dates(
        t0=date,
        num_results=num_results,
        forecast_horizon=forecast_horizon,
        offset=offset,
    )

    tlm_base = pd.read_parquet('tlm_base.parquet')
    tank_context = pd.read_csv('tank_context.csv')

    # Change Data Types for Tank ID to Str so both frames merge on equal keys
    tlm_base.TankID = tlm_base.TankID.astype(str)
    tank_context.TankID = tank_context.TankID.astype(str)

    tlm_base_tall = pd.merge(tlm_base,
                             tank_context[['UID', 'TankID']].drop_duplicates(),
                             left_on='TankID',
                             right_on='TankID',
                             how='inner')

    # Compute Consumption from TLM base data
    tlm_consumption = _create_tlm_consumption(tlm_data=tlm_base_tall)
    dates = tlm_consumption.index

    # tqdm added for progress bar
    for window in tqdm(date_function_output):
        start_train = window["start_train"]
        end_train = window["end_train"]
        start_test = window["start_test"]
        end_test = window["end_test"]

        in_train = (dates <= end_train) & (dates >= start_train)
        in_test = (dates <= end_test) & (dates >= start_test)
        tlm_consumption_train = tlm_consumption[in_train].copy()
        tlm_actual_consumption = tlm_consumption[in_test].copy()

        xgboost_forecast_values = supervised_learning_model(
            tlm_consumption=tlm_consumption_train,
            n_in=42,
        )

        # end_train is a day-resolution datetime64, so str() gives YYYY-MM-DD
        year, month, day = str(end_train).split("-")
        # The month level must be part of the path; without it, outputs for
        # the same day of different months land in the same folder and
        # overwrite each other.
        dir_path = os.path.join(output_root, year, month, day)
        os.makedirs(dir_path, exist_ok=True)

        file_path = os.path.join(dir_path, 'XGBoost Model Validation Forecast.csv')
        file_path_actual = os.path.join(dir_path, 'Actual Consumption.csv')
        xgboost_forecast_values.to_csv(file_path, index=False)
        tlm_actual_consumption.to_csv(file_path_actual)


In [None]:
# Run the validation: date='2022-01-27', num_results=18,
# forecast_horizon=41 (42 - 1) days, offset=0.
generate_forecast('2022-01-27',18,42 - 1,0)