In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing, load_digits, load_iris
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd

import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Do not truncate frames when they are printed.
pd.set_option('display.max_rows', None)


# --- Load the hourly series ---------------------------------------------------
# Only the timestamp column ('start') and the value column ('mean') are read.
df = pd.read_csv('entities-2023-09-02_17 33 41.csv', usecols=['start', 'mean'])
# df = pd.read_csv('entities-2023-09-06_10 17 37.csv', usecols=['start', 'mean'])
df['start'] = pd.to_datetime(df['start'], format='%Y-%m-%d %H:%M:%S')
df = df.set_index('start')
# Put the data on a regular hourly grid; hours missing from the file become NaN rows.
df = df.asfreq(freq='H')
# Keep the first 8 weeks of hourly rows.
df = df[['mean']].head(8*7*24)
# df = df.tail(14*7*24)

# df['holiday'] = False
# df.loc['2023-08-25 00:00:00':'2023-09-03 00:00:00', 'holiday'] = True

# --- Calendar features --------------------------------------------------------
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)

# Sum of the 24 hourly values that precede the current row.
df['past_24h'] = df['mean'].rolling(24).sum().shift()

# --- Previous-day statistics --------------------------------------------------
# Group on the midnight timestamp of each row. The per-day results are then
# indexed by real timestamps, so assigning them back writes the value on each
# day's 00:00 row and the forward fill spreads it over the rest of that day.
day = df.index.normalize()
daily = df.groupby(day)['mean']
daily_total = daily.sum()

df['yesterday'] = daily_total.shift(1) / 24
df['yesterday'] = df['yesterday'].ffill()
df['yesterday_2'] = daily_total.shift(2) / 24
df['yesterday_2'] = df['yesterday_2'].ffill()
df['yesterday_peak'] = daily.max().shift(1)
df['yesterday_peak'] = df['yesterday_peak'].ffill()

# --- Same-hour lags of 1..7 days ----------------------------------------------
for days_back in range(1, 8):
    df[f'hlag_{days_back}d'] = df['mean'].shift(24 * days_back)

# NOTE(review): despite its name this column holds the previous day's 0.8
# quantile divided by 24 -- confirm which quantile and scaling are intended.
df['yesterday_25'] = daily.quantile(0.8).shift(1) / 24
df['yesterday_25'] = df['yesterday_25'].ffill()

# Running total of the current day *before* the current hour. The shift is done
# inside each day and starts at 0, so the first hour of a day no longer
# receives the previous day's full total.
df['cum_sum_today'] = daily.transform(lambda s: s.cumsum().shift(1, fill_value=0))

# --- Short lags ---------------------------------------------------------------
df['lag_1h'] = df['mean'].shift(1)
df['lag_2h'] = df['mean'].shift(2)
df['lag_3h'] = df['mean'].shift(3)

# Feature columns fed to the model.
exogs = ['hour', 'dayofweek', 'yesterday', 'cum_sum_today', 'lag_1h', 'lag_2h', 'lag_3h']

# Walk-forward evaluation with an expanding training window.
# Each iteration moves one day (24 rows) forward and:
#   * trains on rows [0, train_end),
#   * uses the following `test_period` rows as the early-stopping validation set,
#   * predicts the 24 rows after the validation set and stores them in df['predicted'].
#
# NOTE(review): lag_1h/lag_2h/lag_3h and cum_sum_today for the predicted day are
# built from the observed 'mean' values of that same day, so the stored forecast
# is effectively one step ahead rather than a true 24-hour-ahead forecast --
# confirm this is intended.

# train_period = 24 * 7 * 9
train_period = 24*7*5
test_period = 24*4

# Create the output column up front so each iteration writes its 24 values in
# place instead of rebuilding the whole frame.
df['predicted'] = np.nan

# # for x in range(7*3):
for x in range(0, len(df) - train_period - test_period, 24):
    train_start = 0
    train_end = x + train_period
    test_start = train_end
    test_end = test_start + test_period
    predict_start = test_end
    predict_end = predict_start + 24

    print(f"Training from {train_start} - {train_end - 1}: {df.index[train_start]} - {df.index[train_end - 1]}")

    X_train = df.iloc[train_start:train_end][exogs]
    y_train = df.iloc[train_start:train_end]['mean']

    X_test = df.iloc[test_start:test_end][exogs]
    y_test = df.iloc[test_start:test_end]['mean']

    X_predict = df.iloc[predict_start:predict_end][exogs]

    # `n_estimators` is the scikit-learn wrapper's name for the number of
    # boosting rounds; the previous `num_estimators` keyword is not a
    # recognised parameter, so the model was silently trained with the
    # default number of trees.
    model = xgb.XGBRegressor(verbosity=0, objective="reg:tweedie", n_estimators=500, early_stopping_rounds=100)
    # The validation window is listed last in eval_set.
    model.fit(X_train, y_train, verbose=False, eval_set=[(X_train, y_train), (X_test, y_test)])

    xgb.plot_importance(model, height=0.9)

    # Store the forecast on the timestamps it belongs to.
    df.loc[X_predict.index, 'predicted'] = model.predict(X_predict)



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot

# Evaluate the XGBoost walk-forward forecast: plot actual vs. predicted values
# and print hourly and daily error metrics.

# Only the observed value and the stored forecast are needed from here on.
df = df[['mean', 'predicted']]

# Row position of the first forecasted hour.
forecast_start = train_period + test_period

# Daily totals of actual and predicted values over the forecasted range.
daily_sums = df.iloc[forecast_start:].resample('D').sum()

# A single figure; the extra pyplot.figure() call that used to precede this
# only produced an additional empty figure.
fig, ax = pyplot.subplots(figsize=(15, 5))
# Start the plot 48 rows before the first forecast for context.
df.iloc[forecast_start-48:]['mean'].plot(ax=ax)
df.iloc[forecast_start-48:]['predicted'].plot(ax=ax)
# Daily total error, drawn one day later than the day it belongs to.
(daily_sums['mean'] - daily_sums['predicted']).shift(1).plot(ax=ax, label='daily error (mean - predicted)')
# (df[forecast_start-48:]['mean'] - df[forecast_start-48:]['predicted']).plot(ax=ax)
# ax.fill_between(df.index, df['predicted_lower'], df['predicted_upper'], color='k', alpha=0.1);  
ax.legend()
pyplot.show()


# Hourly MAE, then the daily totals with their MSE and MAE.
print(mean_absolute_error(df.iloc[forecast_start:]['mean'], df.iloc[forecast_start:]['predicted']))
print(daily_sums)
print(mean_squared_error(daily_sums['mean'], daily_sums['predicted']))
print(mean_absolute_error(daily_sums['mean'], daily_sums['predicted']))

In [None]:
from statistics import LinearRegression
from mlforecast import MLForecast
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from window_ops.rolling import rolling_mean, rolling_max, rolling_min

# Walk-forward evaluation with MLForecast using a sliding training window of
# `train_period` rows that advances one day (24 rows) per iteration.

# Remove forecasts stored by the XGBoost loop. If they stayed in the frame they
# would (a) be picked up by MLForecast as an extra exogenous feature and
# (b) stop combine_first() below from storing the new forecasts, because
# combine_first keeps values that are already present.
df = df.drop(columns=['predicted'], errors='ignore')

df['unique_id'] = 'id'
df['ds'] = df.index
# df = df.drop(columns=['past_24h'])
df['past_24h'] = df['mean'].rolling(24).sum().shift()

# Frame handed to MLForecast: series id, timestamp, target and the single
# exogenous column. Any other column would be treated as a feature that must
# also be supplied at prediction time.
series_frame = df[['unique_id', 'ds', 'mean', 'past_24h']].copy()

for x in range(0, len(series_frame) - train_period, 24):
# for x in range(1):
    train_start = x
    train_end = x + train_period

    print(f"Training from {train_start} - {train_end - 1}: {series_frame.index[train_start]} - {series_frame.index[train_end - 1]}")

    train_frame = series_frame.iloc[train_start:train_end]
    # Exogenous values for the forecast horizon. The id and timestamp columns
    # are kept so the values can be joined onto the rows being predicted.
    future_exog = series_frame.iloc[train_end:train_end+24][['unique_id', 'ds', 'past_24h']]

    models = [xgb.XGBRegressor(), HistGradientBoostingRegressor()]

    model = MLForecast(models=models,
                    freq='H',
                    lags=[24,24*7],
                    lag_transforms={
                        2: [
                            (rolling_mean, 24),
                            (rolling_mean, 24*7),
                        ]
                    },
                    date_features=['hour', 'dayofweek'],
                    
                    )

    model.fit(train_frame, id_col='unique_id', time_col='ds', target_col='mean', static_features=[])
    print(future_exog[['past_24h']])
    # dynamic_dfs expects a list of frames, not a single frame.
    pred = model.predict(horizon=24, dynamic_dfs=[future_exog])
    # Only the XGBoost forecast is kept, indexed by timestamp.
    pred = pred.set_index('ds')[['XGBRegressor']]
    pred = pred.rename(columns={'XGBRegressor': 'predicted'})

    # model = xgb.XGBRegressor(verbosity=0, objective="reg:absoluteerror", eval_metric="mae")
    # model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=False)

    # xgb.plot_importance(model, height=0.9)

    # prediction = model.predict(X_test)
    # forecasted_df = pd.DataFrame(prediction, columns=['predicted'], index=pd.date_range(predict_start, predict_start + pd.Timedelta(hours=23), freq='H'))
    df = df.combine_first(pred)



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot

# Evaluate the MLForecast walk-forward forecast: plot actual vs. predicted
# values and print hourly and daily error metrics.

# Only the observed value and the stored forecast are needed from here on.
df = df[['mean', 'predicted']]

# A single figure; the extra pyplot.figure() call that used to precede this
# only produced an additional empty figure.
fig, ax = pyplot.subplots(figsize=(15, 5))
# Start the plot 48 rows before the first forecast for context.
df.iloc[train_period-48:]['mean'].plot(ax=ax)
df.iloc[train_period-48:]['predicted'].plot(ax=ax)
# (df[train_period-48:]['mean'] - df[train_period-48:]['predicted']).plot(ax=ax)
# ax.fill_between(df.index, df['predicted_lower'], df['predicted_upper'], color='k', alpha=0.1);  
ax.legend()
pyplot.show()

# Daily totals of actual and predicted values from the first forecasted row on.
daily_sums = df.iloc[train_period:].resample('D').sum()
# Hourly MAE, then the daily totals with their MSE and MAE.
print(mean_absolute_error(df.iloc[train_period:]['mean'], df.iloc[train_period:]['predicted']))
print(daily_sums)
print(mean_squared_error(daily_sums['mean'], daily_sums['predicted']))
print(mean_absolute_error(daily_sums['mean'], daily_sums['predicted']))