In [None]:
import pandas as pd
from sktime.transformations.series.outlier_detection import HampelFilter

# Show every row when a frame/series is displayed (no truncation).
pd.set_option('display.max_rows', None)

# Load the export, index it by timestamp, put it on a regular hourly grid
# (hours missing from the file become NaN rows) and keep the most recent
# 9 weeks of hourly rows.
# Earlier export, kept for reference: 'entities-2023-09-02_17 33 41.csv'
df = (
    pd.read_csv('entities-2023-09-06_10 17 37.csv', usecols=['start', 'mean'])
    .assign(start=lambda raw: pd.to_datetime(raw['start'], format='%Y-%m-%d %H:%M:%S'))
    .set_index('start')
    .asfreq(freq='H')
    .tail(9*7*24)
    .assign(
        # Hour of day (0-23) of each row's timestamp.
        hour=lambda hourly: hourly.index.hour,
        # NOTE: despite its name, this flag is True on Saturday/Sunday
        # (pandas weekday: Monday=0 ... Sunday=6) and False on working days.
        weekday=lambda hourly: hourly.index.weekday >= 5,
    )
)

# Walk-forward backtest: slide a two-week training window forward one day at
# a time and predict the 24 hours that follow it. The prediction for each
# hour of day is the mean of the last two training observations of that hour,
# taken only from days of the same type (weekend vs. working day) as the
# first predicted timestamp. Results end up in a new 'predicted' column of df.
train_period = 24 * 7 * 2  # training window length in hourly rows (two weeks)

predictions = []
for train_start in range(0, len(df) - train_period, 24):
    train_end = train_start + train_period  # position of the first predicted row
    train_start_date = df.index[train_start]
    # Label-based slices include BOTH endpoints, so the window has to stop at
    # the row before the forecast start; otherwise the actual value of the
    # first predicted hour leaks into its own prediction.
    train_last_date = df.index[train_end - 1]
    predict_start = df.index[train_end]
    predict_index = pd.date_range(predict_start, predict_start + pd.Timedelta(hours=23), freq='H')

    print(f"Training from {train_start} - {train_end - 1}: {train_start_date} - {train_last_date}")

    # The 'weekday' column is True on Saturday/Sunday (see where it is built).
    # NOTE(review): the day type is taken from predict_start only; if the data
    # does not start at midnight the 24 predicted hours span two calendar days.
    predict_is_weekend = predict_start.weekday() >= 5
    window = df.loc[train_start_date:train_last_date]
    same_day_type = window[window['weekday'] == predict_is_weekend]

    # Series indexed by hour of day (group keys come back sorted 0..23).
    hourly_profile = same_day_type.groupby('hour')['mean'].apply(
        lambda values: values.tail(2).mean()
    )

    # Look each predicted timestamp up by its own hour of day instead of
    # relying on position, so the forecast stays aligned when predict_start is
    # not midnight; an hour absent from the training window yields NaN.
    predictions.append(
        pd.DataFrame(
            {'predicted': hourly_profile.reindex(predict_index.hour).to_numpy()},
            index=predict_index,
        )
    )

# Merge all forecasts in one pass rather than realigning df on every iteration.
if predictions:
    df = df.combine_first(pd.concat(predictions))


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot

# Plot actual vs. predicted values, starting two days before the first
# prediction. A single subplots() call is enough; a preceding pyplot.figure()
# would only leave an extra empty figure in the output.
fig, ax = pyplot.subplots(figsize=(15, 5))
plot_window = df.iloc[train_period - 48:]
plot_window['mean'].plot(ax=ax)
plot_window['predicted'].plot(ax=ax)
ax.legend()
pyplot.show()

# Score only rows where both the actual and the predicted value exist: the
# hourly grid may contain NaN rows for hours missing from the source data, and
# the sklearn metrics raise on NaN input.
evaluated = df.iloc[train_period:][['mean', 'predicted']].dropna()

# Daily totals of the two compared columns only (the 'hour'/'weekday' helper
# columns are not meaningful to sum). min_count=1 plus dropna() keeps days
# without any scored row from appearing as spurious 0-vs-0 perfect matches.
daily_sums = evaluated.resample('D').sum(min_count=1).dropna()

print(mean_absolute_error(evaluated['mean'], evaluated['predicted']))  # hourly MAE
print(daily_sums)
print(mean_squared_error(daily_sums['mean'], daily_sums['predicted']))  # daily-total MSE
print(mean_absolute_error(daily_sums['mean'], daily_sums['predicted']))  # daily-total MAE

In [None]:
from matplotlib import pyplot

# Run a Hampel filter (48-row sliding window, 2-sigma threshold) over the
# actual values and store the result next to them in a 'hampel' column.
# NOTE(review): with default settings sktime's HampelFilter presumably replaces
# detected outliers with NaN rather than returning a boolean mask - confirm.
transformer = HampelFilter(window_length=48, n_sigma=2)
y_hat = transformer.fit_transform(df['mean'])
df['hampel'] = y_hat

# Plot the first three weeks of raw vs. filtered values. A single subplots()
# call is enough; a preceding pyplot.figure() would only leave an extra empty
# figure in the output.
fig, ax = pyplot.subplots(figsize=(15, 5))
first_three_weeks = df.iloc[:7*24*3]
first_three_weeks['mean'].plot(ax=ax)
first_three_weeks['hampel'].plot(ax=ax)
ax.legend()
pyplot.show()