In [None]:
import numpy as np
from sklearn.datasets import fetch_california_housing, load_digits, load_iris
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd
from statsmodels.graphics.tsaplots import plot_pacf

import xgboost as xgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_rows', None)


# Load the hourly export; only the timestamp and the hourly mean are read.
# (Alternative export: 'entities-2023-09-06_10 17 37.csv'.)
df = (
    pd.read_csv('entities-2023-09-02_17 33 41.csv', usecols=['start', 'mean'])
    .assign(start=lambda raw: pd.to_datetime(raw['start'], format='%Y-%m-%d %H:%M:%S'))
    .set_index('start')
    .asfreq(freq='H')  # regular hourly grid; hours absent from the file become NaN rows
    .tail(9*7*24)      # keep the last 9*7*24 hourly rows (9 weeks)
)

# Partial autocorrelation of the hourly series for lags up to 24 hours.
plot_pacf(df["mean"], lags=24)

# --- Feature engineering -------------------------------------------------
# Calendar features derived from the hourly DatetimeIndex.
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)

# Sum of the 24 hourly values before each row; shift() excludes the row itself.
df['past_24h'] = df['mean'].rolling(24).sum().shift()

# Total of the PREVIOUS calendar day, repeated on every hour of the current day.
# shift(1) is deliberate: a negative shift would attach the following day's
# total, i.e. future information about the target being forecast.
# Rows on the first day in the frame have no previous day and stay NaN.
daily_total = df['mean'].resample('D').sum()
df['yesterday'] = daily_total.shift(1).reindex(df.index.normalize()).to_numpy()

# isocalendar() returns a nullable UInt32 column; cast to a plain integer.
# NOTE(review): presumably some xgboost versions reject nullable dtypes - confirm.
df['week_of_year'] = df.index.isocalendar().week.astype(int)
df['lag_1h'] = df['mean'].shift(1)

# Columns fed to the model as exogenous features.
exogs = ['hour', 'yesterday', 'dayofweek', 'week_of_year']

# --- Walk-forward evaluation ---------------------------------------------
# Fit a fresh model on a sliding window of `train_period` hourly rows, forecast
# the following (up to) 24 rows, then slide the window forward by 24 rows.
train_period = 24*7*6  # rows per training window (6 weeks of hourly data)

forecasts = []  # one Series of predictions per forecast window
for x in range(0, len(df) - train_period, 24):
    train_start = x
    train_end = x + train_period

    print(f"Training from {train_start} - {train_end - 1}: {df.iloc[train_start].name} - {df.iloc[train_end - 1].name}")

    train_window = df.iloc[train_start:train_end]
    test_window = df.iloc[train_end:train_end + 24]

    X_train, y_train = train_window[exogs], train_window['mean']
    X_test = test_window[exogs]

    # No eval_set: nothing reads the evaluation history and no early stopping
    # is configured, so evaluating on the test fold would only cost time.
    model = xgb.XGBRegressor(verbosity=0, objective="reg:absoluteerror", reg_alpha=0.1)
    model.fit(X_train, y_train)

    # Index the forecast by the rows actually predicted, so a final window
    # shorter than 24 rows cannot cause a length mismatch.
    prediction = model.predict(X_test)
    forecasts.append(pd.Series(prediction, index=X_test.index, name='predicted'))

# Attach all forecasts in one aligned assignment instead of rebuilding the
# frame on every iteration. Rows that were never forecast stay NaN.
df['predicted'] = pd.concat(forecasts) if forecasts else np.nan







In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from matplotlib import pyplot

# Keep only the actual and forecast series for evaluation.
df = df[['mean', 'predicted']]

# Plot from 48 rows (two days) before the first forecast row onwards.
# Only subplots() is called: a preceding pyplot.figure() would open a second,
# empty figure that is rendered as a blank output.
plot_window = df[train_period-48:]
fig, ax = pyplot.subplots(figsize=(15, 5))
plot_window['mean'].plot(ax=ax)
plot_window['predicted'].plot(ax=ax)
ax.legend()
pyplot.show()

# Score only rows that have both an actual and a predicted value: the sklearn
# metrics raise on NaN, and the hourly grid may contain NaN actuals.
scored = df[train_period:].dropna()
daily_sums = scored.resample('D').sum()

print(mean_absolute_error(scored['mean'], scored['predicted']))  # hourly MAE
print(daily_sums)
print(mean_squared_error(daily_sums['mean'], daily_sums['predicted']))   # MSE of daily totals
print(mean_absolute_error(daily_sums['mean'], daily_sums['predicted']))  # MAE of daily totals

In [None]:

import mlflow
from numpy import fmin
from sklearn.metrics import roc_auc_score 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope

# Reload the hourly export so this cell does not depend on how earlier cells
# left `df`. (Alternative export: 'entities-2023-09-06_10 17 37.csv'.)
df = pd.read_csv('entities-2023-09-02_17 33 41.csv', usecols=['start', 'mean'])
df['start'] = pd.to_datetime(df['start'], format='%Y-%m-%d %H:%M:%S')
df = df.set_index('start')
df = df.asfreq(freq='H')  # regular hourly grid; hours absent from the file become NaN rows
df = df.tail(9*7*24)      # keep the last 9*7*24 hourly rows (9 weeks)

# Calendar features derived from the hourly DatetimeIndex.
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek
# pandas numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = (df.index.dayofweek >= 5).astype(int)

# Total of the PREVIOUS calendar day, repeated on every hour of the current day.
# shift(1) is deliberate: a negative shift would attach the following day's
# total, i.e. future information about the target.
# Rows on the first day in the frame have no previous day and stay NaN.
daily_total = df['mean'].resample('D').sum()
df['yesterday'] = daily_total.shift(1).reindex(df.index.normalize()).to_numpy()

# isocalendar() returns a nullable UInt32 column; cast to a plain integer.
# NOTE(review): presumably some xgboost versions reject nullable dtypes - confirm.
df['week_of_year'] = df.index.isocalendar().week.astype(int)

# Let MLflow record xgboost training runs automatically.
mlflow.xgboost.autolog(silent=True)

# Chronological split: the first `train_period` rows train the model and the
# next `test_period` rows evaluate it.
train_period = 24*7*4
test_period = 24*7*3

train = df[:train_period]
# The end bound must be train_period + test_period; slicing
# [train_period:train_period] would select zero rows.
test = df[train_period:train_period + test_period]

# Every column except the target is used as a feature.
X_train = train.drop(columns="mean")
X_test = test.drop(columns="mean")
y_train = train["mean"]
y_test = test["mean"]

# `train` / `test` are rebound to DMatrix objects; train_model reads them
# under these names.
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

# Hyperopt search space for the xgboost booster.
# The target ('mean') is continuous, so a regression objective and metric are
# used, matching the XGBRegressor objective used for the walk-forward forecast.
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 100)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'reg:absoluteerror',
    'eval_metric': 'mae',
    'seed': 123,
}

def train_model(params):
    """Train one booster with `params` and report its test MAE to Hyperopt.

    Reads the module-level `train` / `test` DMatrix objects and the `y_test`
    Series.

    Args:
        params: xgboost parameter dict sampled from `search_space`.

    Returns:
        dict with 'status' (STATUS_OK), 'loss' (mean absolute error on the
        test rows, which fmin minimises) and 'booster' (booster.attributes()).
    """
    with mlflow.start_run(nested=True):
        # Stop adding trees once the test MAE has not improved for 50 rounds.
        booster = xgb.train(params=params, dtrain=train, num_boost_round=5000, evals=[(test, "test")], early_stopping_rounds=50, verbose_eval=False)

        # MAE is already a "lower is better" score, so it is returned as the
        # loss without negation.
        predictions_test = booster.predict(test)
        mae = float((y_test - predictions_test).abs().mean())

        return {'status': STATUS_OK, 'loss': mae, 'booster': booster.attributes()}
    
# Run 25 TPE evaluations of train_model inside a parent MLflow run named
# 'initial_search'; each evaluation opens its own nested run.
# NOTE(review): no rstate is passed, so the sampled trials are presumably not
# reproducible from run to run - confirm whether a fixed seed is wanted.
with mlflow.start_run(run_name='initial_search'):
    best_params = fmin(train_model, search_space, algo=tpe.suggest, max_evals=25)

best_params