In [None]:
import sys

DIR = "/workspaces/ts-forecasting/"

sys.path.append("/workspaces/ts-forecasting")
sys.path.append("/app")

In [None]:
from src.psql import PostgresDB

db = PostgresDB()
df = db.table_to_df("stock_data")
df.head(5)

Unnamed: 0,ticker,ts,open,high,low,close,vwap,transactions
0,AAPL,2024-07-01 08:00:00+00:00,211.89,211.89,211.04,211.09,211.4229,1316
1,AAPL,2024-07-01 09:00:00+00:00,211.09,211.39,210.5,210.99,210.8973,1732
2,AAPL,2024-07-01 10:00:00+00:00,211.19,211.51,211.15,211.15,211.2866,286
3,AAPL,2024-07-01 11:00:00+00:00,211.14,211.66,211.14,211.65,211.4742,1135
4,AAPL,2024-07-01 12:00:00+00:00,211.32,212.2,210.62,211.45,211.2834,4964


In [3]:
from src.ml.data import generate_features

fdf = generate_features(
    df,
    means=[3, 7, 14, 28, 56],
    mns=[10, 30, 50],
    mxs=[10, 30, 50],
    stds=[3, 7, 14, 28, 56],
    lags=[3, 7, 14],
    futs=[3],
    verbose=True,
)
fdf.head(5)

4it [00:00, 44.05it/s]
100%|██████████| 3/3 [00:00<00:00, 170.37it/s]
100%|██████████| 1/1 [00:00<00:00, 173.20it/s]


Unnamed: 0,ticker,ts,open,high,low,close,vwap,transactions,mean_3,mean_7,...,min_10,min_30,min_50,max_10,max_30,max_50,lag_3,lag_7,lag_14,fut_3
0,AAPL,2024-07-01 08:00:00+00:00,211.89,211.89,211.04,211.09,211.4229,1316,,,...,,,,,,,,,,211.4742
1,AAPL,2024-07-01 09:00:00+00:00,211.09,211.39,210.5,210.99,210.8973,1732,,,...,,,,,,,,,,211.2834
2,AAPL,2024-07-01 10:00:00+00:00,211.19,211.51,211.15,211.15,211.2866,286,211.202267,,...,,,,,,,,,,212.9082
3,AAPL,2024-07-01 11:00:00+00:00,211.14,211.66,211.14,211.65,211.4742,1135,211.219367,,...,,,,,,,211.4229,,,214.4637
4,AAPL,2024-07-01 12:00:00+00:00,211.32,212.2,210.62,211.45,211.2834,4964,211.348067,,...,,,,,,,210.8973,,,215.102


In [4]:
import pandas as pd
import numpy as np

last_train_day = "2025-05-01"

fdf["mean_encoding"] = None
fdf["std_encoding"] = None

for t in fdf.ticker.unique():
    fdf.loc[fdf.ticker == t, "mean_encoding"] = fdf[
        (fdf.ts <= last_train_day) & (fdf.ticker == t)
    ]["vwap"].mean()
    
    fdf.loc[fdf.ticker == t, "std_encoding"] = fdf[
        (fdf.ts <= last_train_day) & (fdf.ticker == t)
    ]["vwap"].std()

fdf["mean_encoding"] = fdf["mean_encoding"].astype(np.float32)
fdf["std_encoding"] = fdf["std_encoding"].astype(np.float32)

fdf.columns = pd.Index([str(col) for col in fdf.columns])

train = fdf[fdf.ts < "2025-05-01"]
valid = fdf[(fdf.ts >= "2025-05-01") & (fdf.ts < "2025-06-01")]
test = fdf[fdf.ts >= "2025-06-01"]

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_cols = train.columns.drop(["ticker", "ts", "mean_encoding", "std_encoding"])

train.loc[:, scaled_cols] = scaler.fit_transform(train[scaled_cols])
valid.loc[:, scaled_cols] = scaler.transform(valid[scaled_cols])
test.loc[:, scaled_cols] = scaler.transform(test[scaled_cols])

train, valid, test = [_df.dropna() for _df in [train, valid, test]]

 -0.46803296]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  train.loc[:, scaled_cols] = scaler.fit_transform(train[scaled_cols])
 -0.51979981]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  valid.loc[:, scaled_cols] = scaler.transform(valid[scaled_cols])
 -0.55249905]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  test.loc[:, scaled_cols] = scaler.transform(test[scaled_cols])


In [6]:
from sklearn.metrics import mean_squared_error

import mlflow
import mlflow.sklearn # For logging scikit-learn compatible models
import mlflow.xgboost # For logging XGBoost models specifically
import mlflow.lightgbm # For logging LightGBM models specifically
import mlflow.catboost # For logging CatBoost models specifically


import xgboost as xgb
import lightgbm as lgb
import catboost as cb

features = train.columns.drop(["ticker", "ts", "fut_3"])
target = "fut_3"

X_train, y_train = train[features], train[target]
X_valid, y_valid = valid[features], valid[target]
X_test, y_test = test[features], test[target]

In [7]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    median_absolute_error,
    mean_absolute_percentage_error,
)


def get_or_create_experiment(experiment_name):
    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)


def log_regression_metrics(y_pred, y_test):
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    medae = median_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100

    mlflow.log_metric("rmse_test", rmse)
    mlflow.log_metric("mae_test", mae)
    mlflow.log_metric("r2_test", r2)
    mlflow.log_metric("medae_test", medae)
    mlflow.log_metric("mape_test", mape)

experiment_id = get_or_create_experiment("Benchmarks")
mlflow.set_experiment("Benchmarks")
run_name = "default"


with mlflow.start_run(run_name="Benchmark - Last"):
    log_regression_metrics(X_test["vwap"], y_test)

In [8]:
import numpy as np
import mlflow
import optuna
import xgboost as xgb


def xgb_objective(trial):
    with mlflow.start_run(nested=True):
        xgb_params = {
            "objective": "reg:squarederror",
            "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 30),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0.0, 0.5),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "random_state": 42,
            "n_jobs": -1,
            "tree_method": "hist",
            "early_stopping_rounds": 50,
        }

        model = xgb.XGBRegressor(**xgb_params)

        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False,
        )

        y_pred = model.predict(X_test)
        log_regression_metrics(y_test, y_pred)

        return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))


def lgb_objective(trial):
    lgb_params = {
        "objective": "regression_l2",
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 255),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1,
    }

    model = lgb.LGBMRegressor(**lgb_params)

    early_stopping_rounds = 50
    callbacks = [lgb.early_stopping(early_stopping_rounds, verbose=False)]

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=callbacks,
    )

    y_pred = model.predict(X_test)
    log_regression_metrics(y_test, y_pred)

    return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))


def cb_objective(trial):
    cb_params = {
        "objective": "RMSE",
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "random_seed": 42,
        "verbose": 0,
    }

    model = cb.CatBoostRegressor(**cb_params)

    early_stopping_rounds = 50
    model.fit(
        X_train,
        y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=early_stopping_rounds,
    )

    y_pred = model.predict(X_test)
    log_regression_metrics(y_test, y_pred)

    return np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))


def get_or_create_experiment(experiment_name):
    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)


def run_study(objective, experiment_name, run_name, n_trials=100):
    experiment_id = get_or_create_experiment(experiment_name)
    mlflow.set_experiment(experiment_name)
    run_name = run_name

    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
        study = optuna.create_study(
            direction="minimize",
            study_name=experiment_name,
        )
        study.optimize(
            objective,
            n_trials=n_trials,
        )

        mlflow.log_params(study.best_params)
        mlflow.log_metric("best_rmse", study.best_value)


run_study(xgb_objective, "XGB", "default")
run_study(lgb_objective, "LGB", "default")
run_study(cb_objective, "CB", "default")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-07-02 01:55:27,495] A new study created in memory with name: XGB
[I 2025-07-02 01:55:27,701] Trial 0 finished with value: 0.0310009906749555 and parameters: {'n_estimators': 654, 'learning_rate': 0.17916456258932764, 'max_depth': 18, 'subsample': 0.9757395880376338, 'colsample_bytree': 0.9009580577920515, 'min_child_weight': 3, 'gamma': 0.42023561636685847, 'lambda': 2.028667391833394e-05, 'alpha': 0.16076497604178888}. Best is trial 0 with value: 0.0310009906749555.
[I 2025-07-02 01:55:28,069] Trial 1 finished with value: 0.03174611850225929 and parameters: {'n_estimators': 1734, 'learning_rate': 0.020863699870613418, 'max_depth': 22, 'subsample': 0.7738664119425993, 'colsample_bytree': 0.8489597329110046, 'min_child_weight': 2, 'gamma': 0.4335803043598193, 'lambda': 3.354042958233628e-07, 'alpha': 0.0002868172688913342}. Best is trial 0 with value: 0.0310009906749555.
[I 2025-07-02 01:55:28,276] Trial 2 finished with value: 0.