In [1]:
import pandas as pd
import logging
import numpy as np
import ta

logging.basicConfig(level=logging.INFO)

In [2]:
INTERVAL_1_MIN = pd.Timedelta(minutes=1)
INTERVAL_5_MIN = pd.Timedelta(minutes=5)
INTERVAL_15_MIN = pd.Timedelta(minutes=15)
INTERVAL_30_MIN = pd.Timedelta(minutes=30)


In [3]:
merdeg_data_path = "/teamspace/studios/this_studio/informer-validation/data/btc_with_macro.csv"

merged_df = pd.read_csv(merdeg_data_path, parse_dates=["datetime"])

# Hourly price data
df_5min  = merged_df.resample('5min', on='datetime').first().reset_index()


df_5min

Unnamed: 0,datetime,close,high,low,open,volume,fng_value,vix_value,fed_funds_rate
0,2023-01-01 00:00:00,16543.67,16544.76,16538.45,16541.77,83.08143,26.0,21.67,4.33
1,2023-01-01 00:05:00,16536.70,16537.80,16533.94,16534.91,53.58957,26.0,21.67,4.33
2,2023-01-01 00:10:00,16525.34,16530.87,16522.55,16526.67,96.60421,26.0,21.67,4.33
3,2023-01-01 00:15:00,16521.00,16526.84,16519.40,16521.26,114.31289,26.0,21.67,4.33
4,2023-01-01 00:20:00,16536.61,16537.82,16534.15,16534.94,37.63557,26.0,21.67,4.33
...,...,...,...,...,...,...,...,...,...
230683,2025-03-11 23:35:00,82699.67,82699.93,82622.63,82627.82,4.78086,24.0,26.92,4.33
230684,2025-03-11 23:40:00,82731.14,82884.81,82731.14,82880.90,19.86220,24.0,26.92,4.33
230685,2025-03-11 23:45:00,82818.44,82818.45,82732.01,82786.94,11.97104,24.0,26.92,4.33
230686,2025-03-11 23:50:00,82699.99,82700.00,82663.93,82663.93,2.94964,24.0,26.92,4.33


In [4]:
import numpy as np
import pandas as pd
import ta

def preprocess_augment_data(
    data: pd.DataFrame,
    interval: int,                 # minutes per bar, e.g. 5
    horizon_steps: int = 12,       # how many steps ahead to predict (e.g. 12×5min = 1h)
    winsor_days: int = 7,          # rolling window length for winsorization
    eps: float = 1e-8
) -> pd.DataFrame:
    """
    Preprocess BTCUSDT OHLCV features for Informer with strict no-leakage rules.

    Adds:
      - roll_vol_1d          : rolling (past-only) 1-day std of returns
      - ret_volnorm          : returns / roll_vol_1d
      - y_dir, y_mag         : sign and |returns|
      - ret_wins             : returns winsorized with rolling 0.5%/99.5% quantiles
      - target_close_h       : close shifted by -horizon_steps
      - target_ret_volnorm_h : ret_volnorm shifted by -horizon_steps
    """
    assert "close" in data.columns and "datetime" in data.columns, "missing required columns"

    result = data.copy()

    # ---- datetime index (sorted), interpolate forward-in-time only
    result["datetime"] = pd.to_datetime(result["datetime"], utc=False, errors="coerce")
    result = result.set_index("datetime").sort_index()
    result = result.interpolate(method="time", limit_direction="forward")
    result["datetime"] = result.index

    # ---- Required for PyTorch Forecasting
    result["group_id"] = "BTCUSDT"

    # ---- Time-of-day / weekday (categorical)
    result["hour"]    = result["datetime"].dt.hour.astype(str)
    result["weekday"] = result["datetime"].dt.weekday.astype(str)

    # ---- Price ratios (scale-free, less drift-prone)
    result["open_to_close"]  = (result["open"] / result["close"]).replace([np.inf, -np.inf], np.nan)
    result["high_to_close"]  = (result["high"] / result["close"]).replace([np.inf, -np.inf], np.nan)
    result["low_to_close"]   = (result["low"]  / result["close"]).replace([np.inf, -np.inf], np.nan)
    result["high_to_low_price"] = (result["high"] / result["low"]).replace([np.inf, -np.inf], np.nan)

    # ---- Returns (simple + log)
    close = result["close"].astype(float)
    # simple returns r_t = C_t/C_{t-1} - 1 (causal by construction)
    result["returns"] = close.pct_change().fillna(0.0)
    # binary sign
    result["returns_binary"] = (result["returns"] > 0).astype(np.int32)
    # log returns
    result["log_returns"] = np.log(close).diff().fillna(0.0)

    # ---- Realized volatility approximations (past-only)
    # rolling realized vol over 1h, 1d, 7d using cumulative sum of squared log returns
    rr = result["log_returns"].to_numpy()
    rr2_cum = np.cumsum(rr * rr)

    bars_per_hour = max(1, 60 // int(interval))
    obs_1h = bars_per_hour
    obs_1d = 24 * bars_per_hour
    obs_7d = 7 * obs_1d

    def rolling_realized_vol(cum, w):
        if w <= 1:
            return np.zeros_like(cum)
        # sqrt( sum_{t-w+1..t} r^2 )
        out = np.sqrt(cum - np.pad(cum[:-w], (w, 0), "constant", constant_values=(0, 0)))
        # first w-1 entries are sqrt(cum) which is not a true window; set them to nan then forward-fill
        out[:w-1] = np.nan
        return out

    result["vol_1h"] = rolling_realized_vol(rr2_cum, obs_1h)
    result["vol_1d"] = rolling_realized_vol(rr2_cum, obs_1d)
    result["vol_7d"] = rolling_realized_vol(rr2_cum, obs_7d)

    # ---- SMAs/EMAs to close (scale-free ratios)
    result["sma_1h_to_close"] = (ta.trend.sma_indicator(close, window=obs_1h) / close).fillna(0)
    result["ema_1h_to_close"] = (ta.trend.ema_indicator(close, window=obs_1h) / close).fillna(0)
    result["sma_1d_to_close"] = (ta.trend.sma_indicator(close, window=obs_1d) / close).fillna(0)
    result["ema_1d_to_close"] = (ta.trend.ema_indicator(close, window=obs_1d) / close).fillna(0)
    result["sma_7d_to_close"] = (ta.trend.sma_indicator(close, window=obs_7d) / close).fillna(0)

    # ---- MACD / signal
    result["macd"]        = ta.trend.macd(close, window_slow=26, window_fast=12).fillna(0)
    result["macd_signal"] = ta.trend.macd_signal(close, window_slow=26, window_fast=12, window_sign=9).fillna(0)

    # ---- RSI
    result["rsi"] = ta.momentum.rsi(close, window=14).fillna(0)

    # ---- Bollinger to close (scale-free)
    bb_lower = ta.volatility.bollinger_lband(close, window=20, window_dev=2)
    bb_upper = ta.volatility.bollinger_hband(close, window=20, window_dev=2)
    bb_mid   = ta.volatility.bollinger_mavg(close,  window=20)
    result["low_bband_to_close"] = (bb_lower / close).fillna(0)
    result["up_bband_to_close"]  = (bb_upper / close).fillna(0)
    result["mid_bband_to_close"] = (bb_mid   / close).fillna(0)

    # ======================================================================
    # New engineered targets (causal, past-only)
    # ======================================================================

    # 1) roll_vol_1d: *std of simple returns* over 1 day (past-only)
    #    std is more interpretable for normalization than realized-vol sqrt(sum r^2).
    roll_vol_1d = result["returns"].rolling(window=obs_1d, min_periods=obs_1d//2).std(ddof=0)
    result["roll_vol_1d"] = roll_vol_1d.replace(0, np.nan)

    # 2) ret_volnorm: returns normalized by roll_vol_1d
    result["ret_volnorm"] = (result["returns"] / (result["roll_vol_1d"] + eps)).replace([np.inf, -np.inf], np.nan)

    # 3) y_dir / y_mag
    result["y_dir"] = np.sign(result["returns"]).astype(int)
    result["y_mag"] = np.abs(result["returns"])

    # 4) ret_wins: rolling winsorization (avoid look-ahead)
    #    Use a large past window (winsor_days) to compute per-time low/high quantiles.
    wins_w = max(obs_1d, int(winsor_days) * obs_1d)
    low_q  = result["returns"].rolling(window=wins_w, min_periods=obs_1d).quantile(0.005)
    high_q = result["returns"].rolling(window=wins_w, min_periods=obs_1d).quantile(0.995)
    result["ret_wins"] = result["returns"].clip(lower=low_q, upper=high_q)

    # ======================================================================
    # Targets with explicit horizon (no leakage)
    # ======================================================================

    h = int(horizon_steps)
    # Price target H steps ahead
    result["target_close_h"] = result["close"].shift(-h)
    # Vol-normalized return target H steps ahead
    result["target_ret_volnorm_h"] = result["ret_volnorm"].shift(-h)

    # ---- Clean up NaNs created by rolling/shift
    result = result.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

    # ---- Required integer time index for PyTorch Forecasting
    result["time_index"] = np.arange(len(result), dtype=np.int64)

    # sanity checks
    assert result["time_index"].is_unique
    assert not result.isna().any().any()

    return result


df_feat = preprocess_augment_data(df_5min, interval=5, horizon_steps=12, winsor_days=7)

df_feat.to_csv("/teamspace/studios/this_studio/informer-validation/data/btcusdt_5min_features_v2.csv", index=False)
df_feat

Unnamed: 0_level_0,close,high,low,open,volume,fng_value,vix_value,fed_funds_rate,datetime,group_id,...,up_bband_to_close,mid_bband_to_close,roll_vol_1d,ret_volnorm,y_dir,y_mag,ret_wins,target_close_h,target_ret_volnorm_h,time_index
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-07 23:55:00,16945.11,16945.26,16944.72,16945.03,25.79317,25.0,21.13,4.33,2023-01-07 23:55:00,BTCUSDT,...,1.000134,0.999707,0.000222,0.628491,1,0.000139,0.000139,16924.50,-0.561657,0
2023-01-08 00:00:00,16947.53,16950.49,16943.21,16943.83,224.68365,25.0,21.13,4.33,2023-01-08 00:00:00,BTCUSDT,...,1.000044,0.999602,0.000222,0.644145,1,0.000143,0.000143,16923.80,-0.186161,1
2023-01-08 00:05:00,16951.22,16952.87,16950.72,16951.70,58.72817,25.0,21.13,4.33,2023-01-08 00:05:00,BTCUSDT,...,0.999923,0.999431,0.000220,0.990502,1,0.000218,0.000218,16930.82,1.872567,2
2023-01-08 00:10:00,16948.35,16952.35,16947.11,16951.68,122.41208,25.0,21.13,4.33,2023-01-08 00:10:00,BTCUSDT,...,1.000114,0.999644,0.000220,-0.769803,-1,0.000169,-0.000169,16932.80,0.527890,3
2023-01-08 00:15:00,16945.86,16947.59,16943.00,16945.39,107.11774,25.0,21.13,4.33,2023-01-08 00:15:00,BTCUSDT,...,1.000253,0.999824,0.000220,-0.668907,-1,0.000147,-0.000147,16933.01,0.056003,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-11 22:35:00,83055.65,83104.52,83055.64,83104.52,8.55873,24.0,26.92,4.33,2025-03-11 22:35:00,BTCUSDT,...,1.002660,1.000519,0.002691,-0.087711,-1,0.000236,-0.000236,82699.67,0.113595,228656
2025-03-11 22:40:00,83074.02,83077.53,82983.33,82996.41,15.15600,24.0,26.92,4.33,2025-03-11 22:40:00,BTCUSDT,...,1.001980,1.000447,0.002690,0.082219,1,0.000221,0.000221,82731.14,0.142553,228657
2025-03-11 22:45:00,83082.83,83141.64,83078.20,83094.03,4.92149,24.0,26.92,4.33,2025-03-11 22:45:00,BTCUSDT,...,1.001848,1.000373,0.002690,0.039422,1,0.000106,0.000106,82818.44,0.395449,228658
2025-03-11 22:50:00,83065.05,83088.38,83049.21,83049.21,4.49744,24.0,26.92,4.33,2025-03-11 22:50:00,BTCUSDT,...,1.002066,1.000575,0.002685,-0.079711,-1,0.000214,-0.000214,82699.99,-0.537039,228659


In [29]:
df_feat['check'] = df_feat['close'].shift(-5)

df_feat[['datetime', 'close', 'check', 'target']].head(10)

Unnamed: 0_level_0,datetime,close,check,target
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-01-01 00:00:00,2023-01-01 00:00:00,16543.67,16541.44,16525.34
2023-01-01 00:05:00,2023-01-01 00:05:00,16536.7,16537.29,16521.0
2023-01-01 00:10:00,2023-01-01 00:10:00,16525.34,16524.14,16536.61
2023-01-01 00:15:00,2023-01-01 00:15:00,16521.0,16523.64,16541.44
2023-01-01 00:20:00,2023-01-01 00:20:00,16536.61,16519.03,16537.29
2023-01-01 00:25:00,2023-01-01 00:25:00,16541.44,16527.22,16524.14
2023-01-01 00:30:00,2023-01-01 00:30:00,16537.29,16532.21,16523.64
2023-01-01 00:35:00,2023-01-01 00:35:00,16524.14,16528.04,16519.03
2023-01-01 00:40:00,2023-01-01 00:40:00,16523.64,16531.35,16527.22
2023-01-01 00:45:00,2023-01-01 00:45:00,16519.03,16540.56,16532.21
