
# DOGE-USD Daily Forecast (2017–2025) with Transformer — Return Target

This notebook updates the earlier pipeline to:
- **Fetch data** from Yahoo Finance through **2025-06-30**
- Predict **next-day log-return** and then **reconstruct price**
- Avoid label leakage by **splitting before label/windowing**
- Add **level-invariant features** (EMA ratios, z-score of log-price)
- Evaluate with **sMAPE** and **directional accuracy**, alongside MAE/RMSE
- Keep a compact **Transformer** with positional embeddings, pooling head, dropout, L2, gradient clipping

> If you're offline or prefer a local file, set `USE_YFINANCE=False` and place a CSV named `DOGE-USD.csv` in the same folder.


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install numpy pandas scikit-learn tensorflow matplotlib



## Setup & Imports

In [20]:

# If needed on first run:
# !pip install --quiet yfinance numpy pandas scikit-learn tensorflow matplotlib

import os
import math
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from datetime import datetime


print("TensorFlow:", tf.__version__)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


TensorFlow: 2.19.0


## Parameters

In [4]:

# ==== Data fetch ====
USE_YFINANCE = False
SYMBOL = "DOGE-USD"
START_DATE = "2017-01-01"
END_DATE = "2025-06-30"
CSV_PATH = "/content/drive/MyDrive/doge_forecast/doge_usd_2017_2025.csv"
# ==== BTC data (exogenous) ====
BTC_CSV_PATH = "/content/drive/MyDrive/doge_forecast/BTC_2016_2025.csv"  # <- change to your path


# ==== Save dir ====
SAVE_DIR = "/content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer"
os.makedirs(SAVE_DIR, exist_ok=True)

# ==== Splits ====
TRAIN_END = "2023-12-31"
VAL_END   = "2024-12-31"   # test is 2025-01-01 .. 2025-06-30

# ==== Modeling ====
SEQ_LEN = 120
BATCH_SIZE = 32
EPOCHS = 120
LEARNING_RATE = 1e-3
L2_REG = 1e-4
D_MODEL = 96
NUM_HEADS = 4
NUM_LAYERS = 2
FF_DIM = 192
DROPOUT_RATE = 0.25
CLIPNORM = 1.0
PATIENCE = 8

# ==== Scaling ====
USE_ROBUST_SCALER = True  # (otherwise StandardScaler)


## Turn on Mixed Precision


In [5]:
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")


In [6]:

tf.config.list_physical_devices('GPU')


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Load Data (Yahoo Finance, fallback to CSV) & Sanity Checks

In [7]:

if USE_YFINANCE:
    try:
        import yfinance as yf
        df = yf.download(SYMBOL, start=START_DATE, end=END_DATE, interval="1d", auto_adjust=False, progress=False)
        df = df.reset_index().rename(columns=str.lower)
        df = df.rename(columns={
            "adj close": "adj_close"
        })
        df.to_csv(CSV_PATH, index=False)
        print(f"Downloaded {len(df)} rows and saved to {CSV_PATH}")
    except Exception as e:
        print("yfinance failed, falling back to local CSV. Error:", e)
        df = pd.read_csv(CSV_PATH if os.path.exists(CSV_PATH) else "DOGE-USD.csv")
else:
    df = pd.read_csv(CSV_PATH if os.path.exists(CSV_PATH) else "DOGE-USD.csv")

# Standardize columns
df.columns = [c.strip().replace(' ', '_').lower() for c in df.columns]
if "date" not in df.columns:
    # yfinance uses 'Date' index sometimes; ensure it's present
    if "datetime" in df.columns:
        df = df.rename(columns={"datetime":"date"})
    else:
        raise ValueError("No 'date' column found.")

df["date"] = pd.to_datetime(df["date"].astype(str).str.strip(),
                            format="%d-%m-%y", utc=True)
df = df.dropna(subset=["date"]).sort_values("date").drop_duplicates(subset=["date"]).reset_index(drop=True)

# Expect OHLCV
expected = {'date','open','high','low','close','volume'}
if not expected.issubset(set(df.columns)):
    raise ValueError(f"CSV must include at least {expected}. Got: {set(df.columns)}")

# Core NaN drop
core_cols = ['open','high','low','close','volume']
n0 = len(df)
df = df.dropna(subset=core_cols)
print(f"Dropped {n0 - len(df)} rows with critical NaNs. Final rows: {len(df)}")

# Sanity: High >= Low; nonnegative
bad = df['high'] < df['low']
if bad.any():
    b = bad.sum()
    print(f"Fixing {b} rows with high<low by swapping values.")
    df.loc[bad, ['high','low']] = df.loc[bad, ['low','high']].values

df.head()


Dropped 0 rows with critical NaNs. Final rows: 3147


Unnamed: 0,date,open,high,low,close,volume
0,2017-01-01 00:00:00+00:00,0.000223,0.000228,0.000221,0.000228,136829.0
1,2017-01-02 00:00:00+00:00,0.000228,0.00023,0.000222,0.000226,195520.0
2,2017-01-03 00:00:00+00:00,0.000224,0.000228,0.000222,0.000227,132352.0
3,2017-01-04 00:00:00+00:00,0.000227,0.000239,0.000225,0.000236,442095.0
4,2017-01-05 00:00:00+00:00,0.000233,0.000257,0.000221,0.000229,994730.0


## BTC Loader

In [8]:
# Load BTC daily data and clean (2016-01-01 -> 2025-08-13 recommended)
btc = pd.read_csv(BTC_CSV_PATH)
btc.columns = [c.strip().replace(' ', '_').lower() for c in btc.columns]

required = {'date','open','high','low','close','volume'}
if not required.issubset(set(btc.columns)):
    raise ValueError(f"BTC CSV must include {required}, got {set(btc.columns)}")

# Robust day-first parsing (works for 2- or 4-digit years)
s = btc["date"].astype(str).str.strip()
d = pd.to_datetime(s, format="%d-%m-%Y", errors="coerce", utc=True)
mask = d.isna()
d[mask] = pd.to_datetime(s[mask], format="%d-%m-%y", errors="coerce", utc=True)
btc["date"] = d

btc = (btc
       .dropna(subset=["date"])
       .sort_values("date")
       .drop_duplicates(subset=["date"])
       .reset_index(drop=True))

# Fix any high<low anomalies
bad = btc["high"] < btc["low"]
if bad.any():
    btc.loc[bad, ["high","low"]] = btc.loc[bad, ["low","high"]].values

# Drop rows with missing core fields
n0 = len(btc)
btc = btc.dropna(subset=["open","high","low","close","volume"]).reset_index(drop=True)
print(f"[BTC] Dropped {n0 - len(btc)} rows with critical NaNs. Rows now: {len(btc)}")
print("BTC range:", btc["date"].min(), "->", btc["date"].max())
btc.head()


[BTC] Dropped 0 rows with critical NaNs. Rows now: 3513
BTC range: 2016-01-01 00:00:00+00:00 -> 2025-08-13 00:00:00+00:00


Unnamed: 0,date,open,high,low,close,volume
0,2016-01-01 00:00:00+00:00,430.721008,436.246002,427.515015,434.334015,36278900.0
1,2016-01-02 00:00:00+00:00,434.622009,436.062012,431.869995,433.437988,30096600.0
2,2016-01-03 00:00:00+00:00,433.578003,433.743012,424.705994,430.010986,39633800.0
3,2016-01-04 00:00:00+00:00,430.061005,434.516998,429.084015,433.091003,38477500.0
4,2016-01-05 00:00:00+00:00,433.069,434.182007,429.675995,431.959992,34522600.0


## Feature Engineering (Level-invariant signals)

In [9]:

def ema(series: pd.Series, span: int) -> pd.Series:
    return series.ewm(span=span, adjust=False).mean()

def rsi(series: pd.Series, period: int = 14) -> pd.Series:
    delta = series.diff()
    gain = (delta.where(delta > 0, 0.0)).ewm(alpha=1/period, adjust=False).mean()
    loss = (-delta.where(delta < 0, 0.0)).ewm(alpha=1/period, adjust=False).mean()
    rs = gain / (loss + 1e-9)
    return 100.0 - (100.0 / (1.0 + rs))

def macd(series: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    ema_fast = ema(series, fast)
    ema_slow = ema(series, slow)
    macd_line = ema_fast - ema_slow
    signal_line = ema(macd_line, signal)
    hist = macd_line - signal_line
    return macd_line, signal_line, hist

def atr(high: pd.Series, low: pd.Series, close: pd.Series, period: int = 14) -> pd.Series:
    prev_close = close.shift(1)
    tr = pd.concat([
        (high - low).abs(),
        (high - prev_close).abs(),
        (low - prev_close).abs()
    ], axis=1).max(axis=1)
    return tr.rolling(window=period, min_periods=1).mean()

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out['log_close'] = np.log(out['close'].astype(float).clip(lower=1e-12))
    out['log_ret_1d'] = out['log_close'].diff()

    # EMAs for ratios
    out['ema_20'] = ema(out['close'], 20)
    out['ema_50'] = ema(out['close'], 50)
    out['ratio_close_ema20'] = out['close'] / (out['ema_20'] + 1e-9)
    out['ratio_close_ema50'] = out['close'] / (out['ema_50'] + 1e-9)

    # Log-price z-score (60d)
    lc_rolling_mean = out['log_close'].rolling(60).mean()
    lc_rolling_std  = out['log_close'].rolling(60).std()
    out['z_logclose_60'] = (out['log_close'] - lc_rolling_mean) / (lc_rolling_std + 1e-9)

    # Ranges and spreads
    out['hl_range_pct'] = (out['high'] - out['low']) / out['close'].shift(1).replace(0, np.nan)
    out['oc_change_pct'] = (out['close'] - out['open']) / out['open'].replace(0, np.nan)

    # Rolling stats on returns
    out['ret_mean_5'] = out['log_ret_1d'].rolling(5).mean()
    out['ret_std_5']  = out['log_ret_1d'].rolling(5).std()
    out['ret_mean_20'] = out['log_ret_1d'].rolling(20).mean()
    out['ret_std_20']  = out['log_ret_1d'].rolling(20).std()

    # ATR, RSI, MACD
    out['atr_14'] = atr(out['high'], out['low'], out['close'], period=14)
    out['rsi_14'] = rsi(out['close'], period=14)
    macd_line, signal_line, hist = macd(out['close'])
    out['macd'] = macd_line
    out['macd_signal'] = signal_line
    out['macd_hist'] = hist

    # Volume features
    out['log_volume'] = np.log1p(out['volume'].astype(float).clip(lower=0))
    out['vol_z_20'] = (out['volume'] - out['volume'].rolling(20).mean()) / (out['volume'].rolling(20).std() + 1e-9)

    # Calendar
    out['dow'] = out['date'].dt.dayofweek
    out['dow_sin'] = np.sin(2 * np.pi * out['dow'] / 7.0)
    out['dow_cos'] = np.cos(2 * np.pi * out['dow'] / 7.0)

    out = out.dropna().reset_index(drop=True)
    return out

feat_df = add_features(df)

feature_cols = [
    'open','high','low','close','volume',
    'ema_20','ema_50','ratio_close_ema20','ratio_close_ema50','z_logclose_60',
    'log_ret_1d','hl_range_pct','oc_change_pct',
    'ret_mean_5','ret_std_5','ret_mean_20','ret_std_20',
    'atr_14','rsi_14','macd','macd_signal','macd_hist',
    'log_volume','vol_z_20','dow_sin','dow_cos'
]

print("Number of features:", len(feature_cols))
feat_df[['date'] + feature_cols[:6]].head()


Number of features: 26


Unnamed: 0,date,open,high,low,close,volume,ema_20
0,2017-03-01 00:00:00+00:00,0.000206,0.000209,0.000201,0.000203,136870.0,0.000206
1,2017-03-02 00:00:00+00:00,0.000205,0.00021,0.000202,0.000206,203994.0,0.000206
2,2017-03-03 00:00:00+00:00,0.000205,0.000213,0.000202,0.000211,146345.0,0.000206
3,2017-03-04 00:00:00+00:00,0.000211,0.000217,0.000195,0.000205,94177.0,0.000206
4,2017-03-05 00:00:00+00:00,0.000204,0.000211,0.000202,0.000206,69489.8,0.000206


## BTC Feature Engineering + Merge Cell

In [10]:
# ---- BTC features (level-invariant) ----
btc_feat = btc.copy()

# Reuse helpers from earlier cell: ema(...), etc.
btc_feat["btc_log_close"] = np.log(btc_feat["close"].astype(float).clip(lower=1e-12))
btc_feat["btc_log_ret_1d"] = btc_feat["btc_log_close"].diff()

btc_feat["btc_ema_20"] = ema(btc_feat["close"], 20)
btc_feat["btc_ema_50"] = ema(btc_feat["close"], 50)
btc_feat["btc_ratio_close_ema20"] = btc_feat["close"] / (btc_feat["btc_ema_20"] + 1e-9)
btc_feat["btc_ratio_close_ema50"] = btc_feat["close"] / (btc_feat["btc_ema_50"] + 1e-9)

lc_mean_60 = btc_feat["btc_log_close"].rolling(60).mean()
lc_std_60  = btc_feat["btc_log_close"].rolling(60).std()
btc_feat["btc_z_logclose_60"] = (btc_feat["btc_log_close"] - lc_mean_60) / (lc_std_60 + 1e-9)

btc_feat["btc_ret_mean_5"] = btc_feat["btc_log_ret_1d"].rolling(5).mean()
btc_feat["btc_ret_std_20"]  = btc_feat["btc_log_ret_1d"].rolling(20).std()

btc_feat["btc_log_volume"] = np.log1p(btc_feat["volume"].astype(float).clip(lower=0))
btc_feat["btc_vol_z_20"]   = (btc_feat["volume"] - btc_feat["volume"].rolling(20).mean()) / (btc_feat["volume"].rolling(20).std() + 1e-9)

# Keep just the columns we need
btc_keep = [
    "date",
    "btc_log_close", "btc_log_ret_1d",
    "btc_ema_20","btc_ema_50","btc_ratio_close_ema20","btc_ratio_close_ema50",
    "btc_z_logclose_60","btc_ret_mean_5","btc_ret_std_20",
    "btc_log_volume","btc_vol_z_20"
]
btc_feat = btc_feat[btc_keep].dropna().reset_index(drop=True)

# ---- Merge DOGE + BTC on date ----
merged = feat_df.merge(btc_feat, on="date", how="inner")

# ---- Cross-asset features (no leakage: past-only rolls) ----
merged["log_spread_doge_btc"] = merged["log_close"] - merged["btc_log_close"]
merged["ret_corr_20"] = merged["log_ret_1d"].rolling(20).corr(merged["btc_log_ret_1d"])

# Final feature frame (drop early NaNs from rolls)
feat_df = merged.dropna().reset_index(drop=True)

# Extend feature_cols with btc + cross features
btc_feature_cols = [
    "btc_log_ret_1d","btc_ret_mean_5","btc_ret_std_20",
    "btc_ratio_close_ema20","btc_ratio_close_ema50","btc_z_logclose_60",
    "btc_log_volume","btc_vol_z_20"
]
cross_cols = ["log_spread_doge_btc","ret_corr_20"]

feature_cols = feature_cols + btc_feature_cols + cross_cols

print("Total features after adding BTC:", len(feature_cols))
feat_df[["date"] + feature_cols[:8]].head()


Total features after adding BTC: 36


Unnamed: 0,date,open,high,low,close,volume,ema_20,ema_50,ratio_close_ema20
0,2017-03-20 00:00:00+00:00,0.000235,0.000239,0.000226,0.000236,339899.0,0.000223,0.000216,1.061008
1,2017-03-21 00:00:00+00:00,0.000227,0.000248,0.000227,0.000246,243988.0,0.000225,0.000218,1.09237
2,2017-03-22 00:00:00+00:00,0.000245,0.000247,0.000235,0.000243,344528.0,0.000227,0.000219,1.073949
3,2017-03-23 00:00:00+00:00,0.000239,0.000245,0.000235,0.000242,195765.0,0.000228,0.00022,1.061283
4,2017-03-24 00:00:00+00:00,0.000242,0.000305,0.000235,0.000264,1776640.0,0.000232,0.000221,1.141983


Notes

• We do not include btc_log_close itself in feature_cols (we use it only to build the cross-asset log_spread_doge_btc).

• All BTC features are time-t values or past-window aggregates—so they’re safe.

• We merged before splitting; that’s fine because no feature uses future data.

## Split by Date (then build labels/windows within each split)

In [11]:
feat_df = feat_df.sort_values('date').reset_index(drop=True)

TEST_END = pd.to_datetime("2025-06-30", utc=True)

train_mask = feat_df['date'] <= pd.to_datetime(TRAIN_END, utc=True)
val_mask   = (feat_df['date'] > pd.to_datetime(TRAIN_END, utc=True)) & \
             (feat_df['date'] <= pd.to_datetime(VAL_END, utc=True))
test_mask  = (feat_df['date'] > pd.to_datetime(VAL_END, utc=True)) & \
             (feat_df['date'] <= TEST_END)

train_df = feat_df.loc[train_mask].reset_index(drop=True)
val_df   = feat_df.loc[val_mask].reset_index(drop=True)
test_df  = feat_df.loc[test_mask].reset_index(drop=True)

# optional: create a holdout for 2025-07-01 .. 2025-08-13
holdout_df = feat_df.loc[feat_df['date'] > TEST_END].reset_index(drop=True)

## Build Targets (next-day log-return) and Window per Split

In [12]:

def make_windows_for_split(split_df, feature_cols, seq_len):
    # y_full: next-day log-return aligned to t (return from t -> t+1)
    log_close = split_df['log_close'].values
    y_full = np.empty_like(log_close)
    y_full[:-1] = log_close[1:] - log_close[:-1]
    y_full[-1] = np.nan  # last has no next day

    X_raw = split_df[feature_cols].values.astype(np.float32)
    close_arr = split_df['close'].values.astype(np.float32)

    xs, ys, last_close, next_close, dates = [], [], [], [], []
    # windows end at t = idx_end, predict return for t -> t+1
    # ensure t+1 exists => idx_end <= len(split_df)-2
    max_end = len(split_df) - 2
    for idx_end in range(seq_len-1, max_end+1):
        i = idx_end - (seq_len - 1)
        xw = X_raw[i: i + seq_len]
        y  = y_full[idx_end]
        if np.isnan(y):
            continue
        xs.append(xw)
        ys.append(y)
        last_close.append(close_arr[idx_end])      # C_t
        next_close.append(close_arr[idx_end + 1])  # C_{t+1}
        dates.append(split_df['date'].iloc[idx_end + 1])  # prediction date (t+1)
    if not xs:
        return (np.empty((0, seq_len, X_raw.shape[1]), dtype=np.float32),
                np.empty((0,), dtype=np.float32),
                np.empty((0,), dtype=np.float32),
                np.empty((0,), dtype=np.float32),
                [])
    return (np.stack(xs).astype(np.float32),
            np.array(ys, dtype=np.float32),
            np.array(last_close, dtype=np.float32),
            np.array(next_close, dtype=np.float32),
            pd.to_datetime(dates))

# Build split arrays
Xtr_raw, ytr, tr_lastC, tr_nextC, tr_dates = make_windows_for_split(train_df, feature_cols, SEQ_LEN)
Xv_raw,  yv,  v_lastC,  v_nextC,  v_dates  = make_windows_for_split(val_df,   feature_cols, SEQ_LEN)
Xte_raw, yte, te_lastC, te_nextC, te_dates = make_windows_for_split(test_df,  feature_cols, SEQ_LEN)

print("Shapes:")
print("  Train:", Xtr_raw.shape, ytr.shape)
print("  Val  :", Xv_raw.shape,  yv.shape)
print("  Test :", Xte_raw.shape, yte.shape)


Shapes:
  Train: (2358, 120, 36) (2358,)
  Val  : (246, 120, 36) (246,)
  Test : (61, 120, 36) (61,)


## Scale Features (fit on Train only) & Build Datasets

In [13]:

if USE_ROBUST_SCALER:
    scaler = RobustScaler()
else:
    scaler = StandardScaler()

# Fit on TRAIN only
n_features = Xtr_raw.shape[-1]
Xtr_flat = Xtr_raw.reshape(-1, n_features)
scaler.fit(Xtr_flat)

# Transform each split
def transform_windows(X_raw, scaler):
    n = X_raw.shape[0]
    T = X_raw.shape[1]
    F = X_raw.shape[2]
    X_flat = X_raw.reshape(-1, F)
    X_scaled = scaler.transform(X_flat).astype(np.float32)
    return X_scaled.reshape(n, T, F)

Xtr = transform_windows(Xtr_raw, scaler)
Xv  = transform_windows(Xv_raw,  scaler)
Xte = transform_windows(Xte_raw, scaler)

# tf.data
train_ds = tf.data.Dataset.from_tensor_slices((Xtr, ytr)).shuffle(4096, seed=SEED).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds   = tf.data.Dataset.from_tensor_slices((Xv,  yv)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_ds  = tf.data.Dataset.from_tensor_slices((Xte, yte)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Persist scaler
import joblib, os
joblib.dump(scaler, os.path.join(SAVE_DIR, "feature_scaler.joblib"))


['/content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/feature_scaler.joblib']

## Transformer Model

In [14]:

class PositionalEmbedding(layers.Layer):
    def __init__(self, seq_len: int, d_model: int, **kwargs):
        super().__init__(**kwargs)
        self.seq_len = seq_len
        self.d_model = d_model

    def build(self, input_shape):
        self.pos_emb = self.add_weight(
            shape=(1, self.seq_len, self.d_model),
            initializer="random_normal",
            trainable=True,
            name="pos_embedding"
        )

    def call(self, x):
        return x + self.pos_emb

def transformer_encoder_block(d_model: int, num_heads: int, ff_dim: int, dropout: float, l2_reg: float):
    inputs = layers.Input(shape=(None, d_model))
    attn_out = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=dropout)(inputs, inputs)
    attn_out = layers.Dropout(dropout)(attn_out)
    x = layers.LayerNormalization(epsilon=1e-6)(inputs + attn_out)

    ff = keras.Sequential([
        layers.Dense(ff_dim, activation="relu", kernel_regularizer=regularizers.l2(l2_reg)),
        layers.Dropout(dropout),
        layers.Dense(d_model, kernel_regularizer=regularizers.l2(l2_reg))
    ])
    ff_out = ff(x)
    ff_out = layers.Dropout(dropout)(ff_out)
    x = layers.LayerNormalization(epsilon=1e-6)(x + ff_out)
    return keras.Model(inputs, x)  # no explicit name to avoid collisions

def build_model(seq_len: int, n_features: int,
                d_model: int = 128, num_heads: int = 4, num_layers: int = 3, ff_dim: int = 256,
                dropout: float = 0.25, l2_reg: float = 1e-4) -> keras.Model:
    inputs = layers.Input(shape=(seq_len, n_features))
    x = layers.Dense(d_model, kernel_regularizer=regularizers.l2(l2_reg))(inputs)
    x = PositionalEmbedding(seq_len, d_model)(x)
    for _ in range(num_layers):
        enc = transformer_encoder_block(d_model, num_heads, ff_dim, dropout, l2_reg)
        x = enc(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(64, activation="relu", kernel_regularizer=regularizers.l2(l2_reg))(x)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, name="log_return_next")(x)

    model = keras.Model(inputs, outputs, name="doge_transformer_returns")
    opt = keras.optimizers.Adam(learning_rate=LEARNING_RATE, clipnorm=CLIPNORM)
    loss = keras.losses.Huber(delta=1.0)  # robust for spikes
    model.compile(optimizer=opt, loss=loss, metrics=[keras.metrics.MeanAbsoluteError(name="mae")])
    return model

model = build_model(SEQ_LEN, n_features=Xtr.shape[-1],
                    d_model=D_MODEL, num_heads=NUM_HEADS, num_layers=NUM_LAYERS,
                    ff_dim=FF_DIM, dropout=DROPOUT_RATE, l2_reg=L2_REG)
model.summary()


## Train

In [15]:

log_dir = os.path.join(SAVE_DIR, "tb_logs")
ckpt_path = os.path.join(SAVE_DIR, "best.weights.h5")  # weights-only to avoid format issues

callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=max(4, PATIENCE//3)),
    keras.callbacks.ModelCheckpoint(ckpt_path, monitor="val_loss", save_best_only=True, save_weights_only=True),
    keras.callbacks.TensorBoard(log_dir=log_dir)
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 278ms/step - loss: 0.1410 - mae: 0.2678 - val_loss: 0.0646 - val_mae: 0.0366 - learning_rate: 0.0010
Epoch 2/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 14ms/step - loss: 0.0680 - mae: 0.0585 - val_loss: 0.0623 - val_mae: 0.0361 - learning_rate: 0.0010
Epoch 3/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0642 - mae: 0.0478 - val_loss: 0.0595 - val_mae: 0.0360 - learning_rate: 0.0010
Epoch 4/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0612 - mae: 0.0429 - val_loss: 0.0564 - val_mae: 0.0360 - learning_rate: 0.0010
Epoch 5/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0569 - mae: 0.0408 - val_loss: 0.0531 - val_mae: 0.0359 - learning_rate: 0.0010
Epoch 6/120
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 0.0537 - mae: 0.0407 - v

## Evaluation — Reconstruct Price, Baselines, sMAPE & Directional Accuracy

In [16]:

# Predict returns for test windows
y_pred_ret = model.predict(test_ds, verbose=0).flatten()

# Reconstruct price for each prediction:
# Use the last close in each window as the anchor: C_{t+1} = C_t * exp(pred_ret)
pred_close = te_lastC * np.exp(y_pred_ret)
actual_close = te_nextC  # true C_{t+1}

# Baselines (price space)
naive_pred = te_lastC  # predict tomorrow equals today's
ema_span = 10
ema_series = pd.Series(test_df['close'].values).ewm(span=ema_span, adjust=False).mean().values
# align EMA: use EMA at time t (window end) as forecast for t+1
ema_pred = ema_series[SEQ_LEN-1 : len(ema_series)-1]

# Metrics
def smape(y_true, y_pred):
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-9))

def direction_accuracy(true_rets, pred_rets):
    return np.mean(np.sign(true_rets) == np.sign(pred_rets)) * 100.0

results = {
    "MODEL": {
        "MAE": float(mean_absolute_error(actual_close, pred_close)),
        "RMSE": float(np.sqrt(mean_squared_error(actual_close, pred_close))),
        "sMAPE": float(smape(actual_close, pred_close)),
        "Direction_Accuracy_%": float(direction_accuracy(yte, y_pred_ret)),
    },
    "NAIVE": {
        "MAE": float(mean_absolute_error(actual_close, naive_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(actual_close, naive_pred))),
        "sMAPE": float(smape(actual_close, naive_pred)),
    },
    "EMA10": {
        "MAE": float(mean_absolute_error(actual_close, ema_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(actual_close, ema_pred))),
        "sMAPE": float(smape(actual_close, ema_pred)),
    }
}

print(json.dumps(results, indent=2))

# Save predictions
eval_df = pd.DataFrame({
    "date": pd.to_datetime(te_dates),
    "actual_close": actual_close,
    "pred_close": pred_close,
    "naive_close": naive_pred,
    "ema10_close": ema_pred,
    "pred_ret": y_pred_ret,
    "actual_ret": yte
})
eval_path = os.path.join(SAVE_DIR, "test_predictions_2025H1.csv")
eval_df.to_csv(eval_path, index=False)
print("Saved predictions to:", eval_path)

# Overlay plot
plt.figure(figsize=(12,5))
plt.plot(eval_df['date'], eval_df['actual_close'], label="Actual")
plt.plot(eval_df['date'], eval_df['naive_close'], label="Naïve")
plt.plot(eval_df['date'], eval_df['ema10_close'], label="EMA(10)")
plt.plot(eval_df['date'], eval_df['pred_close'], label="Transformer Pred", linewidth=2.2, zorder=3, linestyle="--")
plt.title("DOGE-USD (2025H1): Actual vs Forecasts")
plt.xlabel("Date"); plt.ylabel("Price (USD)")
plt.legend(); plt.tight_layout()
plot1 = os.path.join(SAVE_DIR, "overlay_2025H1.png")
plt.savefig(plot1, dpi=150); plt.close()

# Scatter
plt.figure(figsize=(6,6))
plt.scatter(eval_df['actual_close'], eval_df['pred_close'], s=8)
mn = min(eval_df['actual_close'].min(), eval_df['pred_close'].min())
mx = max(eval_df['actual_close'].max(), eval_df['pred_close'].max())
plt.plot([mn, mx], [mn, mx], linestyle="--")
plt.title("Predicted vs Actual (Price) — 2025H1")
plt.xlabel("Actual"); plt.ylabel("Predicted")
plt.tight_layout()
plot2 = os.path.join(SAVE_DIR, "scatter_2025H1.png")
plt.savefig(plot2, dpi=150); plt.close()

print("Saved plots to:", plot1, "and", plot2)


{
  "MODEL": {
    "MAE": 0.006630873307585716,
    "RMSE": 0.010066989697755293,
    "sMAPE": 3.3202781677246094,
    "Direction_Accuracy_%": 47.540983606557376
  },
  "NAIVE": {
    "MAE": 0.006587877403944731,
    "RMSE": 0.010046033314205243,
    "sMAPE": 3.3008363246917725
  },
  "EMA10": {
    "MAE": 0.011718484751365723,
    "RMSE": 0.016530410055939568,
    "sMAPE": 5.936707253085462
  }
}
Saved predictions to: /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/test_predictions_2025H1.csv
Saved plots to: /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/overlay_2025H1.png and /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/scatter_2025H1.png


## Holdout Evaluation

In [18]:
# === Holdout Evaluation: 2025-07-01 → 2025-08-13 ===

# 1) Ensure TEST_END and holdout_df exist (cap test at 2025-06-30, holdout = after that)
try:
    TEST_END
except NameError:
    TEST_END = pd.to_datetime("2025-06-30", utc=True)

if 'holdout_df' not in globals():
    holdout_df = feat_df.loc[feat_df['date'] > TEST_END].reset_index(drop=True)

holdout_start = holdout_df['date'].min() if len(holdout_df) else None
if (holdout_start is None) or (len(holdout_df) == 0):
    print("No holdout rows found after TEST_END. Nothing to evaluate.")
else:
    # 2) Build a context tail (last SEQ_LEN rows before holdout start) so windows have history
    context_tail = feat_df.loc[feat_df['date'] <= TEST_END].tail(SEQ_LEN).reset_index(drop=True)
    concat_df = pd.concat([context_tail, holdout_df], ignore_index=True)

    # 3) Reuse the same windowing function (predict next-day log-return) on the concatenated df
    Xho_raw, yho_ret, ho_lastC, ho_nextC, ho_dates = make_windows_for_split(concat_df, feature_cols, SEQ_LEN)

    # Keep only predictions whose date lies inside the actual holdout range
    mask_keep = (ho_dates >= holdout_start)
    if mask_keep.sum() == 0:
        print("Not enough history to create holdout windows. Try reducing SEQ_LEN or extending context.")
    else:
        Xho_raw   = Xho_raw[mask_keep]
        yho_ret   = yho_ret[mask_keep]
        ho_lastC  = ho_lastC[mask_keep]
        ho_nextC  = ho_nextC[mask_keep]
        ho_dates  = ho_dates[mask_keep]

        # 4) Scale features with the TRAIN-FIT scaler and build dataset
        Xho = transform_windows(Xho_raw, scaler)
        ho_ds = tf.data.Dataset.from_tensor_slices((Xho, yho_ret)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

        # 5) Predict returns, reconstruct prices for t+1:  Ĉ_{t+1} = C_t * exp( r̂_{t+1} )
        y_pred_ret_ho = model.predict(ho_ds, verbose=0).flatten()
        pred_close_ho = ho_lastC * np.exp(y_pred_ret_ho)
        actual_close_ho = ho_nextC

        # 6) Baselines (price space)
        # Naïve: predict tomorrow equals today's
        naive_pred_ho = ho_lastC

        # EMA(10): compute on concat_df so it has context; use EMA at time t to predict t+1
        ema_span = 10
        ctx = concat_df[['date','close']].copy()
        ctx['ema10'] = ctx['close'].ewm(span=ema_span, adjust=False).mean()
        # Build a mapping of prediction-date -> EMA_at_t
        ema_pred_map = pd.DataFrame({
            'pred_date': ctx['date'].iloc[1:].to_numpy(),        # t+1
            'ema10_pred': ctx['ema10'].iloc[:-1].to_numpy()      # EMA at t
        })
        ho_eval_df = pd.DataFrame({'date': ho_dates})
        ho_eval_df = ho_eval_df.merge(ema_pred_map, left_on='date', right_on='pred_date', how='left')
        ema_pred_ho = ho_eval_df['ema10_pred'].to_numpy()

        # 7) Metrics
        def smape(y_true, y_pred):
            return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-9))

        def direction_accuracy(true_rets, pred_rets):
            return np.mean(np.sign(true_rets) == np.sign(pred_rets)) * 100.0

        from sklearn.metrics import mean_absolute_error, mean_squared_error
        results_holdout = {
            "MODEL": {
                "MAE": float(mean_absolute_error(actual_close_ho, pred_close_ho)),
                "RMSE": float(np.sqrt(mean_squared_error(actual_close_ho, pred_close_ho))),
                "sMAPE": float(smape(actual_close_ho, pred_close_ho)),
                "Direction_Accuracy_%": float(direction_accuracy(yho_ret, y_pred_ret_ho)),
            },
            "NAIVE": {
                "MAE": float(mean_absolute_error(actual_close_ho, naive_pred_ho)),
                "RMSE": float(np.sqrt(mean_squared_error(actual_close_ho, naive_pred_ho))),
                "sMAPE": float(smape(actual_close_ho, naive_pred_ho)),
            },
            "EMA10": {
                "MAE": float(mean_absolute_error(actual_close_ho, ema_pred_ho)),
                "RMSE": float(np.sqrt(mean_squared_error(actual_close_ho, ema_pred_ho))),
                "sMAPE": float(smape(actual_close_ho, ema_pred_ho)),
            }
        }
        print(json.dumps(results_holdout, indent=2))

        # 8) Save predictions & plots
        out_df = pd.DataFrame({
            "date": ho_dates,
            "actual_close": actual_close_ho,
            "pred_close": pred_close_ho,
            "naive_close": naive_pred_ho,
            "ema10_close": ema_pred_ho,
            "pred_ret": y_pred_ret_ho,
            "actual_ret": yho_ret
        })
        out_csv = os.path.join(SAVE_DIR, "holdout_predictions_2025Q3.csv")
        out_df.to_csv(out_csv, index=False)
        print("Saved holdout predictions to:", out_csv)

        # Overlay
        plt.figure(figsize=(12,5))
        plt.plot(out_df['date'], out_df['actual_close'], label="Actual")
        plt.plot(out_df['date'], out_df['pred_close'], label="Transformer Pred", linewidth=2.2, zorder=3, linestyle="--")
        plt.plot(out_df['date'], out_df['naive_close'], label="Naïve")
        plt.plot(out_df['date'], out_df['ema10_close'], label="EMA(10)")
        plt.title("DOGE-USD Holdout (2025-07-01 → 2025-08-13): Actual vs Forecasts")
        plt.xlabel("Date"); plt.ylabel("Price (USD)")
        plt.legend(); plt.tight_layout()
        p1 = os.path.join(SAVE_DIR, "overlay_holdout_2025Q3.png")
        plt.savefig(p1, dpi=150); plt.close()

        # Scatter
        plt.figure(figsize=(6,6))
        plt.scatter(out_df['actual_close'], out_df['pred_close'], s=8)
        mn = min(out_df['actual_close'].min(), out_df['pred_close'].min())
        mx = max(out_df['actual_close'].max(), out_df['pred_close'].max())
        plt.plot([mn, mx], [mn, mx], linestyle="--")
        plt.title("Predicted vs Actual (Price) — Holdout 2025Q3")
        plt.xlabel("Actual"); plt.ylabel("Predicted")
        plt.tight_layout()
        p2 = os.path.join(SAVE_DIR, "scatter_holdout_2025Q3.png")
        plt.savefig(p2, dpi=150); plt.close()

        print("Saved holdout plots to:", p1, "and", p2)

{
  "MODEL": {
    "MAE": 0.008413519710302353,
    "RMSE": 0.010660383604939143,
    "sMAPE": 3.9474434852600098,
    "Direction_Accuracy_%": 56.81818181818182
  },
  "NAIVE": {
    "MAE": 0.008450660854578018,
    "RMSE": 0.01071009087675045,
    "sMAPE": 3.9740257263183594
  },
  "EMA10": {
    "MAE": 0.016006066728409808,
    "RMSE": 0.02085956343079419,
    "sMAPE": 7.493838818513976
  }
}
Saved holdout predictions to: /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/holdout_predictions_2025Q3.csv
Saved holdout plots to: /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/overlay_holdout_2025Q3.png and /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer/scatter_holdout_2025Q3.png


## Save Artifacts & Notes

In [21]:

meta = {
    "generated_at": str(datetime.utcnow()),
    "data": {"symbol": SYMBOL, "start": START_DATE, "end": END_DATE},
    "splits": {"train_end": TRAIN_END, "val_end": VAL_END, "seq_len": SEQ_LEN},
    "features": feature_cols,
    "model_config": {
        "d_model": D_MODEL, "num_heads": NUM_HEADS, "num_layers": NUM_LAYERS,
        "ff_dim": FF_DIM, "dropout": DROPOUT_RATE, "l2_reg": L2_REG
    },
    "training": {
        "epochs": EPOCHS, "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE, "clipnorm": CLIPNORM, "patience": PATIENCE
    }
}
with open(os.path.join(SAVE_DIR, "metadata.json"), "w") as f:
    json.dump(meta, f, indent=2)

print("Artifacts directory:", SAVE_DIR)
print("Done.")


Artifacts directory: /content/drive/MyDrive/doge_forecast/artifacts_doge_returns_transformer
Done.
