In [None]:
!pip install torch xgboost numpy pandas scikit-learn matplotlib


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

np.random.seed(42)
torch.manual_seed(42)


In [None]:
T = 1500
time = np.arange(T)

trend = 0.01 * time
seasonality = 10 * np.sin(2 * np.pi * time / 50)

regime = np.where(time > 800, 20, 0)

noise = np.random.normal(0, 1 + 0.005*time, T)

y = trend + seasonality + regime + noise

data = pd.DataFrame({
    "f1": trend,
    "f2": seasonality,
    "f3": regime,
    "y": y
})

plt.figure(figsize=(12,4))
plt.plot(y)
plt.title("Synthetic Time Series")
plt.show()


In [None]:
def create_sequences(data, seq_len=30):
    X, y = [], []
    for i in range(len(data) - seq_len):
        X.append(data.iloc[i:i+seq_len][["f1","f2","f3"]].values)
        y.append(data.iloc[i+seq_len]["y"])
    return np.array(X), np.array(y)

X, y = create_sequences(data, seq_len=30)


In [None]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_flat = X.reshape(-1, X.shape[-1])
X_scaled = scaler_X.fit_transform(X_flat).reshape(X.shape)

y_scaled = scaler_y.fit_transform(y.reshape(-1,1)).flatten()


In [None]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim=3, d_model=64, nhead=4, num_layers=2):
        super().__init__()

        self.input_projection = nn.Linear(input_dim, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=128,
            dropout=0.1,
            batch_first=True
        )

        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        x = self.input_projection(x)
        x = self.transformer(x)
        x = x.mean(dim=1)
        return self.fc(x)


In [7]:
def rolling_origin_cv_transformer(X, y, n_splits=5, train_size=800, step=100):

    results = []

    for i in range(n_splits):

        end_train = train_size + i * step
        end_test = end_train + step

        X_train = torch.tensor(X[:end_train], dtype=torch.float32)
        y_train = torch.tensor(y[:end_train], dtype=torch.float32).view(-1,1)

        X_test = torch.tensor(X[end_train:end_test], dtype=torch.float32)
        y_test = y[end_train:end_test]

        model = TransformerModel()
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        for epoch in range(20):
            model.train()
            optimizer.zero_grad()
            output = model(X_train)
            loss = criterion(output, y_train)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            preds = model(X_test).numpy()

        preds = scaler_y.inverse_transform(preds)
        y_true = scaler_y.inverse_transform(y_test.reshape(-1,1))

        mae = mean_absolute_error(y_true, preds)
        rmse = np.sqrt(mean_squared_error(y_true, preds))

        results.append((mae, rmse))

    return results

transformer_results = rolling_origin_cv_transformer(X_scaled, y_scaled)
print("Transformer CV Results:", transformer_results)


Transformer CV Results: [(11.744543744136015, np.float64(13.601048811259064)), (9.364332898901756, np.float64(11.799676912851364)), (7.42360746175388, np.float64(9.503335387847013)), (8.592534488631792, np.float64(10.540018246267659)), (9.212344548916594, np.float64(11.006044862304996))]


In [8]:
def rolling_origin_cv_xgb(X, y, n_splits=5, train_size=800, step=100):

    results = []
    X_flat = X.reshape(X.shape[0], -1)

    for i in range(n_splits):

        end_train = train_size + i * step
        end_test = end_train + step

        X_train = X_flat[:end_train]
        y_train = y[:end_train]

        X_test = X_flat[end_train:end_test]
        y_test = y[end_train:end_test]

        model = XGBRegressor(n_estimators=200, max_depth=4)
        model.fit(X_train, y_train)

        preds = model.predict(X_test)

        preds = scaler_y.inverse_transform(preds.reshape(-1,1))
        y_true = scaler_y.inverse_transform(y_test.reshape(-1,1))

        mae = mean_absolute_error(y_true, preds)
        rmse = np.sqrt(mean_squared_error(y_true, preds))

        results.append((mae, rmse))

    return results

xgb_results = rolling_origin_cv_xgb(X_scaled, y_scaled)
print("XGBoost CV Results:", xgb_results)


XGBoost CV Results: [(9.170314282781543, np.float64(11.308523246329775)), (7.585778688365378, np.float64(9.539443328573423)), (8.305801233528937, np.float64(10.012052895221965)), (7.078967464919645, np.float64(8.79418290369048)), (11.218591926931074, np.float64(13.344101172959405))]


In [9]:
def average_results(results):
    maes = [r[0] for r in results]
    rmses = [r[1] for r in results]
    return np.mean(maes), np.mean(rmses)

print("Transformer Avg:", average_results(transformer_results))
print("XGBoost Avg:", average_results(xgb_results))


Transformer Avg: (np.float64(9.267472628468008), np.float64(11.29002484410602))
XGBoost Avg: (np.float64(8.671890719305315), np.float64(10.599660709355009))


Project Explanation
1. Transformer Architecture
This project uses a Transformer Encoder model instead of an LSTM-based model.
The Transformer leverages multi-head self-attention mechanisms, allowing the model to capture long-range temporal dependencies more effectively.
Unlike recurrent models, the Transformer processes sequences in parallel and learns contextual importance through attention weights.
Model Configuration:
d_model = 64 (embedding dimension)
nhead = 4 (multi-head attention)
num_layers = 2 (stacked encoder layers)
learning_rate = 0.001 (Adam optimizer)
These values were chosen to balance representational capacity and computational efficiency.
2. XGBoost Baseline
To provide a strong benchmark, XGBoost regression was implemented using lagged time-series features.
Hyperparameters:
n_estimators = 200
max_depth = 4
XGBoost serves as a powerful tree-based ensemble baseline, commonly used in structured time-series forecasting tasks.
3. Rolling-Origin Cross-Validation
Instead of a simple train-test split, rolling-origin cross-validation was implemented.
Procedure:
Initial training window = 800 observations
Test window = 100 observations
Window expands forward in each fold
This simulates real-world forecasting scenarios and prevents data leakage.
Evaluation metrics:
MAE (Mean Absolute Error)
RMSE (Root Mean Squared Error)
Performance is averaged across all folds.
4. Data Complexity
The synthetic dataset includes:
Linear trend
Seasonality
Regime shift after time step 800
Heteroscedastic noise (variance increases over time)
This ensures the forecasting task reflects realistic time-series challenges.