# Hull Tactical - Training Notebook v4

**CRITICAL**: This notebook actually TRAINS models. Previous versions just loaded frozen artifacts.

Changes from v3:
- Position sizing: More aggressive (scale_factor 120 vs 80, bounds [0.0, 2.0] vs [0.2, 1.8])
- Risk aversion: Lower (35 vs 50) - take bigger bets when confident
- Feature engineering: PROMETHEUS features included in training
- Model hyperparameters: Deeper trees, more iterations

In [None]:
import os
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import TimeSeriesSplit

# Silence library warnings and pin NumPy's global RNG so reruns are repeatable.
warnings.filterwarnings('ignore')
np.random.seed(42)

# Paths: use the Kaggle mount when it is present, otherwise the local checkout.
_kaggle_input = '/kaggle/input/hull-tactical-market-prediction'
if not os.path.exists(_kaggle_input):
    _local_root = Path('/home/user/aimo3/hull')
    DATA_DIR = _local_root / 'hull-tactical-market-prediction'
    ARTIFACTS_DIR = _local_root / 'artifacts_v4'
else:
    DATA_DIR = Path(_kaggle_input)
    ARTIFACTS_DIR = Path('/kaggle/working/artifacts_v4')

# Create the output directory (and any missing parents) before anything is saved.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
for _label, _location in (("Data", DATA_DIR), ("Artifacts", ARTIFACTS_DIR)):
    print(f"{_label}: {_location}")

In [None]:
# ============================================================================
# V4 CONFIG - MORE AGGRESSIVE THAN V3
# ============================================================================

# LightGBM booster parameters - DEEPER TREES
_LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'mse',
    'boosting_type': 'gbdt',
    'num_leaves': 63,       # Was 31
    'max_depth': 8,         # Added constraint
    'learning_rate': 0.02,  # Was 0.01
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'min_child_samples': 50,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
    'seed': 42,
    'n_jobs': -1,
}

# XGBoost booster parameters
_XGB_PARAMS = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 6,
    'learning_rate': 0.02,  # Was 0.01
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 50,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'seed': 42,
    'n_jobs': -1,
    'verbosity': 0,
}

# Single source of truth for position sizing and training settings.
CONFIG = {
    # Position sizing - MORE AGGRESSIVE
    'base_position': 1.0,
    'risk_aversion': 35.0,      # Was 50 - lower = bigger bets
    'scale_factor': 120.0,      # Was 80 - higher = more responsive
    'min_position': 0.0,        # Was 0.2 - allow going flat
    'max_position': 2.0,        # Was 1.8 - allow more leverage

    # Model params
    'lgb_params': _LGB_PARAMS,
    'xgb_params': _XGB_PARAMS,
    'n_models': 5,
    'num_boost_round': 1500,    # Was 1000
    'early_stopping': 100,      # Was 50
}

# Echo the position-sizing knobs so the run log records which config was used.
_pos_lo, _pos_hi = CONFIG['min_position'], CONFIG['max_position']
print("V4 Config loaded:")
print(f"  Position range: [{_pos_lo}, {_pos_hi}]")
print(f"  Risk aversion: {CONFIG['risk_aversion']}")
print(f"  Scale factor: {CONFIG['scale_factor']}")

In [None]:
# Read the training table and print a quick sanity summary of its span and target.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
_date_ids = train_df['date_id']
_target = train_df['market_forward_excess_returns']

print(f"Train shape: {train_df.shape}")
print(f"Date range: {_date_ids.min()} - {_date_ids.max()}")
print(f"\nTarget (market_forward_excess_returns):")
print(_target.describe())

In [None]:
# ============================================================================
# PROMETHEUS FEATURE ENGINEERING - NOW USED IN TRAINING
# ============================================================================

def compute_autocorrelation(x, tau):
    """Return the sample autocorrelation of ``x`` at lag ``tau``.

    The series is mean-centred and the lagged cross-product is divided by the
    total sum of squares of the whole series.

    Args:
        x: One-dimensional array-like of numeric values.
        tau: Integer lag. Autocorrelation is symmetric in the lag, so a
            negative value is treated as its absolute value (the previous
            slicing produced mismatched segments for negative lags).

    Returns:
        The autocorrelation as a float, or ``0.0`` when the series has fewer
        than 10 values, fewer than 5 overlapping pairs remain at this lag, or
        the series is (numerically) constant.
    """
    values = np.asarray(x, dtype=float)
    lag = abs(tau)
    size = len(values)

    if size < 10 or lag >= size:
        return 0.0

    overlap = size - lag
    if overlap < 5:
        # Too few pairs for the estimate to mean anything.
        return 0.0

    centered = values - values.mean()
    denom = np.sum(centered ** 2)
    if denom < 1e-10:
        # Constant series: correlation is undefined, report "no signal".
        return 0.0

    numer = np.sum(centered[:overlap] * centered[lag:])
    return numer / denom


def _lag1_autocorr(window):
    """Return the lag-1 Pearson correlation of one raw rolling window.

    Gives 0.0 for windows shorter than 10 values or when NumPy rejects the
    input. Any NaN produced by ``np.corrcoef`` is passed through so the caller
    decides how to fill it.
    """
    if len(window) < 10:
        return 0.0
    try:
        return np.corrcoef(window[:-1], window[1:])[0, 1]
    except (ValueError, FloatingPointError):
        return 0.0


def add_prometheus_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add PROMETHEUS features - these will be TRAINED into models.

    Works on a copy of ``df`` (the input is not modified) and appends:

    * rolling mean/std (5/21/63 rows) for the key columns that are present;
    * ``lagged_ret`` plus rolling cumulative return, volatility and a
      Sharpe-like ratio over 5/10/21/63 rows;
    * tier 1: variance-compression and lag-1 autocorrelation indicators;
    * tier 2: cross-sectional "temperature", order parameters and a
      volatility regime built from the ``V*``/``M*``/``S*`` column families;
    * tier 3: sentiment/volatility interaction, momentum strength, economic
      surprise and rate-slope features;
    * tier 4: rolling cross-domain correlation between family means.

    The set of output columns depends only on which input columns exist, not
    on the number of rows: rolling features use ``min_periods=1`` and the
    long-window tiers fall back to constants on short frames. This lets the
    same feature list be selected from a short inference frame.

    Args:
        df: Frame ordered by time (rolling windows are positional). Expected
            to carry ``lagged_forward_returns`` or ``forward_returns``; when
            neither exists ``lagged_ret`` is set to 0.

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df.copy()

    # === BASIC ROLLING FEATURES ===
    key_cols = ['V1', 'V2', 'V3', 'M1', 'M2', 'S1', 'S2', 'E1', 'P1', 'I1']
    key_cols = [c for c in key_cols if c in df.columns]

    for col in key_cols:
        for window in [5, 21, 63]:
            # min_periods=1 already copes with short history, so no length
            # guard is needed (a guard would silently drop columns).
            rolled = df[col].rolling(window, min_periods=1)
            df[f'{col}_ma{window}'] = rolled.mean()
            df[f'{col}_std{window}'] = rolled.std().fillna(0)

    # === LAGGED RETURNS ===
    if 'lagged_forward_returns' in df.columns:
        df['lagged_ret'] = df['lagged_forward_returns']
    elif 'forward_returns' in df.columns:
        # Shift by one row so each row only sees the previous row's return.
        df['lagged_ret'] = df['forward_returns'].shift(1).fillna(0)
    else:
        df['lagged_ret'] = 0

    for w in [5, 10, 21, 63]:
        rolled_ret = df['lagged_ret'].rolling(w, min_periods=1)
        df[f'ret_cumsum_{w}'] = rolled_ret.sum()
        df[f'ret_vol_{w}'] = rolled_ret.std().fillna(0)
        # Sharpe-like ratio
        df[f'sharpe_{w}'] = df[f'ret_cumsum_{w}'] / (df[f'ret_vol_{w}'] * np.sqrt(w) + 1e-8)

    # === TIER 1: PHASE TRANSITION DETECTION ===

    # 1. Variance Compression: short-window variance relative to long-window.
    if len(df) >= 63:
        df['var_21'] = df['lagged_ret'].rolling(21, min_periods=5).var().fillna(0)
        df['var_63'] = df['lagged_ret'].rolling(63, min_periods=10).var().fillna(0)
        df['var_ratio'] = df['var_21'] / (df['var_63'] + 1e-8)
        df['var_compression'] = (df['var_ratio'] < 0.5).astype(float)
    else:
        df['var_21'] = df['var_63'] = df['var_ratio'] = 0
        df['var_compression'] = 0

    # 2. Critical Slowing Down (AC1)
    if len(df) >= 63:
        df['ac1'] = (
            df['lagged_ret']
            .rolling(63, min_periods=10)
            .apply(_lag1_autocorr, raw=True)
            .fillna(0)
        )
        df['ac1_ma21'] = df['ac1'].rolling(21, min_periods=1).mean()
        df['ac1_rising'] = (df['ac1'] > df['ac1_ma21']).astype(float)
    else:
        df['ac1'] = df['ac1_ma21'] = df['ac1_rising'] = 0

    # === TIER 2: MARKET TEMPERATURE & COHERENCE ===
    # Families are the raw "<letter><digits>" columns only; engineered columns
    # such as "V1_ma5" fail the isdigit() test and are excluded.
    v_cols = [c for c in df.columns if c.startswith('V') and c[1:].isdigit()]
    m_cols = [c for c in df.columns if c.startswith('M') and c[1:].isdigit()]
    s_cols = [c for c in df.columns if c.startswith('S') and c[1:].isdigit()]
    e_cols = [c for c in df.columns if c.startswith('E') and c[1:].isdigit()]
    i_cols = [c for c in df.columns if c.startswith('I') and c[1:].isdigit()]

    # Temperature: row-wise dispersion of the V family relative to its mean.
    if v_cols:
        df['v_mean'] = df[v_cols].mean(axis=1)
        df['v_std'] = df[v_cols].std(axis=1)
        df['temperature'] = df['v_std'] / (df['v_mean'].abs() + 1e-8)
    else:
        df['v_mean'] = df['v_std'] = df['temperature'] = 0

    # Order Parameters
    for prefix, cols in [('V', v_cols), ('M', m_cols), ('S', s_cols)]:
        if len(cols) >= 2:
            # NOTE(review): z-scores use whole-frame column mean/std, so each
            # row's value depends on every other row in the frame, including
            # later ones - confirm this is acceptable for training/inference.
            zscores = (df[cols] - df[cols].mean()) / (df[cols].std() + 1e-8)
            df[f'{prefix}_order'] = 1 - zscores.var(axis=1).fillna(1)
        else:
            df[f'{prefix}_order'] = 0

    # Volatility Regime
    if v_cols:
        df['vol_regime'] = df[v_cols].mean(axis=1)
        df['vol_regime_ma21'] = df['vol_regime'].rolling(21, min_periods=1).mean()
        df['vol_expanding'] = (df['vol_regime'] > df['vol_regime_ma21']).astype(float)
    else:
        df['vol_regime'] = df['vol_regime_ma21'] = df['vol_expanding'] = 0

    # === TIER 3: FEATURE INTERACTIONS ===

    # Sentiment mean
    if s_cols:
        df['sent_mean'] = df[s_cols].mean(axis=1)
        df['sent_vol_interact'] = df['sent_mean'] / (df['vol_regime'] + 1e-8)
    else:
        df['sent_mean'] = df['sent_vol_interact'] = 0

    # Momentum strength ('ret_cumsum_21' is always created above).
    df['cum_ret_std'] = df['ret_cumsum_21'].rolling(63, min_periods=1).std().fillna(0)
    df['momentum_strong'] = (df['ret_cumsum_21'].abs() > df['cum_ret_std']).astype(float)

    # Economic surprise
    if e_cols:
        df['econ_mean'] = df[e_cols].mean(axis=1)
        df['econ_momentum'] = df['econ_mean'].diff(5).fillna(0)
        df['econ_surprise'] = df['econ_momentum'] - df['econ_momentum'].rolling(63, min_periods=1).mean()
    else:
        df['econ_mean'] = df['econ_momentum'] = df['econ_surprise'] = 0

    # Interest rate regime
    if len(i_cols) >= 3 and 'I3' in df.columns and 'I1' in df.columns:
        df['rate_slope'] = df['I3'] - df['I1']
        df['rate_slope_pct'] = df['rate_slope'].rolling(63, min_periods=1).rank(pct=True).fillna(0.5)
        df['rate_inverting'] = (df['rate_slope_pct'] < 0.1).astype(float)
    else:
        df['rate_slope'] = 0
        df['rate_slope_pct'] = 0.5
        df['rate_inverting'] = 0

    # === TIER 4: CROSS-DOMAIN CORRELATION ===
    if v_cols and m_cols and s_cols and len(df) >= 21:
        vm = df[v_cols].mean(axis=1)
        mm = df[m_cols].mean(axis=1)
        sm = df[s_cols].mean(axis=1)

        corr_vm = vm.rolling(21, min_periods=5).corr(mm).fillna(0)
        corr_vs = vm.rolling(21, min_periods=5).corr(sm).fillna(0)
        corr_ms = mm.rolling(21, min_periods=5).corr(sm).fillna(0)

        # Average absolute pairwise correlation between the family means.
        avg_corr = (abs(corr_vm) + abs(corr_vs) + abs(corr_ms)) / 3
        df['cross_domain_corr'] = avg_corr
        df['correlation_surge'] = (avg_corr > 0.7).astype(float)
    else:
        df['cross_domain_corr'] = 0
        df['correlation_surge'] = 0

    return df

print("PROMETHEUS feature engineering defined.")

In [None]:
# Build the engineered frame: order rows by date, renumber, then add features.
print("Applying feature engineering...")
df = add_prometheus_features(
    train_df.sort_values('date_id').reset_index(drop=True)
)

# Columns that must never be fed to the models: identifiers, targets and the
# raw return columns the engineered features were derived from.
exclude_cols = [
    'date_id',
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
    'is_scored',
    'lagged_forward_returns',
    'lagged_risk_free_rate',
    'lagged_market_forward_excess_returns',
    'lagged_ret',
]

# Keep every remaining column whose dtype is one of these numeric dtypes.
_numeric_dtypes = ['float64', 'int64', 'float32', 'int32']
feature_cols = []
for _name in df.columns:
    if _name in exclude_cols:
        continue
    if df[_name].dtype in _numeric_dtypes:
        feature_cols.append(_name)

print(f"Total features: {len(feature_cols)}")
print(f"\nNew PROMETHEUS features:")

# Reporting only: pick out engineered features by name fragment.
_markers = ['var_', 'ac1', 'temperature', 'order', 'vol_regime', 'sent_',
            'momentum_', 'econ_', 'rate_', 'cross_domain', 'correlation_', 'sharpe_']
prometheus_feats = []
for _name in feature_cols:
    if any(_marker in _name for _marker in _markers):
        prometheus_feats.append(_name)

print(f"  Count: {len(prometheus_feats)}")
print(f"  Examples: {prometheus_feats[:10]}")

In [None]:
# Keep only rows that actually have a target value.
_has_target = df['market_forward_excess_returns'].notna()
df_valid = df.loc[_has_target].copy()
print(f"Valid samples: {len(df_valid)}")

# Model inputs (missing feature values become 0) and the regression target.
X = df_valid[feature_cols].fillna(0)
y = df_valid['market_forward_excess_returns']

# Robust-scale the features; the fitted scaler is kept so it can be saved.
# NOTE(review): the scaler is fitted on all rows, including those a later
# cell holds out for validation - confirm this is intended.
scaler = RobustScaler().fit(X)
X_scaled = pd.DataFrame(
    scaler.transform(X),
    index=X.index,
    columns=feature_cols,
)

print(f"X shape: {X_scaled.shape}")
print(f"y shape: {y.shape}")

In [None]:
# ============================================================================
# TRAIN MODELS ON A CHRONOLOGICAL HOLDOUT SPLIT
# ============================================================================
# Note: this is a single time-ordered 80/20 split, not k-fold time-series CV.
# The held-out tail also drives early stopping, so metrics computed on it are
# optimistic.

# Use last 20% for final validation, train on first 80%
val_size = int(len(X_scaled) * 0.2)
if val_size == 0:
    # With val_size == 0, `iloc[:-0]` is empty and `iloc[-0:]` is everything,
    # which would silently train on nothing and validate on all rows.
    raise ValueError(
        f"Need at least 5 labelled rows for an 80/20 split, got {len(X_scaled)}"
    )

X_train = X_scaled.iloc[:-val_size]
y_train = y.iloc[:-val_size]
X_val = X_scaled.iloc[-val_size:]
y_val = y.iloc[-val_size:]

print(f"Train: {len(X_train)}, Val: {len(X_val)}")

# Train multiple models with different seeds
lgb_models = []
xgb_models = []

# The XGBoost matrices do not depend on the seed, so build them once.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

for seed in range(CONFIG['n_models']):
    print(f"\nTraining model {seed+1}/{CONFIG['n_models']}...")

    # LightGBM: copy the shared params so the per-model seed does not leak
    # back into CONFIG.
    lgb_params = CONFIG['lgb_params'].copy()
    lgb_params['seed'] = 42 + seed

    # Rebuilt per model as before.
    # NOTE(review): reusing one lgb.Dataset across runs whose params differ
    # may not be safe - confirm before hoisting these out of the loop.
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    lgb_model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=CONFIG['num_boost_round'],
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(CONFIG['early_stopping'], verbose=False)]
    )
    lgb_models.append(lgb_model)
    print(f"  LGB best iter: {lgb_model.best_iteration}")

    # XGBoost: same seed offset as the LightGBM model of this round.
    xgb_params = CONFIG['xgb_params'].copy()
    xgb_params['seed'] = 42 + seed

    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=CONFIG['num_boost_round'],
        evals=[(dval, 'val')],
        early_stopping_rounds=CONFIG['early_stopping'],
        verbose_eval=False
    )
    xgb_models.append(xgb_model)
    print(f"  XGB best iter: {xgb_model.best_iteration}")

print(f"\nTrained {len(lgb_models)} LGB + {len(xgb_models)} XGB models.")

In [None]:
# Validate ensemble on held-out data
# NOTE: X_val/y_val were also used for early stopping during training, so the
# numbers printed here are optimistic rather than a clean out-of-sample test.
print("Validating ensemble...")

# LightGBM predictions.
predictions = [model.predict(X_val) for model in lgb_models]

# XGBoost: the native Booster.predict scores with every trained round unless
# told otherwise, which would include the rounds added after the best
# iteration. Restrict it to the early-stopped model explicitly.
dval = xgb.DMatrix(X_val)
for model in xgb_models:
    best_iter = getattr(model, 'best_iteration', None)
    if best_iter is None:
        pred = model.predict(dval)
    else:
        pred = model.predict(dval, iteration_range=(0, best_iter + 1))
    predictions.append(pred)

# Rows = models, columns = validation samples.
predictions = np.array(predictions)
mean_pred = predictions.mean(axis=0)
std_pred = predictions.std(axis=0)

# Position sizing: a Kelly-style tilt around the base position, where the
# ensemble disagreement (floored at 1e-5) stands in for uncertainty.
cfg = CONFIG
uncertainty = np.maximum(std_pred, 1e-5)
kelly = mean_pred / (cfg['risk_aversion'] * uncertainty**2 + 1e-8)
positions = np.clip(
    cfg['base_position'] + cfg['scale_factor'] * kelly,
    cfg['min_position'],
    cfg['max_position'],
)

# Calculate metrics (annualised assuming 252 periods per year).
strategy_returns = positions * y_val.values
market_returns = y_val.values

strategy_mean = strategy_returns.mean() * 252
strategy_vol = strategy_returns.std() * np.sqrt(252)
market_vol = market_returns.std() * np.sqrt(252)
sharpe = strategy_mean / (strategy_vol + 1e-8)

print(f"\n=== VALIDATION METRICS ===")
print(f"Strategy Annual Return: {strategy_mean:.4f}")
print(f"Strategy Annual Vol: {strategy_vol:.4f}")
print(f"Market Annual Vol: {market_vol:.4f}")
print(f"Sharpe Ratio: {sharpe:.4f}")
print(f"Position Mean: {positions.mean():.3f}")
print(f"Position Std: {positions.std():.3f}")
print(f"Position Min/Max: {positions.min():.3f} / {positions.max():.3f}")

In [None]:
# ============================================================================
# SAVE ARTIFACTS
# ============================================================================
print(f"\nSaving artifacts to {ARTIFACTS_DIR}...")

# Only the position-sizing settings are persisted as the saved config.
config_to_save = {
    key: CONFIG[key]
    for key in ('base_position', 'risk_aversion', 'scale_factor',
                'min_position', 'max_position')
}

# Scaler, feature list, LightGBM models and config are pickled one per file.
_pickled = [
    ('scaler.pkl', scaler),
    ('feature_cols.pkl', feature_cols),
    ('lgb_models.pkl', lgb_models),
]
for _filename, _payload in _pickled:
    with open(ARTIFACTS_DIR / _filename, 'wb') as _handle:
        pickle.dump(_payload, _handle)

# XGBoost boosters are written as one JSON file per model.
for _idx, _booster in enumerate(xgb_models):
    _booster.save_model(str(ARTIFACTS_DIR / f'xgb_model_{_idx}.json'))

with open(ARTIFACTS_DIR / 'config.pkl', 'wb') as _handle:
    pickle.dump(config_to_save, _handle)

# Save recent data for inference (last 300 labelled rows, engineered columns
# included).
recent_data = df_valid.tail(300).copy()
recent_data.to_parquet(ARTIFACTS_DIR / 'recent_data.parquet')

# List everything now present in the artifacts directory with its size.
print(f"\nArtifacts saved:")
for _artifact in sorted(ARTIFACTS_DIR.glob('*')):
    print(f"  {_artifact.name}: {_artifact.stat().st_size} bytes")

In [None]:
# Print the v3 position-sizing values next to the ones just saved, for a
# quick visual check of what changed.
_v3_config = {
    'base_position': 1.0,
    'risk_aversion': 50.0,
    'scale_factor': 80.0,
    'min_position': 0.2,
    'max_position': 1.8,
}

print("\n=== V4 CONFIG COMPARISON ===")
for _title, _settings in (("\nOld (v3) config:", _v3_config),
                          ("\nNew (v4) config:", config_to_save)):
    print(_title)
    for _key, _value in _settings.items():
        print(f"  {_key}: {_value}")

print("\n=== FEATURE COMPARISON ===")
print("Old feature count: ~187")
print(f"New feature count: {len(feature_cols)}")

print("\nTraining complete! Use artifacts_v4 in your submission notebook.")