# Hull Tactical Market Prediction - Ensemble Strategy

## Strategy Overview
- **Models**: LightGBM + XGBoost multi-seed ensemble with uncertainty quantification from the spread of member predictions (CatBoost is installed but not yet part of the ensemble)
- **Features**: Advanced feature engineering with rolling statistics, momentum, and regime detection
- **Position Sizing**: Kelly criterion-inspired sizing with volatility scaling
- **Validation**: Walk-forward validation to estimate out-of-sample performance and detect overfitting

In [None]:
# Install dependencies
!pip install -q lightgbm xgboost catboost polars scikit-learn

In [None]:
import os
import warnings
import pickle
from pathlib import Path
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
import polars as pl
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings('ignore')

# Configuration
class Config:
    """Static settings shared by the notebook: seed, column names, sizing and model params."""

    # Reproducibility and validation
    SEED = 42
    N_SPLITS = 5
    LOOKBACK_WINDOWS = [5, 10, 21, 63, 126, 252]  # Trading days
    TARGET_COL = 'market_forward_excess_returns'
    DATE_COL = 'date_id'

    # Position sizing
    BASE_POSITION = 1.0
    MAX_POSITION = 2.0
    MIN_POSITION = 0.0
    RISK_AVERSION = 50.0
    SCALE_FACTOR = 50.0

    # LightGBM training parameters
    LGB_PARAMS = dict(
        objective='regression',
        metric='mse',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.01,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=-1,
        seed=SEED,
        n_jobs=-1,
    )

    # XGBoost training parameters
    XGB_PARAMS = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        max_depth=6,
        learning_rate=0.01,
        subsample=0.8,
        colsample_bytree=0.8,
        seed=SEED,
        n_jobs=-1,
        verbosity=0,
    )


# Seed NumPy's global RNG once the configuration is available.
np.random.seed(Config.SEED)
print("Configuration loaded.")

In [None]:
# Data paths: use the Kaggle competition mount when it exists, otherwise the
# hard-coded local directories.
if os.path.exists('/kaggle/input/hull-tactical-market-prediction'):
    DATA_DIR = Path('/kaggle/input/hull-tactical-market-prediction')
    ARTIFACTS_DIR = Path('/kaggle/working')
else:
    DATA_DIR = Path('/home/user/aimo3/hull/hull-tactical-market-prediction')
    ARTIFACTS_DIR = Path('/home/user/aimo3/hull/artifacts')

# parents=True so a fresh checkout (no intermediate directories yet) does not
# fail with FileNotFoundError; exist_ok=True keeps re-runs idempotent.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
print(f"Data directory: {DATA_DIR}")
print(f"Artifacts directory: {ARTIFACTS_DIR}")

In [None]:
# Load data
train_df = pd.read_csv(DATA_DIR / 'train.csv')
print(f"Train shape: {train_df.shape}")
# Use the column names declared in Config instead of repeating the literals.
print(f"Date range: {train_df[Config.DATE_COL].min()} - {train_df[Config.DATE_COL].max()}")
print("\nTarget stats:")
print(train_df[Config.TARGET_COL].describe())

## Feature Engineering

In [None]:
class FeatureEngineer:
    """Build rolling, momentum, regime and interaction features, then robust-scale them.

    ``fit_transform`` learns three things from the training frame: which raw
    columns receive rolling statistics (``key_features``), the final ordered
    feature list (``feature_cols``) and the ``RobustScaler`` parameters.
    ``transform`` replays exactly that recipe on new data so training and
    inference see the same feature definitions.
    """

    # Identifier, target and target-derived columns that must never become model inputs.
    _EXCLUDE_COLS = (
        'date_id', 'forward_returns', 'risk_free_rate',
        'market_forward_excess_returns', 'is_scored',
        'lagged_forward_returns', 'lagged_risk_free_rate',
        'lagged_market_forward_excess_returns',
    )
    # Only columns with these dtypes are treated as numeric features.
    _NUMERIC_DTYPES = ('float64', 'int64')
    # Number of highest-variance raw columns that receive rolling statistics.
    _N_KEY_FEATURES = 20

    def __init__(self, lookback_windows: Optional[List[int]] = None):
        # NOTE: stored but not read by the methods below, which use their own
        # hard-coded window lists.
        self.lookback_windows = lookback_windows or [5, 10, 21, 63, 126, 252]
        self.feature_cols = None   # ordered feature names, set by fit_transform
        self.key_features = None   # columns chosen for rolling stats, set by fit_transform
        self.scaler = RobustScaler()

    def _get_base_features(self, df: pd.DataFrame) -> List[str]:
        """Return every column of ``df`` that is not an id/target/lagged-target column."""
        return [c for c in df.columns if c not in self._EXCLUDE_COLS]

    def _select_key_features(self, df: pd.DataFrame, base_cols: List[str]) -> List[str]:
        """Return the (up to 20) numeric base columns with the largest variance in ``df``."""
        numeric_cols = [c for c in base_cols if df[c].dtype in self._NUMERIC_DTYPES]
        variances = df[numeric_cols].var().sort_values(ascending=False)
        return variances.head(self._N_KEY_FEATURES).index.tolist()

    def _add_rolling_features(self, df: pd.DataFrame, base_cols: List[str],
                              key_features: Optional[List[str]] = None) -> pd.DataFrame:
        """Add rolling mean, std and z-score (windows 5/21/63) for the key features.

        Args:
            df: Input frame; it is copied, not modified.
            base_cols: Candidate raw feature columns.
            key_features: Columns to compute rolling stats for. When ``None``
                they are selected by variance from ``df`` itself; ``transform``
                passes the selection learned at fit time so the same columns
                are used for training and inference.

        Returns:
            A copy of ``df`` with ``{col}_ma{w}``, ``{col}_std{w}`` and
            ``{col}_zscore{w}`` columns for each window shorter than the frame.
        """
        df = df.copy()

        if key_features is None:
            key_features = self._select_key_features(df, base_cols)

        for col in key_features:
            for window in (5, 21, 63):
                if len(df) > window:
                    roller = df[col].rolling(window, min_periods=1)
                    df[f'{col}_ma{window}'] = roller.mean()
                    df[f'{col}_std{window}'] = roller.std()
                    # Epsilon keeps the z-score finite when the rolling std is zero.
                    df[f'{col}_zscore{window}'] = (
                        (df[col] - df[f'{col}_ma{window}']) /
                        (df[f'{col}_std{window}'] + 1e-8)
                    )

        return df

    def _add_momentum_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add cumulative-return, volatility, Sharpe-like and trend features.

        The return series is ``lagged_forward_returns`` when present; rows where
        it is missing are filled from ``forward_returns`` shifted by one row
        (the same quantity, lagged to avoid lookahead). This keeps the series
        continuous when a frame stacks rows that only carry ``forward_returns``
        on top of rows that only carry ``lagged_forward_returns``. The
        ``forward_returns`` column itself is left untouched. If neither column
        exists the frame is returned unchanged (as a copy).
        """
        df = df.copy()

        returns = df['lagged_forward_returns'] if 'lagged_forward_returns' in df.columns else None
        if 'forward_returns' in df.columns:
            shifted = df['forward_returns'].shift(1)  # Lag to avoid lookahead
            returns = shifted if returns is None else returns.fillna(shifted)
        if returns is None:
            return df

        # Cumulative returns, realised volatility and their ratio per window
        for window in (5, 10, 21, 63):
            if len(df) > window:
                roller = returns.rolling(window, min_periods=1)
                cum_ret = roller.sum()
                ret_vol = roller.std()
                df[f'cum_ret_{window}d'] = cum_ret
                df[f'ret_vol_{window}d'] = ret_vol
                # Sharpe-like ratio
                df[f'sharpe_{window}d'] = cum_ret / (ret_vol * np.sqrt(window) + 1e-8)

        # Trend strength: slope of a straight-line fit over each full 21-row window.
        if len(df) > 21:
            positions = np.arange(21)
            # rolling(21) only evaluates complete windows, so every window has 21 values;
            # raw=True hands the lambda plain ndarrays instead of building a Series each time.
            df['trend_21d'] = returns.rolling(21).apply(
                lambda window_values: np.polyfit(positions, window_values, 1)[0],
                raw=True
            )

        return df

    def _add_regime_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add volatility- and sentiment-regime features from 'V*' and 'S*' columns.

        NOTE(review): the prefix match also picks up derived columns created
        earlier in the pipeline (e.g. 'V1_ma5'), not only raw inputs - confirm
        this is intended.
        """
        df = df.copy()

        # Volatility regime (using V features)
        vol_cols = [c for c in df.columns if c.startswith('V')]
        if vol_cols:
            df['vol_regime'] = df[vol_cols].mean(axis=1)
            df['vol_regime_ma21'] = df['vol_regime'].rolling(21, min_periods=1).mean()
            df['vol_regime_high'] = (df['vol_regime'] > df['vol_regime_ma21']).astype(int)

        # Sentiment regime (using S features)
        sent_cols = [c for c in df.columns if c.startswith('S')]
        if sent_cols:
            df['sent_regime'] = df[sent_cols].mean(axis=1)
            df['sent_regime_ma21'] = df['sent_regime'].rolling(21, min_periods=1).mean()

        return df

    def _add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add per-category row means/stds and two cross-category interactions.

        Categories are identified by the first letter of the column name
        (M, E, I, P, V, S); columns starting with '<letter>_' are skipped.
        """
        df = df.copy()

        # Category means
        categories = {'M': 'market', 'E': 'econ', 'I': 'interest',
                      'P': 'price', 'V': 'vol', 'S': 'sent'}

        for prefix, name in categories.items():
            cols = [c for c in df.columns if c.startswith(prefix) and
                    not c.startswith(f'{prefix}_') and
                    df[c].dtype in self._NUMERIC_DTYPES]
            if cols:
                df[f'{name}_mean'] = df[cols].mean(axis=1)
                df[f'{name}_std'] = df[cols].std(axis=1)

        # Key interactions
        if 'vol_mean' in df.columns and 'sent_mean' in df.columns:
            df['vol_sent_interact'] = df['vol_mean'] * df['sent_mean']

        if 'market_mean' in df.columns and 'vol_mean' in df.columns:
            df['market_vol_interact'] = df['market_mean'] / (df['vol_mean'] + 1e-8)

        return df

    def _engineer(self, df: pd.DataFrame, base_cols: List[str],
                  key_features: Optional[List[str]]) -> pd.DataFrame:
        """Run the four feature builders in their fixed order and return the new frame."""
        df = self._add_rolling_features(df, base_cols, key_features)
        df = self._add_momentum_features(df)
        df = self._add_regime_features(df)
        df = self._add_interaction_features(df)
        return df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Learn the feature recipe and scaler from ``df`` and return the scaled frame.

        Sets ``key_features``, ``feature_cols`` and fits ``scaler``. The input
        frame is not modified; NaNs in feature columns are replaced by 0 before
        scaling.
        """
        base_cols = self._get_base_features(df)

        # Freeze the rolling-stat column choice so transform() reuses it.
        self.key_features = self._select_key_features(df, base_cols)
        df = self._engineer(df, base_cols, self.key_features)

        # Every numeric column that is not an id/target column is a feature.
        self.feature_cols = [c for c in df.columns
                             if c not in self._EXCLUDE_COLS and
                             df[c].dtype in self._NUMERIC_DTYPES]

        # Fill NaN and scale
        df[self.feature_cols] = df[self.feature_cols].fillna(0)
        df[self.feature_cols] = self.scaler.fit_transform(df[self.feature_cols])

        return df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the fitted feature recipe and scaler to ``df``.

        Feature columns that cannot be built from ``df`` are created as 0 so
        the output always contains every column in ``feature_cols``.

        Raises:
            RuntimeError: if called before ``fit_transform``.
        """
        if self.feature_cols is None:
            raise RuntimeError("FeatureEngineer.transform() called before fit_transform()")

        base_cols = self._get_base_features(df)

        # Reuse the columns chosen at fit time; getattr keeps instances pickled
        # before ``key_features`` existed working (they fall back to re-selection).
        key_features = getattr(self, 'key_features', None)
        if key_features is not None:
            key_features = [c for c in key_features
                            if c in df.columns and pd.api.types.is_numeric_dtype(df[c])]

        df = self._engineer(df, base_cols, key_features)

        # Ensure all feature columns exist
        for col in self.feature_cols:
            if col not in df.columns:
                df[col] = 0

        df[self.feature_cols] = df[self.feature_cols].fillna(0)
        df[self.feature_cols] = self.scaler.transform(df[self.feature_cols])

        return df

print("FeatureEngineer class defined.")

## Model Training with Walk-Forward Validation

In [None]:
class EnsembleModel:
    """Multi-seed LightGBM + XGBoost ensemble.

    ``predict`` returns the mean of all member predictions together with their
    standard deviation, which is used downstream as an uncertainty estimate.
    """

    def __init__(self, n_seeds: int = 3):
        self.n_seeds = n_seeds      # one LightGBM and one XGBoost model per seed
        self.lgb_models = []
        self.xgb_models = []
        self.feature_cols = None    # training column order, set by train()

    def train(self, X: pd.DataFrame, y: pd.Series,
              X_val: Optional[pd.DataFrame] = None, y_val: Optional[pd.Series] = None):
        """Train ``n_seeds`` LightGBM and ``n_seeds`` XGBoost models.

        Args:
            X: Training features; the column order is remembered for predict().
            y: Training target.
            X_val: Optional validation features. When given, both libraries use
                early stopping with a patience of 50 rounds.
            y_val: Validation target matching ``X_val``.
        """
        self.feature_cols = X.columns.tolist()
        # Start from a clean slate so calling train() again does not mix in
        # models fitted on a previous (possibly different) feature set.
        self.lgb_models = []
        self.xgb_models = []

        has_val = X_val is not None

        # The XGBoost matrices do not depend on the seed, so build them once.
        dtrain = xgb.DMatrix(X, label=y)
        dval = xgb.DMatrix(X_val, label=y_val) if has_val else None
        evals = [(dval, 'val')] if has_val else []

        for seed in range(self.n_seeds):
            # LightGBM
            lgb_params = Config.LGB_PARAMS.copy()
            lgb_params['seed'] = Config.SEED + seed

            train_data = lgb.Dataset(X, label=y)
            val_data = lgb.Dataset(X_val, label=y_val) if has_val else None

            lgb_model = lgb.train(
                lgb_params,
                train_data,
                num_boost_round=1000,
                valid_sets=[val_data] if has_val else None,
                callbacks=[lgb.early_stopping(50, verbose=False)] if has_val else None
            )
            self.lgb_models.append(lgb_model)

            # XGBoost
            xgb_params = Config.XGB_PARAMS.copy()
            xgb_params['seed'] = Config.SEED + seed

            xgb_model = xgb.train(
                xgb_params,
                dtrain,
                num_boost_round=1000,
                evals=evals,
                early_stopping_rounds=50 if has_val else None,
                verbose_eval=False
            )
            self.xgb_models.append(xgb_model)

        print(f"Trained {len(self.lgb_models)} LGB + {len(self.xgb_models)} XGB models")

    @staticmethod
    def _xgb_predict(model: "xgb.Booster", dmatrix: "xgb.DMatrix") -> np.ndarray:
        """Predict with an XGBoost booster, limited to its best early-stopped iteration.

        The native ``Booster.predict`` uses every tree unless ``iteration_range``
        is given, so the rounds trained after the best validation score would
        otherwise be included. Boosters without a recorded ``best_iteration``
        (no early stopping) are used in full.
        """
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is None:
            return model.predict(dmatrix)
        return model.predict(dmatrix, iteration_range=(0, int(best_iteration) + 1))

    def predict(self, X: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Return ``(mean, std)`` of the member predictions for each row of ``X``.

        Raises:
            RuntimeError: if no model has been trained or loaded yet.
        """
        if not self.lgb_models and not self.xgb_models:
            raise RuntimeError("EnsembleModel.predict() called before any model was trained")

        # Ensure column order matches training
        X = X[self.feature_cols]

        # LightGBM predictions
        member_preds = [model.predict(X) for model in self.lgb_models]

        # XGBoost predictions (the DMatrix is only needed when such models exist)
        if self.xgb_models:
            dtest = xgb.DMatrix(X)
            member_preds.extend(self._xgb_predict(model, dtest) for model in self.xgb_models)

        stacked = np.array(member_preds)

        # Mean is the ensemble forecast; std across members is the uncertainty proxy.
        return stacked.mean(axis=0), stacked.std(axis=0)

print("EnsembleModel class defined.")

In [None]:
class PositionSizer:
    """Turn a return forecast and its uncertainty into a clipped position size."""

    def __init__(self,
                 base_position: float = 1.0,
                 risk_aversion: float = 50.0,
                 scale_factor: float = 50.0,
                 min_position: float = 0.0,
                 max_position: float = 2.0):
        self.base_position = base_position    # position held when the signal is zero
        self.risk_aversion = risk_aversion    # divides the Kelly-style fraction
        self.scale_factor = scale_factor      # multiplies the Kelly-style fraction
        self.min_position = min_position      # lower clip bound
        self.max_position = max_position      # upper clip bound

    def size_position(self,
                      prediction: float,
                      uncertainty: float,
                      recent_vol: Optional[float] = None) -> float:
        """Calculate the position for one forecast.

        ``position = base + scale_factor * prediction / (risk_aversion * uncertainty**2)``,
        optionally shrunk towards ``base_position`` when ``recent_vol`` exceeds
        0.015, then clipped to ``[min_position, max_position]``.

        Args:
            prediction: Forecast return.
            uncertainty: Forecast uncertainty; floored at 1e-6.
            recent_vol: Optional recent volatility; ignored when ``None`` or not positive.

        Returns:
            The clipped position. A NaN/inf prediction or uncertainty yields the
            (clipped) base position instead of propagating NaN.
        """
        # Without a usable signal, hold the neutral position rather than return NaN.
        if not (np.isfinite(prediction) and np.isfinite(uncertainty)):
            return float(np.clip(self.base_position, self.min_position, self.max_position))

        # Base Kelly-inspired sizing
        # position = prediction / (risk_aversion * uncertainty^2)
        uncertainty = max(uncertainty, 1e-6)
        kelly_fraction = prediction / (self.risk_aversion * uncertainty**2 + 1e-8)

        # Scale and add to base
        position = self.base_position + self.scale_factor * kelly_fraction

        # Volatility adjustment if available: only ever shrinks the active bet.
        if recent_vol is not None and recent_vol > 0:
            vol_adj = min(1.0, 0.015 / recent_vol)  # Target ~1.5% daily vol
            position = self.base_position + (position - self.base_position) * vol_adj

        # Clip to valid range
        return float(np.clip(position, self.min_position, self.max_position))

print("PositionSizer class defined.")

In [None]:
# Walk-forward validation
def _fold_metrics(positions: np.ndarray, market_returns: np.ndarray) -> dict:
    """Compute annualised return/vol, Sharpe and volatility-penalised Sharpe for one fold."""
    strategy_returns = positions * market_returns

    strategy_mean = strategy_returns.mean() * 252
    strategy_vol = strategy_returns.std() * np.sqrt(252)
    market_vol = market_returns.std() * np.sqrt(252)

    sharpe = strategy_mean / (strategy_vol + 1e-8)

    # Penalise the Sharpe once strategy volatility exceeds 1.2x the market's.
    vol_penalty = max(1.0, (strategy_vol / market_vol) / 1.2) if market_vol > 0 else 1.0

    return {
        'strategy_mean': strategy_mean,
        'sharpe': sharpe,
        'adjusted_sharpe': sharpe / vol_penalty,
        'strategy_vol': strategy_vol,
        'market_vol': market_vol,
        'position_mean': positions.mean(),
        'position_std': positions.std(),
    }


def walk_forward_validation(df: pd.DataFrame,
                            n_splits: int = 5,
                            test_size: int = 180) -> pd.DataFrame:
    """Walk-forward validation with expanding training windows.

    Each fold trains on all rows before the test window (the last 20% of those
    rows is used for early stopping) and evaluates on the next ``test_size``
    rows. At least 500 training rows are required; ``n_splits`` is reduced when
    the data cannot provide them.

    Args:
        df: Frame with 'date_id', 'market_forward_excess_returns' and feature columns.
        n_splits: Requested number of folds.
        test_size: Rows per test window.

    Returns:
        One row per fold with columns fold, sharpe, adjusted_sharpe,
        strategy_vol, market_vol, position_mean, position_std. Empty (but with
        those columns) when no fold could be evaluated.
    """
    result_columns = ['fold', 'sharpe', 'adjusted_sharpe', 'strategy_vol',
                      'market_vol', 'position_mean', 'position_std']
    # Trailing training rows prepended to each test window so rolling features
    # start warm; 300 matches the history length kept for live inference.
    context_rows = 300

    results = []
    df = df.sort_values('date_id').reset_index(drop=True)

    # Use only rows with valid target
    df_valid = df[df['market_forward_excess_returns'].notna()].copy()

    total_rows = len(df_valid)
    min_train_size = total_rows - n_splits * test_size

    if min_train_size < 500:
        min_train_size = 500
        n_splits = (total_rows - min_train_size) // test_size

    print(f"Total rows: {total_rows}, Train min: {min_train_size}, Test size: {test_size}, Splits: {n_splits}")

    for fold in range(n_splits):
        train_end = min_train_size + fold * test_size
        test_end = train_end + test_size

        if test_end > total_rows:
            break

        train_df = df_valid.iloc[:train_end]
        test_df = df_valid.iloc[train_end:test_end]

        print(f"\nFold {fold+1}: Train {len(train_df)} rows, Test {len(test_df)} rows")

        # Feature engineering. The test window is transformed together with the
        # tail of the training data so its first rows see real history instead
        # of cold-start rolling windows; only the test rows are kept afterwards.
        fe = FeatureEngineer()
        train_fe = fe.fit_transform(train_df.copy())
        context = train_df.tail(context_rows)
        test_fe = fe.transform(pd.concat([context, test_df])).iloc[len(context):]

        # Prepare data
        X_train = train_fe[fe.feature_cols]
        y_train = train_df['market_forward_excess_returns']
        X_test = test_fe[fe.feature_cols]
        y_test = test_df['market_forward_excess_returns']

        # Use last 20% of train as validation
        val_size = int(len(X_train) * 0.2)
        X_val = X_train.iloc[-val_size:]
        y_val = y_train.iloc[-val_size:]
        X_train_sub = X_train.iloc[:-val_size]
        y_train_sub = y_train.iloc[:-val_size]

        # Train model
        model = EnsembleModel(n_seeds=3)
        model.train(X_train_sub, y_train_sub, X_val, y_val)

        # Predict
        preds, uncertainty = model.predict(X_test)

        # Position sizing
        sizer = PositionSizer(
            base_position=Config.BASE_POSITION,
            risk_aversion=Config.RISK_AVERSION,
            scale_factor=Config.SCALE_FACTOR
        )
        positions = np.array([
            sizer.size_position(pred, unc) for pred, unc in zip(preds, uncertainty)
        ])

        # Calculate metrics
        metrics = _fold_metrics(positions, y_test.values)

        print(f"  Strategy return: {metrics['strategy_mean']:.4f}, Vol: {metrics['strategy_vol']:.4f}")
        print(f"  Sharpe: {metrics['sharpe']:.4f}, Adjusted: {metrics['adjusted_sharpe']:.4f}")
        print(f"  Position mean: {positions.mean():.3f}, std: {positions.std():.3f}")

        row = {'fold': fold}
        row.update({name: metrics[name] for name in result_columns if name != 'fold'})
        results.append(row)

    # Summary
    print("\n" + "="*50)
    print("WALK-FORWARD VALIDATION SUMMARY")
    print("="*50)

    if not results:
        # Not enough rows for a single fold: report it instead of failing on
        # missing columns in an empty frame.
        print("No folds were evaluated (not enough data for the requested split).")
        return pd.DataFrame(columns=result_columns)

    results_df = pd.DataFrame(results)
    print(f"Mean Adjusted Sharpe: {results_df['adjusted_sharpe'].mean():.4f} ± {results_df['adjusted_sharpe'].std():.4f}")
    print(f"Mean Position: {results_df['position_mean'].mean():.3f}")

    return results_df

# Run validation (fold count comes from Config rather than a repeated literal)
print("Running walk-forward validation...")
val_results = walk_forward_validation(train_df, n_splits=Config.N_SPLITS, test_size=180)

## Train Final Model on Full Data (most recent 20% held out for early stopping)

In [None]:
# Train final model on all available data
print("Training final model on full dataset...")

# Prepare full dataset: rows with a known target, in chronological order
df_full = train_df[train_df['market_forward_excess_returns'].notna()].copy()
df_full = df_full.sort_values('date_id').reset_index(drop=True)

# Feature engineering (fit on a copy so df_full keeps the raw columns)
feature_engineer = FeatureEngineer()
df_fe = feature_engineer.fit_transform(df_full.copy())

# Prepare features and target
X_full = df_fe[feature_engineer.feature_cols]
y_full = df_full['market_forward_excess_returns']

# Use last 20% as validation for early stopping.
# NOTE: these most recent rows are therefore not used to fit the trees.
val_size = int(len(X_full) * 0.2)
X_train = X_full.iloc[:-val_size]
y_train = y_full.iloc[:-val_size]
X_val = X_full.iloc[-val_size:]
y_val = y_full.iloc[-val_size:]

print(f"Training size: {len(X_train)}, Validation size: {len(X_val)}")
print(f"Number of features: {len(feature_engineer.feature_cols)}")

# Train ensemble
final_model = EnsembleModel(n_seeds=5)  # More seeds for final model
final_model.train(X_train, y_train, X_val, y_val)

# Position sizer: pass the clip bounds explicitly so it always matches the
# min/max values that are written to the saved position config.
position_sizer = PositionSizer(
    base_position=Config.BASE_POSITION,
    risk_aversion=Config.RISK_AVERSION,
    scale_factor=Config.SCALE_FACTOR,
    min_position=Config.MIN_POSITION,
    max_position=Config.MAX_POSITION
)

print("\nFinal model trained successfully!")

In [None]:
# Save artifacts
print("Saving model artifacts...")

# Save feature engineer
with open(ARTIFACTS_DIR / 'feature_engineer.pkl', 'wb') as f:
    pickle.dump(feature_engineer, f)

# Save models
with open(ARTIFACTS_DIR / 'lgb_models.pkl', 'wb') as f:
    pickle.dump(final_model.lgb_models, f)

# Save XGBoost models. The loader reads xgb_model_0.json, xgb_model_1.json, ...
# until a file is missing, so boosters left over from an earlier run with more
# seeds must be removed first or they would silently join the ensemble.
for stale in ARTIFACTS_DIR.glob('xgb_model_*.json'):
    stale.unlink()
for i, model in enumerate(final_model.xgb_models):
    model.save_model(str(ARTIFACTS_DIR / f'xgb_model_{i}.json'))

# Save feature columns
with open(ARTIFACTS_DIR / 'feature_cols.pkl', 'wb') as f:
    pickle.dump(final_model.feature_cols, f)

# Save position sizer config
position_config = {
    'base_position': Config.BASE_POSITION,
    'risk_aversion': Config.RISK_AVERSION,
    'scale_factor': Config.SCALE_FACTOR,
    'min_position': Config.MIN_POSITION,
    'max_position': Config.MAX_POSITION
}
with open(ARTIFACTS_DIR / 'position_config.pkl', 'wb') as f:
    pickle.dump(position_config, f)

# Save recent training data for online features
recent_data = df_full.tail(300).copy()  # Last 300 days for feature calculation
recent_data.to_parquet(ARTIFACTS_DIR / 'recent_data.parquet')

print(f"Artifacts saved to {ARTIFACTS_DIR}")
print(f"Files: {list(ARTIFACTS_DIR.glob('*'))}")

## Inference Server Setup

In [None]:
# Inference code for submission
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb

# Import evaluation API
import kaggle_evaluation.default_inference_server

# Locate the artifacts: the first existing Kaggle location wins, otherwise the
# local development directory is used.
ARTIFACTS_DIR = Path(next(
    (
        candidate
        for candidate in ('/kaggle/input/hull-submission-artifacts', '/kaggle/working')
        if os.path.exists(candidate)
    ),
    '/home/user/aimo3/hull/artifacts',
))

# Inference state, filled lazily by load_models() on the first predict() call.
feature_engineer = lgb_models = xgb_models = None
feature_cols = position_config = recent_data = None
# One entry per predict() call, newest last.
history_buffer = []

def load_models():
    """Populate the module-level inference globals from the files in ARTIFACTS_DIR."""
    global feature_engineer, lgb_models, xgb_models, feature_cols, position_config, recent_data

    print(f"Loading artifacts from {ARTIFACTS_DIR}...")

    def _unpickle(filename):
        # NOTE: pickle.load can execute arbitrary code; only point ARTIFACTS_DIR
        # at artifacts produced by this notebook.
        with open(ARTIFACTS_DIR / filename, 'rb') as handle:
            return pickle.load(handle)

    feature_engineer = _unpickle('feature_engineer.pkl')
    lgb_models = _unpickle('lgb_models.pkl')

    # XGBoost boosters are stored one JSON file per model, numbered from 0;
    # loading stops at the first missing index.
    xgb_models = []
    index = 0
    while True:
        booster_path = ARTIFACTS_DIR / f'xgb_model_{index}.json'
        if not booster_path.exists():
            break
        booster = xgb.Booster()
        booster.load_model(str(booster_path))
        xgb_models.append(booster)
        index += 1

    feature_cols = _unpickle('feature_cols.pkl')
    position_config = _unpickle('position_config.pkl')

    # Tail of the training data, used as history for the first live predictions.
    recent_data = pd.read_parquet(ARTIFACTS_DIR / 'recent_data.parquet')

    print(f"Loaded {len(lgb_models)} LGB + {len(xgb_models)} XGB models")
    print(f"Feature columns: {len(feature_cols)}")

def predict(test: pl.DataFrame) -> float:
    """Main prediction function for the evaluation API.

    Appends the incoming rows to a rolling history, rebuilds the features on
    that history (padded with saved training rows so rolling windows are warm),
    scores the most recent row with every ensemble member and converts the
    mean/std of those scores into a clipped position.

    Args:
        test: The rows delivered by the evaluation API for the current step.

    Returns:
        Position within ``[min_position, max_position]`` from the saved config.
    """
    global feature_engineer, lgb_models, xgb_models, feature_cols, position_config, recent_data, history_buffer

    # Load models on first call
    if feature_engineer is None:
        load_models()

    # Convert to pandas
    test_pd = test.to_pandas()

    # Add to history buffer (keep the newest 300 entries)
    history_buffer.append(test_pd)
    if len(history_buffer) > 300:
        history_buffer = history_buffer[-300:]

    live = pd.concat(history_buffer, ignore_index=True)

    # Always pad the live rows with saved training rows up to 300 rows in total.
    # Switching to the live rows alone as soon as 21 are buffered would leave the
    # 21- and 63-row windows without enough history, and those features would
    # then be zero-filled by the feature engineer.
    context = recent_data
    if 'date_id' in live.columns and 'date_id' in context.columns:
        # Only rows strictly older than the live data may serve as history;
        # this also prevents duplicated dates if the two sources overlap.
        context = context[context['date_id'] < live['date_id'].min()]
    n_context = max(300 - len(live), 0)

    pieces = []
    if n_context > 0 and len(context) > 0:
        pieces.append(context.tail(n_context))
    pieces.append(live)
    combined = pd.concat(pieces, ignore_index=True)

    # Transform features
    combined_fe = feature_engineer.transform(combined.copy())

    # Get features for current row
    X = combined_fe[feature_cols].iloc[[-1]]

    # Predict with ensemble
    member_preds = [float(model.predict(X)[0]) for model in lgb_models]

    if xgb_models:
        dtest = xgb.DMatrix(X)
        for model in xgb_models:
            # The native Booster.predict uses all trees; restrict it to the best
            # early-stopped iteration when the booster recorded one.
            best_iteration = getattr(model, 'best_iteration', None)
            if best_iteration is None:
                raw = model.predict(dtest)
            else:
                raw = model.predict(dtest, iteration_range=(0, int(best_iteration) + 1))
            member_preds.append(float(raw[0]))

    predictions = np.array(member_preds)
    mean_pred = predictions.mean()
    std_pred = predictions.std()

    # Position sizing
    position = position_config['base_position']
    if np.isfinite(mean_pred) and np.isfinite(std_pred):
        uncertainty = max(std_pred, 1e-6)
        kelly_fraction = mean_pred / (position_config['risk_aversion'] * uncertainty**2 + 1e-8)
        position = position + position_config['scale_factor'] * kelly_fraction
    # else: no usable signal, so hold the base position instead of returning NaN.
    position = np.clip(position, position_config['min_position'], position_config['max_position'])

    return float(position)

# Create inference server: hand the predict() callback to the evaluation API's
# DefaultInferenceServer; the server object is used by the run cell below.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

print("Inference server configured.")

In [None]:
# Run inference: a local gateway test by default, the real API during a
# competition rerun (signalled by the KAGGLE_IS_COMPETITION_RERUN env var).
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Local testing mode
    print("Running local gateway test...")
    inference_server.run_local_gateway((str(DATA_DIR),))
    print("Local test completed!")
else:
    # Production mode - serve the API
    inference_server.serve()