In [1]:
# Imports Required Modules
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV  # ← Added GridSearchCV
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
print("Linear Models Ridge + XGBoost + LightGBM for Alpha Prediction")

Linear Models Ridge + XGBoost + LightGBM for Alpha Prediction


In [2]:
# Load Data
print("1. Loading 19 alpha factors...")
data = pd.read_csv('alpha_factors.csv', index_col=0, parse_dates=True)
X = data.drop(columns='spy_return')  # Features
y = data['spy_return']               # Target next-day return
print(f"Dataset: {X.shape} features x {len(y)} observations")
print(f"Target Next-day SPY return mean: {y.mean():.4f}")

1. Loading 19 alpha factors...
Dataset: (2729, 19) features x 2729 observations
Target Next-day SPY return mean: 0.0005


In [3]:
# Time-Series Cross-Validation for Easy retrival of trading data based on Date
print("\nStep2: Time-Series CV setup....")

# This will do 5 folds. Fold1: Train on 2015 - 2018 data, test on 2019 data, similarly for all 
tscv = TimeSeriesSplit(n_splits = 5)
ridge_scores, xgb_scores = [], []

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.fillna(0)) # Fill NaN for ML

for i, (train_idx, test_idx) in enumerate(tscv.split(X_scaled)):
    X_tr, X_te = X_scaled[train_idx], X_scaled[test_idx]
    y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

    # Ridge Regression
    ridge = Ridge(alpha = 1.0)
    ridge.fit(X_tr, y_tr)
    ridge_scores.append(ridge.score(X_te, y_te))

    # XGBoost
    xgb1 = xgb.XGBRegressor(
        n_estimators = 100, 
        max_depth = 4,
        learning_rate = 0.05, 
        random_state = 42,
        objective = 'reg:squarederror'
    )
    xgb1.fit(X_tr, y_tr)
    xgb_scores.append(xgb1.score(X_te, y_te))

    print(f"Fold {i+1}: Ridge R²={ridge.score(X_te, y_te):.3f}, XGB R²={xgb1.score(X_te, y_te):.3f}")

print(f"\nMEAN R²: Ridge={np.mean(ridge_scores):.3f}, XGBoost={np.mean(xgb_scores):.3f}")


Step2: Time-Series CV setup....
Fold 1: Ridge R²=0.118, XGB R²=0.411
Fold 2: Ridge R²=-0.045, XGB R²=0.302
Fold 3: Ridge R²=0.129, XGB R²=0.479
Fold 4: Ridge R²=0.402, XGB R²=0.440
Fold 5: Ridge R²=0.332, XGB R²=0.521

MEAN R²: Ridge=0.187, XGBoost=0.431


In [4]:
# Hyperparameter Turning for XGBoost
from sklearn.model_selection import GridSearchCV
print("\nStep3: Hyperparameter Turning..........")

# Conservative Params for Financial Data - To avoid overfitting
xgb_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 4],
    'learning_rate': [0.01, 0.05],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_lambda': [1.0, 2.0] # L2 for Regularization
}

tscv = TimeSeriesSplit(n_splits = 3) # Fewer folds for Speed
xgb_grid = GridSearchCV(
    xgb.XGBRegressor(random_state = 42, objective = 'reg:squarederror'),
    xgb_param_grid, cv = tscv, scoring = 'r2', n_jobs = -1, verbose = 1
)

# Apply standard scaler
scaler = StandardScaler()
X_scaled_tune = scaler.fit_transform(X.fillna(0))

# Fit on First 80% for Tuning
split_tune = int(0.8 * len(X_scaled_tune))
xgb_grid.fit(X_scaled_tune[:split_tune], y.iloc[:split_tune])

print(f"\nBest param: {xgb_grid.best_params_}")
print(f"\nBest CV R-Square: {xgb_grid.best_score_:.3f}")

# use best parameters for final model
best_xgb_params = xgb_grid.best_params_
best_xgb_params.update({'random_state': 42, 'objective':'reg:squarederror'})


Step3: Hyperparameter Turning..........
Fitting 3 folds for each of 16 candidates, totalling 48 fits

Best param: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'reg_lambda': 1.0, 'subsample': 0.8}

Best CV R-Square: 0.397


In [5]:
# Train/Test Split + Model
print("\nStep4: Train/Test Split...")
split_idx = int(0.8 * len(X))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.fillna(0))
X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

print(f"Train: {len(X_train)} days, Test: {len(X_test)} days")

# Use best params
final_xgb = xgb.XGBRegressor(**best_xgb_params)
final_xgb.fit(X_train, y_train)

data['predictedreturn'] = pd.Series(final_xgb.predict(X_scaled), index=X.index)
test_r2 = final_xgb.score(X_test, y_test)
print(f"OOS Test R²: {test_r2:.3f}")



Step4: Train/Test Split...
Train: 2183 days, Test: 546 days
OOS Test R²: 0.535


In [6]:
# We are now using the Long Only Strategy
print("\nStep6: Long Only Strategy...")

pred_std = data['predictedreturn'].std()
long_threshold = 0.3 * pred_std  # change back to 5

# LONG-ONLY: No shorts, just best long signals
data['long_signal'] = (data['predictedreturn'] > long_threshold).astype(int)
data['weight'] = data['long_signal'] * 0.8  # 80% position on best signals

# Smooth to hold winners longer
data['weight'] = data['weight'].ewm(span=5).mean()
data['strategy_return'] = data['weight'].shift(1) * data['spy_return']
sharpe_ratio = data['strategy_return'].mean() / data['strategy_return'].std() * np.sqrt(252)

print(f"LONG-ONLY Sharpe: {sharpe_ratio:.3f}")
print(f"Long signals: {(data['long_signal']==1).sum()}")
print(f"Avg weight: {data['weight'].mean():.2f}")
print(f"Turnover: {(data['weight'].diff().abs() > 0.1).mean():.2f}")


Step6: Long Only Strategy...
LONG-ONLY Sharpe: 0.418
Long signals: 1056
Avg weight: 0.31
Turnover: 0.50


In [7]:
# Transaction Costs
# LONG-ONLY BENCHMARK + TRANSACTION COSTS
print("\nStep7: Benchmark Comparision")

# 1. LONG-ONLY BENCHMARK (60% constant SPY exposure)
data['long_only'] = 0.8 * data['spy_return'] # Change back 6
long_only_sharpe = data['long_only'].mean() / data['long_only'].std() * np.sqrt(252)
long_only_alpha = sharpe_ratio - long_only_sharpe

# 2. QUICK TRANSACTION COSTS (simplified)
turnover = data['weight'].diff().abs().fillna(0)
tc_bps = 2 / 10000  # 2bps round-trip (conservative)
data['tc_cost'] = turnover * tc_bps
data['net_return'] = data['strategy_return'] - data['tc_cost']
net_sharpe = data['net_return'].mean() / data['net_return'].std() * np.sqrt(252)

# 3. RESULTS TABLE
print(f"{'='*50}")
print(f"STRATEGY PERFORMANCE")
print(f"{'='*50}")
print(f"Strategy (Gross) Sharpe:   {sharpe_ratio:.3f}")
print(f"Net Sharpe (after costs):  {net_sharpe:.3f}")
print(f"{'-'*50}")
print(f"LONG-ONLY 60% SPY Sharpe:  {long_only_sharpe:.3f}")
print(f"Alpha vs Long-only:        {long_only_alpha:.3f}")
print(f"{'='*50}")
print(f"Turnover: {turnover.mean():.1%} ({turnover.mean()*252:.0f} trades/year)")

# Winrate
trading_days = data['weight'].abs() > 0.01
win_rate = (data['strategy_return'][trading_days] > 0).mean()
print(f"Win Rate (trading days): {win_rate:.1%}")

# Max Drawdown
equity_curve = (1 + data['strategy_return']).cumprod()
running_max = equity_curve.expanding().max()
drawdown = (equity_curve - running_max) / running_max
max_dd = drawdown.min()
print(f"Max Drawdown: {max_dd:.1%}")


Step7: Benchmark Comparision
STRATEGY PERFORMANCE
Strategy (Gross) Sharpe:   0.418
Net Sharpe (after costs):  0.327
--------------------------------------------------
LONG-ONLY 60% SPY Sharpe:  0.701
Alpha vs Long-only:        -0.283
Turnover: 10.7% (27 trades/year)
Win Rate (trading days): 55.2%
Max Drawdown: -16.1%


In [8]:
# Add this as the FINAL cell in Production_v1.0.ipynb
import os
from datetime import date
import pandas as pd

base_path = r"C:\Users\myoge\Contacts\Regression Based Project"
runs_path = os.path.join(base_path, "runs")
os.makedirs(runs_path, exist_ok=True)

run_date = date.today().strftime("%Y-%m-%d")

# Create daily summary WITH Sharpe column (required by app.py)
daily_summary = pd.DataFrame({
    'Date': [run_date],
    'sharpe_gross': [sharpe_ratio],
    'sharpe_net': [net_sharpe],
    'win_rate': [win_rate],
    'max_drawdown': [max_dd],
    'long_signals': [(data['long_signal']==1).sum()],
    'turnover': [turnover.mean()]
})

out_path = os.path.join(runs_path, f"trading_results_{run_date}.csv")
daily_summary.to_csv(out_path, index=False)

print(f"✅ Saved daily results to: {out_path}")

✅ Saved daily results to: C:\Users\myoge\Contacts\Regression Based Project\runs\trading_results_2026-01-05.csv


### Final Summary about the Project

#### Phase1: Data & Prediction Power
- 19 Alpha Factors → 2729 trading days (2015-2025)
> Target: Next-day SPY return (mean +0.0005/day)
- Model Performance:
> Ridge Regression: R² = 0.187 (Baseline)
>
> XGBoost (Tuned): R² = 0.431 CV → **0.535 OOS**

#### Phase2: Hyper Parameter Optimization
- GridSearchCV: 48 fits across 16 parameter combinations
- Best XGBoost: n_estimators=100, max_depth=4, lr=0.05
- CV R²: 0.397 → OOS Test R²: 0.535

#### Phase3: Trading Logic Evolution
- Original: Naive signals → Sharpe -0.80 (100% turnover)
- v1.0: Long-only + thresholds → Sharpe +0.418 (27 trades/year)
- Key Innovation: 0.3× prediction std threshold = SWEET SPOT