## Section 1: Data Preparation

In [None]:
import sys
sys.path.insert(0, '/Users/ajaiupadhyaya/Documents/Models')

from core.backtesting import (
    SimpleMLPredictor,
    BacktestEngine,
    WalkForwardAnalysis
)
from models.ml import (
    EnsemblePredictor,
    RLReadyEnvironment
)

import pandas as pd
import numpy as np
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

# Download the daily price history that every later cell backtests against.
print("Downloading data for backtesting...")
ticker = 'SPY'
df = yf.download(ticker, period='2y', progress=False)

# Fail fast with a readable message: with an empty frame the df.index[0]
# lookup below would otherwise die with a bare IndexError.
# NOTE(review): yfinance appears to return an empty frame on a failed
# request rather than raising - confirm for the installed version.
if df.empty:
    raise RuntimeError(
        f"No price data returned for {ticker}; check the ticker symbol and network connection"
    )

# Some yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker. Keep only the field level so df['Close'] is a Series and
# scalar lookups such as df['Close'].iloc[-1] stay scalar.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

print(f"Data shape: {df.shape}")
print(f"Period: {df.index[0].date()} to {df.index[-1].date()}")
print("\nData summary:")
print(df[['Close', 'Volume']].describe())

## Section 2: Simple Rules-Based Predictor

In [None]:
# Build the rules-based predictor (lookback_window=20) and generate one
# signal series from the full price frame.
simple_predictor = SimpleMLPredictor(lookback_window=20)
signals = simple_predictor.predict(df)

# Summary statistics of the raw signal values.
print("Signal Statistics:")
for stat_label, stat_fn in (
    ("Mean signal", np.mean),
    ("Std dev", np.std),
    ("Min", np.min),
    ("Max", np.max),
):
    print(f"  {stat_label}: {stat_fn(signals):.4f}")

# Bucket the signals with the same +/-0.3 cut-off the backtest cells use.
is_buy = signals > 0.3
is_sell = signals < -0.3
is_neutral = np.abs(signals) <= 0.3
print(f"\nBuy signals (>0.3): {sum(is_buy)}")
print(f"Sell signals (<-0.3): {sum(is_sell)}")
print(f"Neutral: {sum(is_neutral)}")

# Print the five most recent signals, oldest first, next to their dates.
print("\nRecent signals (last 5 days):")
for days_back in range(5, 0, -1):
    print(f"  {df.index[-days_back].date()}: {signals[-days_back]:>6.3f}")

## Section 3: Backtest Simple Strategy

In [None]:
# Backtest the rules-based signals over the whole downloaded history and
# report the figures found in the returned results dict.
engine = BacktestEngine(initial_capital=100000, commission=0.001)
results = engine.run_backtest(df, signals, signal_threshold=0.3, position_size=0.1)

# win_rate is multiplied by 100 so it prints as a percentage.
win_rate_pct = results['win_rate'] * 100

# Assemble the whole report first, then emit it with a single print call.
report_lines = [
    "Simple Rules-Based Strategy Results:",
    "\nPerformance:",
    "  Initial Capital: $100,000",
    f"  Final Equity: ${results['final_equity']:,.2f}",
    f"  Total Return: {results['total_return_pct']:.2f}%",
    "\nTrade Statistics:",
    f"  Total Trades: {results['num_trades']}",
    f"  Winning Trades: {results['winning_trades']}",
    f"  Losing Trades: {results['losing_trades']}",
    f"  Win Rate: {win_rate_pct:.1f}%",
    "\nRisk Metrics:",
    f"  Total PnL: ${results['total_pnl']:,.2f}",
    f"  Avg PnL per Trade: ${results['avg_pnl']:,.2f}",
    f"  Sharpe Ratio: {results['sharpe_ratio']:.2f}",
    f"  Max Drawdown: {results['max_drawdown_pct']:.2f}%",
]
print("\n".join(report_lines))

## Section 4: Ensemble ML Predictor

In [None]:
# Chronological 70/30 split: the first 70% of rows train the ensemble and the
# remaining rows are held out for testing.
split_idx = int(len(df) * 0.7)
train_data, test_data = df.iloc[:split_idx], df.iloc[split_idx:]

for period_label, frame in (("Training", train_data), ("Testing", test_data)):
    first_day = frame.index[0].date()
    last_day = frame.index[-1].date()
    print(f"{period_label} period: {first_day} to {last_day}")

# Fit the ensemble on the training slice only.
print("\nTraining Ensemble ML Model...")
ensemble = EnsemblePredictor(lookback_window=20)
ensemble.train(train_data)
print("Training complete!")

# Generate signals for the held-out slice.
ensemble_signals = ensemble.predict(test_data)

# Summarise the held-out signals with the same +/-0.3 cut-off used elsewhere.
signal_mean = np.mean(ensemble_signals)
signal_std = np.std(ensemble_signals)
buy_count = sum(ensemble_signals > 0.3)
sell_count = sum(ensemble_signals < -0.3)

print("\nEnsemble Signal Statistics:")
print(f"  Mean signal: {signal_mean:.4f}")
print(f"  Std dev: {signal_std:.4f}")
print(f"  Buy signals: {buy_count}")
print(f"  Sell signals: {sell_count}")

## Section 5: Backtest Ensemble Strategy

In [None]:
# Backtest the ensemble signals on the held-out test window and compare them
# with the rules-based strategy over that same window.
ensemble_engine = BacktestEngine(initial_capital=100000, commission=0.001)
ensemble_results = ensemble_engine.run_backtest(test_data, ensemble_signals, signal_threshold=0.3, position_size=0.1)

print("Ensemble ML Strategy Results (Out-of-Sample):")
print("\nPerformance:")
print(f"  Final Equity: ${ensemble_results['final_equity']:,.2f}")
print(f"  Total Return: {ensemble_results['total_return_pct']:.2f}%")

print("\nTrade Statistics:")
print(f"  Total Trades: {ensemble_results['num_trades']}")
print(f"  Win Rate: {ensemble_results['win_rate']*100:.1f}%")

print("\nRisk Metrics:")
print(f"  Sharpe Ratio: {ensemble_results['sharpe_ratio']:.2f}")
print(f"  Max Drawdown: {ensemble_results['max_drawdown_pct']:.2f}%")

# `results` covers the full two-year history, while `ensemble_results` covers
# only the test window, so putting them side by side is not a like-for-like
# comparison. Re-run the rules-based strategy on test_data with identical
# engine settings, mirroring how the ensemble signals were produced.
simple_oos_signals = simple_predictor.predict(test_data)
simple_oos_engine = BacktestEngine(initial_capital=100000, commission=0.001)
simple_oos_results = simple_oos_engine.run_backtest(
    test_data, simple_oos_signals, signal_threshold=0.3, position_size=0.1
)

# Compare strategies over the same out-of-sample window.
print("\n" + "=" * 57)
print("STRATEGY COMPARISON (OUT-OF-SAMPLE)")
print("=" * 57)
print(f"{'Metric':<25} {'Simple Rules':<15} {'Ensemble ML':<15}")
print("-" * 57)

# Values are pre-formatted so every row shares the header's column widths.
comparison_rows = [
    ('Return %', 'total_return_pct', 1, '.2f', '%'),
    ('Win Rate', 'win_rate', 100, '.1f', '%'),
    ('Sharpe Ratio', 'sharpe_ratio', 1, '.2f', ''),
    ('Max Drawdown', 'max_drawdown_pct', 1, '.2f', '%'),
]
for metric_label, key, scale, spec, suffix in comparison_rows:
    simple_cell = format(simple_oos_results[key] * scale, spec) + suffix
    ensemble_cell = format(ensemble_results[key] * scale, spec) + suffix
    print(f"{metric_label:<25} {simple_cell:<15} {ensemble_cell:<15}")

## Section 6: Walk-Forward Analysis (Robust Testing)

In [None]:
# Walk-forward analysis: hand the predictor class to WalkForwardAnalysis and
# summarise the per-period results it returns.
print("Running Walk-Forward Analysis...")
print("(This performs 252-day training + 63-day testing rolling windows)\n")

wf_analyzer = WalkForwardAnalysis(df, in_sample_period=252, out_sample_period=63)
wf_results = wf_analyzer.run(SimpleMLPredictor)

print(f"Completed {len(wf_results)} walk-forward periods\n")

# Collect the per-period metrics.
returns = [r['total_return_pct'] for r in wf_results]
sharpe_ratios = [r['sharpe_ratio'] for r in wf_results]
win_rates = [r['win_rate'] * 100 for r in wf_results]

if len(wf_results) == 0:
    # np.min / np.max raise ValueError on an empty sequence and np.mean
    # yields nan, so report the situation instead of crashing.
    print("No walk-forward periods were produced; nothing to summarise.")
    print(f"(Data has {len(df)} rows; one window needs at least {252 + 63}.)")
else:
    print("Walk-Forward Performance Summary:")
    print(f"  Mean Return: {np.mean(returns):.2f}%")
    print(f"  Std Dev: {np.std(returns):.2f}%")
    print(f"  Min Return: {np.min(returns):.2f}%")
    print(f"  Max Return: {np.max(returns):.2f}%")

    print(f"\n  Mean Sharpe: {np.mean(sharpe_ratios):.2f}")
    print(f"  Mean Win Rate: {np.mean(win_rates):.1f}%")

    # Only the first five periods are listed to keep the output short.
    print("\nIndividual Period Returns:")
    for period_no, result in enumerate(wf_results[:5], start=1):
        start_date = result['period'][0].date()
        end_date = result['period'][1].date()
        print(f"  Period {period_no} ({start_date} to {end_date}): {result['total_return_pct']:>7.2f}%")

## Section 7: RL Environment Setup

In [None]:
# Smoke-test the RL environment by driving it with a random agent until the
# environment reports the episode is done.
print("Setting up RL-Ready Trading Environment...")
rl_env = RLReadyEnvironment(test_data, initial_capital=100000)

print("\nSimulating random trading agent...")
state = rl_env.reset()

# Action ids and their sampling weights (0=hold, 1=long, 2=short, 3=close).
action_ids = [0, 1, 2, 3]
action_probs = [0.5, 0.2, 0.2, 0.1]

total_reward = 0
num_steps = 0
done = False

while not done:
    action = np.random.choice(action_ids, p=action_probs)
    state, reward, done, info = rl_env.step(action)
    total_reward += reward
    num_steps += 1

perf = rl_env.get_performance()

summary_lines = [
    "\nRL Environment Performance (Random Agent):",
    f"  Final Capital: ${perf['final_capital']:,.2f}",
    f"  Total Return: {perf['total_return']*100:.2f}%",
    f"  Trades Made: {perf['trades']}",
    f"  Total Steps: {num_steps}",
    "\nRL Environment is ready for stable-baselines3 training!",
    # Dimension is read from the last state returned by step().
    f"State space dimension: {state.shape[0]}",
    "Action space: 4 actions (hold, long, short, close)",
]
print("\n".join(summary_lines))

## Section 8: Feature Importance Analysis

In [None]:
# Rank features by the average of the two ensemble members' importances.
print("Feature Importance from Ensemble Models:\n")

# Display labels for the importance vector.
# NOTE(review): presumably these follow the column order of the feature
# matrix built inside EnsemblePredictor - verify against that class.
feature_names = [
    'Returns', 'Price/SMA5', 'Price/SMA20', 'Volatility',
    'Volume Ratio', 'High-Low Range', 'RSI'
]

gb_importance = ensemble.gb_model.feature_importances_
rf_importance = ensemble.rf_model.feature_importances_

# Average importance across the two models.
avg_importance = (gb_importance + rf_importance) / 2

# A hard-coded label list raises IndexError (or silently mislabels) when the
# models expose a different number of features, so fall back to generic
# labels rather than print wrong names.
if len(feature_names) != len(avg_importance):
    print(
        f"Warning: {len(feature_names)} feature names defined but the models "
        f"report {len(avg_importance)} features; using generic labels.\n"
    )
    feature_names = [f"Feature {i}" for i in range(len(avg_importance))]

# Sort by importance, largest first.
sorted_idx = np.argsort(avg_importance)[::-1]

print(f"{'Feature':<20} {'Importance':<12}")
print("-" * 32)

# Show at most the seven most important features.
top_n = 7
for idx in sorted_idx[:top_n]:
    print(f"{feature_names[idx]:<20} {avg_importance[idx]:>8.4f}")

## Section 9: Hyperparameter Sensitivity

In [None]:
# Sweep the entry threshold while holding every other backtest setting fixed.
# NOTE(review): the "optimal" threshold is chosen on test_data, the same
# window used to evaluate the ensemble elsewhere in this notebook.
print("Signal Threshold Sensitivity Analysis:\n")


def _backtest_at_threshold(level):
    """Backtest the ensemble signals on test_data at one signal threshold."""
    backtester = BacktestEngine(initial_capital=100000, commission=0.001)
    return backtester.run_backtest(test_data, ensemble_signals, signal_threshold=level, position_size=0.1)


thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
threshold_results = []

for threshold in thresholds:
    outcome = _backtest_at_threshold(threshold)
    threshold_results.append(outcome)

    ret_pct = outcome['total_return_pct']
    sharpe = outcome['sharpe_ratio']
    win_pct = outcome['win_rate'] * 100
    print(f"Threshold {threshold:.1f}: Return {ret_pct:>7.2f}%, Sharpe {sharpe:>5.2f}, Win Rate {win_pct:>5.1f}%")

# Pick the threshold whose run has the highest Sharpe ratio.
sharpe_by_run = [run['sharpe_ratio'] for run in threshold_results]
best_sharpe_idx = np.argmax(sharpe_by_run)
print(f"\nOptimal threshold (by Sharpe): {thresholds[best_sharpe_idx]:.1f}")

## Section 10: Position Size Analysis

In [None]:
# Sweep position_size with the signal threshold fixed at 0.3.
print("Position Size Sensitivity Analysis:\n")

position_sizes = [0.05, 0.10, 0.15, 0.20, 0.25]
size_results = []

for size in position_sizes:
    backtester = BacktestEngine(initial_capital=100000, commission=0.001)
    outcome = backtester.run_backtest(test_data, ensemble_signals, signal_threshold=0.3, position_size=size)
    size_results.append(outcome)

    ret_pct = outcome['total_return_pct']
    drawdown_pct = outcome['max_drawdown_pct']
    sharpe = outcome['sharpe_ratio']
    print(f"Size {size:.2f}: Return {ret_pct:>7.2f}%, Max DD {drawdown_pct:>6.2f}%, Sharpe {sharpe:>5.2f}")

# Score each run by return per unit of absolute max drawdown. A run with
# zero drawdown scores 0, which avoids dividing by zero.
risk_adj_returns = []
for run in size_results:
    drawdown = run['max_drawdown_pct']
    if drawdown != 0:
        risk_adj_returns.append(run['total_return_pct'] / abs(drawdown))
    else:
        risk_adj_returns.append(0)

best_size_idx = np.argmax(risk_adj_returns)
print(f"\nOptimal position size (by risk-adj return): {position_sizes[best_size_idx]:.2f}")

## Section 11: Comparison to Buy & Hold

In [None]:
# Compare both strategies with buy & hold over the out-of-sample window.

# Some yfinance releases return MultiIndex columns, in which case
# test_data['Close'] is a one-column DataFrame. Reduce it to a Series so the
# first and last closes are plain scalars that the format specs below accept.
close_prices = test_data['Close']
if isinstance(close_prices, pd.DataFrame):
    close_prices = close_prices.iloc[:, 0]
first_close = float(close_prices.iloc[0])
last_close = float(close_prices.iloc[-1])

# Buy & hold return between the first and last close of the test window.
buy_hold_return = (last_close - first_close) / first_close
buy_hold_pct = buy_hold_return * 100

# `results` was computed over the full history, so it does not belong in a
# table labelled out-of-sample. Re-run the rules-based strategy on test_data
# with the same engine settings used for the ensemble backtest.
simple_oos_signals = simple_predictor.predict(test_data)
simple_oos_engine = BacktestEngine(initial_capital=100000, commission=0.001)
simple_oos_results = simple_oos_engine.run_backtest(
    test_data, simple_oos_signals, signal_threshold=0.3, position_size=0.1
)

# NOTE(review): the strategies are run with position_size=0.1 while buy & hold
# is a plain price return - presumably not equally invested; confirm how
# BacktestEngine sizes positions before reading too much into raw returns.
print("Strategy Comparison (Out-of-Sample Period):\n")
print(f"{'Strategy':<30} {'Return':<12} {'Sharpe':<10} {'Max DD':<10}")
print("-" * 62)
print(f"{'Buy & Hold':<30} {buy_hold_pct:>6.2f}% {'N/A':>9} {'N/A':>9}")
print(f"{'Simple Rules-Based':<30} {simple_oos_results['total_return_pct']:>6.2f}% {simple_oos_results['sharpe_ratio']:>9.2f} {simple_oos_results['max_drawdown_pct']:>8.2f}%")
print(f"{'Ensemble ML':<30} {ensemble_results['total_return_pct']:>6.2f}% {ensemble_results['sharpe_ratio']:>9.2f} {ensemble_results['max_drawdown_pct']:>8.2f}%")

# Excess return of the ensemble over buy & hold, in percentage points.
excess_return = ensemble_results['total_return_pct'] - buy_hold_pct
print(f"\nEnsemble ML excess return vs buy & hold: {excess_return:>6.2f}%")

## Section 12: Forward-Looking Pipeline

In [None]:
# Print a static summary of what the notebook covered and what comes next.
# The three section markers were mojibake (UTF-8 bytes decoded as cp1252);
# they are restored to the intended check mark, arrow, and chart symbols.
print("=" * 70)
print("ML/AI TRADING PIPELINE SUMMARY")
print("=" * 70)

print("\n✓ COMPLETED COMPONENTS:")
print("  1. Simple rules-based predictor (fast, interpretable)")
print("  2. Ensemble ML (Random Forest + Gradient Boosting)")
print("  3. Full backtesting engine with commission modeling")
print("  4. Walk-forward analysis (robust out-of-sample validation)")
print("  5. RL-ready environment (OpenAI Gym compatible)")
print("  6. Feature importance analysis")
print("  7. Hyperparameter optimization framework")

print("\n➜ NEXT: Advanced Models")
print("  - LSTM neural networks (in advanced_trading.py)")
print("  - Reinforcement learning (PPO, A3C with stable-baselines3)")
print("  - Attention mechanisms for market regimes")
print("  - Multi-asset portfolio optimization")

print("\n📊 API & DEPLOYMENT READY:")
print("  - FastAPI integration for real-time predictions")
print("  - Model serving with Redis caching")
print("  - Live trading paper trading support")
print("  - Database logging of all trades")

print("\n" + "=" * 70)