## Section 1: Environment & Dependencies

In [None]:
import sys
sys.path.insert(0, '/Users/ajaiupadhyaya/Documents/Models')

from models.ml import RLReadyEnvironment
from core.backtesting import SimpleMLPredictor

import pandas as pd
import numpy as np
import yfinance as yf
import warnings
warnings.filterwarnings('ignore')

# Confirm the core stack imported and point at the optional RL dependency.
print("Core libraries loaded.")
print("\nNote: To use RL agents, install stable-baselines3:")
print("  pip install stable-baselines3")

# stable-baselines3 is optional. RL_AVAILABLE records whether its import
# succeeded so the agent-training cells can be skipped when it is missing.
try:
    from stable_baselines3 import DQN, PPO, A2C
    from stable_baselines3.common.vec_env import DummyVecEnv
except ImportError:
    RL_AVAILABLE = False
    print("\n⚠ stable-baselines3 not available (install for full RL examples)")
else:
    RL_AVAILABLE = True
    print("\n✓ stable-baselines3 is installed!")

## Section 2: Load and Prepare Data

In [None]:
# Download two years of price history for a single ticker.
print("Downloading market data...")
ticker = 'SPY'
df = yf.download(ticker, period='2y', progress=False)

# Fail early with a clear message: an empty frame would otherwise only show up
# as an IndexError when the date range is printed below.
if df.empty:
    raise ValueError(
        f"No market data returned for {ticker!r}; "
        "check the ticker symbol and the network connection."
    )

# Newer yfinance releases can return (field, ticker) MultiIndex columns even
# for one ticker, which turns df['Close'] into a DataFrame and breaks the
# scalar formatting below. Collapse to the field level so both layouts work.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

# Split by position into 60% train / 20% validation / 20% test.
# Positional slicing keeps rows in their downloaded order (no shuffling).
n = len(df)
train_end = int(n * 0.6)
val_end = int(n * 0.8)

df_train = df.iloc[:train_end]
df_val = df.iloc[train_end:val_end]
df_test = df.iloc[val_end:]

print(f"Training data: {len(df_train)} days ({df_train.index[0].date()} to {df_train.index[-1].date()})")
print(f"Validation data: {len(df_val)} days")
print(f"Test data: {len(df_test)} days")
print(f"\nTotal price change: {(df['Close'].iloc[-1]/df['Close'].iloc[0]-1)*100:.2f}%")

## Section 3: Create RL Environment

In [None]:
# Build the environment over the training split.
print("Creating RL trading environment...")
train_env = RLReadyEnvironment(df_train, initial_capital=100_000)

# Report the observation and action spaces the environment exposes.
print("\nEnvironment Details:")
print(f"  State shape: {train_env.observation_space.shape}")
print(f"  Action space: {train_env.action_space.n} actions")
print("  Action meanings: 0=hold, 1=long, 2=short, 3=close")

# Smoke test: reset and look at the first observation.
print("\nTesting environment...")
state = train_env.reset()
print(f"  Initial state shape: {state.shape}")
print(f"  Initial state (first 5 values): {state[:5]}")

# Advance exactly one step with action 1 (long) and show the outcome.
state, reward, done, info = train_env.step(1)
print(f"  After 1 step: reward={reward:.6f}, done={done}")

## Section 4: Random Agent Baseline

In [None]:
# Random agent for comparison
print("Running random agent baseline...\n")

def run_agent(env, agent_func, episodes=1):
    """Run a policy function in an environment and summarise the results.

    Args:
        env: Environment object providing ``reset()`` (returns the initial
            state), ``step(action)`` (returns ``(state, reward, done, info)``)
            and ``get_performance()``.
        agent_func: Callable mapping the current state to an action.
        episodes: Number of complete episodes to play.

    Returns:
        dict with keys ``'returns'`` (list of per-episode reward sums),
        ``'mean_return'`` and ``'std_return'`` (NumPy mean and standard
        deviation of that list) and ``'final_performance'`` (whatever
        ``env.get_performance()`` reports after the last episode).
    """
    returns = []

    for _ in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        # Always take at least one step, then continue until the environment
        # signals the end of the episode.
        while not done:
            state, reward, done, _info = env.step(agent_func(state))
            total_reward += reward

        returns.append(total_reward)

    return {
        'returns': returns,
        'mean_return': np.mean(returns),
        'std_return': np.std(returns),
        'final_performance': env.get_performance()
    }

# Random agent
def random_agent(state):
    """Pick one of the four action codes (0-3) at random.

    ``state`` is accepted so the function fits the ``agent_func`` slot of
    ``run_agent``, but it is not consulted.
    """
    action_codes = [0, 1, 2, 3]
    return np.random.choice(action_codes)

# Play one episode with the random policy on the training environment and
# keep the environment's own performance summary.
random_results = run_agent(train_env, random_agent, episodes=1)
random_perf = random_results['final_performance']

# Report the headline numbers for the random baseline.
print("Random Agent Performance:")
print(f"  Mean Episode Return: {random_results['mean_return']:.2f}")
print(f"  Final Capital: ${random_perf['final_capital']:,.2f}")
print(f"  Total Return: {random_perf['total_return']*100:.2f}%")
print(f"  Trades Made: {random_perf['trades']}")

In [None]:
# Smart baseline cell: a rule-based agent (see smart_agent below)
print("Running smart baseline (technical indicators)...\n")

# Generate predictor output over the training split.
# NOTE(review): `signals` is not read by smart_agent or by any other code in
# the visible part of this notebook. Confirm whether a later cell uses it.
predictor = SimpleMLPredictor(lookback_window=20)
signals = predictor.predict(df_train)

def smart_agent(state):
    """Choose an action from a threshold rule on ``state[-2]``.

    Returns 1 (long) when the second-to-last state value is above 0.5,
    2 (short) when it is below -0.5, and 0 (hold) otherwise.

    NOTE(review): the rule treats ``state[-2]`` as a momentum-like signal.
    That layout is assumed here, not checked against the environment.
    """
    indicator = state[-2]

    if indicator > 0.5:
        return 1  # long
    if indicator < -0.5:
        return 2  # short
    return 0  # hold

# Play one episode with the rule-based policy on the training environment and
# keep the environment's own performance summary.
smart_results = run_agent(train_env, smart_agent, episodes=1)
smart_perf = smart_results['final_performance']

# Report the headline numbers for the rule-based baseline.
print("Smart Baseline Performance:")
print(f"  Final Capital: ${smart_perf['final_capital']:,.2f}")
print(f"  Total Return: {smart_perf['total_return']*100:.2f}%")
print(f"  Trades Made: {smart_perf['trades']}")

## Section 6: DQN Agent Training (If stable-baselines3 available)

In [None]:
if RL_AVAILABLE:
    print("Training DQN agent...\n")

    # Wrap a single environment instance in DummyVecEnv for training.
    train_env_vec = DummyVecEnv([lambda: RLReadyEnvironment(df_train, initial_capital=100000)])

    # Create DQN agent
    dqn_agent = DQN(
        'MlpPolicy',
        train_env_vec,
        learning_rate=0.001,
        buffer_size=10000,
        learning_starts=500,
        exploration_fraction=0.1,
        exploration_initial_eps=1.0,
        exploration_final_eps=0.01,
        verbose=0
    )

    # Train for 10,000 steps
    print("Training DQN for 10,000 steps...")
    dqn_agent.learn(total_timesteps=10000)

    print("Training complete!\n")

    # Evaluate the deterministic policy on a fresh environment built from the
    # same training split, so the reported numbers are in-sample.
    print("Evaluating DQN on training data...")
    eval_env = RLReadyEnvironment(df_train, initial_capital=100000)

    state = eval_env.reset()
    total_reward = 0
    done = False

    while not done:
        action, _ = dqn_agent.predict(state, deterministic=True)
        # predict() hands the action back as a NumPy array. Convert it to a
        # plain int so the environment gets the same kind of value as when it
        # is stepped with a literal action code.
        action = int(np.asarray(action).item())
        state, reward, done, info = eval_env.step(action)
        total_reward += reward

    dqn_perf = eval_env.get_performance()
    print("\nDQN Performance:")
    print(f"  Final Capital: ${dqn_perf['final_capital']:,.2f}")
    print(f"  Total Return: {dqn_perf['total_return']*100:.2f}%")
    print(f"  Trades Made: {dqn_perf['trades']}")

else:
    # Optional dependency missing: explain how to enable this section.
    print("DQN training requires stable-baselines3 installation.")
    print("Install with: pip install stable-baselines3")

## Section 7: PPO Agent Training (If stable-baselines3 available)

In [None]:
if RL_AVAILABLE:
    print("Training PPO agent...\n")

    # Wrap a single environment instance in DummyVecEnv for training.
    train_env_vec = DummyVecEnv([lambda: RLReadyEnvironment(df_train, initial_capital=100000)])

    # Create PPO agent
    ppo_agent = PPO(
        'MlpPolicy',
        train_env_vec,
        learning_rate=0.0003,
        n_steps=128,
        batch_size=32,
        n_epochs=10,
        gae_lambda=0.95,
        verbose=0
    )

    # Train for 10,000 steps
    print("Training PPO for 10,000 steps...")
    ppo_agent.learn(total_timesteps=10000)

    print("Training complete!\n")

    # Evaluate the deterministic policy on a fresh environment built from the
    # same training split, so the reported numbers are in-sample.
    print("Evaluating PPO on training data...")
    eval_env = RLReadyEnvironment(df_train, initial_capital=100000)

    state = eval_env.reset()
    total_reward = 0
    done = False

    while not done:
        action, _ = ppo_agent.predict(state, deterministic=True)
        # predict() hands the action back as a NumPy array. Convert it to a
        # plain int so the environment gets the same kind of value as when it
        # is stepped with a literal action code.
        action = int(np.asarray(action).item())
        state, reward, done, info = eval_env.step(action)
        total_reward += reward

    ppo_perf = eval_env.get_performance()
    print("\nPPO Performance:")
    print(f"  Final Capital: ${ppo_perf['final_capital']:,.2f}")
    print(f"  Total Return: {ppo_perf['total_return']*100:.2f}%")
    print(f"  Trades Made: {ppo_perf['trades']}")

else:
    # Optional dependency missing: this section is skipped.
    print("PPO training requires stable-baselines3 installation.")

## Section 8: Cross-Validation on Test Data