In [None]:
# Setup and environment configuration
import sys
import os
from dotenv import load_dotenv

# Get relative paths
notebook_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))

# Load environment variables
env_path = os.path.join(notebook_dir, '.env')
load_dotenv(env_path)

# Install FinRL package
%pip install -e {project_root} -q

# Add to path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import finrl
print(f'Using finrl from: {os.path.dirname(finrl.__file__)}')
print(f'Project root: {project_root}')


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32minstalling build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[8 lines of output][0m
  [31m   [0m [0m[31mERROR: Could not find a version that satisfies the requirement poetry-core (from versions: none)[0m[31m
  [31m   [0m [0m[31mERROR: No matching distribution found for poetry-core[0m[31m
  [31m   [0m [0m
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[31mERROR: Failed to build 'file:///Users/ayushraj/Documents/Python/FinRL/FinRL' when installing build dependencies[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Using finrl from: /Users/ayushraj/Documents/Python/FinRL/FinRL/finrl


In [None]:
# Import required libraries for training
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import Tensor

from finrl.config import INDICATORS, TRAINED_MODEL_DIR, RESULTS_DIR
from finrl.main import check_and_make_directories
from finrl.meta.env_stock_trading.env_stocktrading import StockTradingEnv
from finrl.agents.stablebaselines3.models import DRLAgent
from stable_baselines3.common.logger import configure


# Model Training Notebook

This notebook focuses on training reinforcement learning models using preprocessed data.

**Prerequisites:**
- Run `1-Data_Preprocessing.ipynb` first to generate the training data
- Ensure `train_data.csv` exists in the examples directory

**Supported Models:**
- A2C (Advantage Actor-Critic)
- DDPG (Deep Deterministic Policy Gradient)
- PPO (Proximal Policy Optimization)
- TD3 (Twin Delayed Deep Deterministic Policy Gradient)
- SAC (Soft Actor-Critic)


In [None]:
# Load preprocessed training data
TRAIN_DATA_CSV = os.path.join(notebook_dir, os.getenv('TRAIN_DATA_CSV', 'train_data.csv'))

print(f"Loading training data from: {os.path.basename(TRAIN_DATA_CSV)}")
train = pd.read_csv(TRAIN_DATA_CSV)

# Set index properly
train = train.set_index(train.columns[0])
train.index.names = ['']

print(f"✓ Training data loaded")
print(f"  Shape: {train.shape}")
print(f"  Date range: {train['date'].min()} to {train['date'].max()}")
print(f"  Unique tickers: {train['tic'].nunique()}")


# Train

In [None]:
# Create necessary directories
check_and_make_directories([TRAINED_MODEL_DIR])
print(f"Model directory ready: {TRAINED_MODEL_DIR}")

In [4]:
train.tail()

Unnamed: 0,date,open,high,low,close,volume,tic,vixy,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma
,,,,,,,,,,,,,,,,
105689.0,2025-07-30 19:59:00+00:00,152.805,152.88,152.69,152.71,301384.0,PG,41.24,0.024913,152.775511,152.083619,48.713511,170.660308,28.420762,152.415543,152.717332
105689.0,2025-07-30 19:59:00+00:00,266.48,266.63,265.99,266.12,261128.0,UNH,41.24,0.306531,266.547193,264.684837,58.149589,141.097443,19.641133,265.515343,265.169503
105689.0,2025-07-30 19:59:00+00:00,351.37,351.37,350.4,351.03,316816.0,V,41.24,0.231163,351.549909,350.292341,54.396284,49.139917,12.007675,350.72125,350.137593
105689.0,2025-07-30 19:59:00+00:00,42.66,42.66,42.59,42.62,495984.0,VZ,41.24,0.015744,42.661154,42.469776,50.623313,158.254965,12.41328,42.558843,42.605642
105689.0,2025-07-30 19:59:00+00:00,97.66,97.67,97.54,97.63,378002.0,WMT,41.24,0.040713,97.638542,97.255458,56.412751,206.97588,23.275628,97.427107,97.474117


In [5]:
# Fill NaN values in technical indicators
print("Before NaN handling:")
print(train.isna().sum())

# Forward fill first
train = train.ffill()
# Then backward fill any remaining NaNs at the start
train = train.bfill()

print("\nAfter NaN handling:")
print(train.isna().sum())

# Verify no NaN values in the data
assert not train.isna().any().any(), "There should be no NaN values in the training data"

Before NaN handling:
date               0
open               0
high               0
low                0
close              0
volume             0
tic                0
vixy               0
macd               0
boll_ub           30
boll_lb           30
rsi_30            60
cci_30          3941
dx_30             60
close_30_sma       0
close_60_sma       0
dtype: int64

After NaN handling:
date            0
open            0
high            0
low             0
close           0
volume          0
tic             0
vixy            0
macd            0
boll_ub         0
boll_lb         0
rsi_30          0
cci_30          0
dx_30           0
close_30_sma    0
close_60_sma    0
dtype: int64


In [6]:
stock_dimension = len(train.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

Stock Dimension: 30, State Space: 301


In [None]:
buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

# Print environment parameters for debugging
print("Environment Parameters:")
print(f"Stock Dimension: {stock_dimension}")
print(f"State Space: {state_space}")
print(f"Initial Amount: 1000000")
print(f"Hmax: 100")
print(f"Transaction cost: 0.001")

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

# Print indicator list for debugging
print("\nTechnical Indicators:")
print(INDICATORS)

e_train_gym = StockTradingEnv(df=train, **env_kwargs)

# Test environment reset and check for NaN values
initial_state = e_train_gym.reset()
print("\nInitial State Shape:", len(initial_state))
print("First few values of initial state:", initial_state[:5])

# Verify no NaN values in initial state
if isinstance(initial_state, tuple):
    state_array = initial_state[0]  # If initial_state is a tuple (new gym style)
else:
    state_array = initial_state  # If initial_state is just the array (old gym style
    
assert not np.isnan(state_array).any(), "NaN values found in initial state"

Environment Parameters:
Stock Dimension: 30
State Space: 301
Initial Amount: 1000000
Hmax: 100
Transaction cost: 0.001

Technical Indicators:
['macd', 'boll_ub', 'boll_lb', 'rsi_30', 'cci_30', 'dx_30', 'close_30_sma', 'close_60_sma']

Initial State Shape: 2
First few values of initial state: ([1000000, 212.43, 313.4933, 193.59, 232.52, 185.1, 333.33, 258.605, 47.54, 157.21, 99.46, 52.925, 454.1201, 343.96, 213.65, 173.91, 30.71, 146.13, 203.02, 63.99, 254.5638, 102.81, 125.55, 447.1, 75.94, 123.5379, 165.565, 507.55, 262.275, 41.5, 67.87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43, 212.43,

In [8]:
env_train, _ = e_train_gym.get_sb_env()
print(type(env_train))

<class 'stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv'>


In [None]:
agent = DRLAgent(env = env_train)

# Set the corresponding values to 'True' for the algorithms that you want to use
if_using_a2c = False
if_using_ddpg = False
if_using_ppo = True
if_using_td3 = False
if_using_sac = False

In [15]:
#A2C

agent = DRLAgent(env = env_train)
model_a2c = agent.get_model("a2c")

if if_using_a2c:
  # set up logger
  tmp_path = RESULTS_DIR + '/a2c'
  new_logger_a2c = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_a2c.set_logger(new_logger_a2c)

trained_a2c = agent.train_model(model=model_a2c, 
                             tb_log_name='a2c',
                             total_timesteps=1000000) if if_using_a2c else None

trained_a2c.save(TRAINED_MODEL_DIR + "/agent_a2c") if if_using_a2c else None

{'n_steps': 5, 'ent_coef': 0.01, 'learning_rate': 0.0007}
Using mps device
Logging to results/a2c




----------------------------------------
| time/                 |              |
|    fps                | 8            |
|    iterations         | 100          |
|    time_elapsed       | 58           |
|    total_timesteps    | 500          |
| train/                |              |
|    entropy_loss       | -43.5        |
|    explained_variance | 0            |
|    learning_rate      | 0.0007       |
|    n_updates          | 99           |
|    policy_loss        | -2.45        |
|    reward             | -0.009851701 |
|    reward_max         | 0.017617073  |
|    reward_mean        | -0.01569039  |
|    reward_min         | -0.053447187 |
|    std                | 1.03         |
|    value_loss         | 0.00344      |
----------------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 8             |
|    iterations         | 200           |
|    time_elapsed       | 113           |
|    total_

KeyboardInterrupt: 

In [None]:
#DDPG

agent = DRLAgent(env = env_train)
model_ddpg = agent.get_model("ddpg")

if if_using_ddpg:
  # set up logger
  tmp_path = RESULTS_DIR + '/ddpg'
  new_logger_ddpg = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_ddpg.set_logger(new_logger_ddpg)

trained_ddpg = agent.train_model(model=model_ddpg, 
                             tb_log_name='ddpg',
                             total_timesteps=1200000) if if_using_ddpg else None

trained_ddpg.save(TRAINED_MODEL_DIR + "/agent_ddpg") if if_using_ddpg else None

In [166]:
# PPO
import numpy as np

agent = DRLAgent(env = env_train)
PPO_PARAMS = {
    "n_steps": 2048,             # number of steps to collect per environment before updating
    "ent_coef": 0.005,           # entropy bonus coefficient (encourages exploration)
    "learning_rate": 0.0001,     # smaller lr for stable, gradual learning
    "batch_size": 256,           # minibatch size for each gradient update
    "n_epochs": 10,              # number of passes over each batch of data
    "gamma": 0.99,               # discount factor for reward
    "gae_lambda": 0.95,          # bias-variance tradeoff for advantage estimation
    "clip_range": 0.2,           # PPO clipping parameter
    "max_grad_norm": 0.5,        # gradient clipping to prevent exploding gradients
    "vf_coef": 0.4,              # weight for value function loss
    "normalize_advantage": True,  # normalize advantage estimates
}

model_ppo = agent.get_model("ppo", model_kwargs=PPO_PARAMS)

if if_using_ppo:
    try:
        # set up logger
        tmp_path = RESULTS_DIR + '/ppo'
        new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
        # Set new logger
        model_ppo.set_logger(new_logger_ppo)
        
        # Check if model initialized correctly
        print("Model device:", model_ppo.device)
        print("Policy network:", model_ppo.policy)
        
        # Test a single prediction before training
        test_obs = env_train.observation_space.sample()
        with torch.no_grad():
            test_action, _ = model_ppo.predict(test_obs, deterministic=True)
        print("\nTest prediction shape:", test_action.shape)
        print("Test prediction values:", test_action[:5])
        
        trained_ppo = agent.train_model(model=model_ppo, 
                                      tb_log_name='ppo',
                                      total_timesteps=200000)
        
        trained_ppo.save(TRAINED_MODEL_DIR + "/agent_ppo")
        print("Model saved successfully!")
        
    except Exception as e:
        print(f"Error during training: {str(e)}")
        print("\nEnvironment Info:")
        print("Observation Space:", env_train.observation_space)
        print("Action Space:", env_train.action_space)
        raise  # Re-raise the exception for full traceback

{'n_steps': 2048, 'ent_coef': 0.005, 'learning_rate': 0.0001, 'batch_size': 256, 'n_epochs': 10, 'gamma': 0.99, 'gae_lambda': 0.95, 'clip_range': 0.2, 'max_grad_norm': 0.5, 'vf_coef': 0.4, 'normalize_advantage': True}
Using mps device


  device = torch.device("cuda" if torch.cuda.is_available() else "mps" if getattr(torch, "has_mps", False) and torch.backends.mps.is_available() else "cpu"),


Logging to results/ppo
Model device: mps
Policy network: ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=301, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=301, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=30, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

Test prediction shape: (30,)
Test prediction values: [ 0.01005942  0.00

In [13]:
agent = DRLAgent(env = env_train)
TD3_PARAMS = {"batch_size": 10000, 
              "buffer_size": 1000000, 
              "learning_rate": 0.001}

model_td3 = agent.get_model("td3",model_kwargs = TD3_PARAMS)

if if_using_td3:
  # set up logger
  tmp_path = RESULTS_DIR + '/td3'
  new_logger_td3 = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_td3.set_logger(new_logger_td3)

trained_td3 = agent.train_model(model=model_td3, 
                             tb_log_name='td3',
                             total_timesteps=50000) if if_using_td3 else None

trained_td3.save(TRAINED_MODEL_DIR + "/agent_td3") if if_using_td3 else None

{'batch_size': 10000, 'buffer_size': 1000000, 'learning_rate': 0.001}
Using mps device
Logging to results/td3
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_bu

In [12]:
agent = DRLAgent(env = env_train)
SAC_PARAMS = {
    "batch_size": 2048,
    "buffer_size": 100000,
    "learning_rate": 0.0001,
    "learning_starts": 100,
    "ent_coef": "auto_0.1",
}

model_sac = agent.get_model("sac",model_kwargs = SAC_PARAMS)

if if_using_sac:
  # set up logger
  tmp_path = RESULTS_DIR + '/sac'
  new_logger_sac = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_sac.set_logger(new_logger_sac)

trained_sac = agent.train_model(model=model_sac, 
                             tb_log_name='sac',
                             total_timesteps=70000) if if_using_sac else None

trained_sac.save(TRAINED_MODEL_DIR + "/agent_sac") if if_using_sac else None

{'batch_size': 2048, 'buffer_size': 100000, 'learning_rate': 0.0001, 'learning_starts': 100, 'ent_coef': 'auto_0.1'}
Using mps device
Logging to results/sac
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Error: 'rollout_buffer'
Logging Err

In [None]:
# Inspect the trained model architecture
import torch
import json
from stable_baselines3 import PPO

# Use relative path to trained model
MODEL_PATH = os.path.join(TRAINED_MODEL_DIR, 'agent_ppo.zip')

print("🔍 INSPECTING TRAINED MODEL ARCHITECTURE")
print("=" * 60)

try:
    # Method 1: Load the full model and inspect policy network
    print("Loading full model...")
    model = PPO.load(MODEL_PATH)
    
    print(f"✅ Model loaded successfully!")
    print(f"📊 Policy class: {type(model.policy).__name__}")
    
    # Extract network architecture from policy
    policy = model.policy
    
    print("\n🧠 NEURAL NETWORK ARCHITECTURE:")
    print("-" * 40)
    
    # Check if it has mlp_extractor (common in ActorCritic policies)
    if hasattr(policy, 'mlp_extractor'):
        mlp = policy.mlp_extractor
        
        # Policy (actor) network layers
        if hasattr(mlp, 'policy_net'):
            print("🎯 Actor Network (Policy):")
            for i, layer in enumerate(mlp.policy_net):
                if hasattr(layer, 'in_features') and hasattr(layer, 'out_features'):
                    print(f"   Layer {i}: {layer.in_features} → {layer.out_features}")
        
        # Value network layers  
        if hasattr(mlp, 'value_net'):
            print("\n💰 Value Network (Critic):")
            for i, layer in enumerate(mlp.value_net):
                if hasattr(layer, 'in_features') and hasattr(layer, 'out_features'):
                    print(f"   Layer {i}: {layer.in_features} → {layer.out_features}")
    
    # Direct policy network inspection
    print(f"\n📋 POLICY NETWORK DETAILS:")
    print(f"   Input dimension: {model.observation_space.shape[0]}")
    print(f"   Output dimension: {model.action_space.shape[0]}")
    
    # Print the full network structure
    print(f"\n🏗️ FULL NETWORK STRUCTURE:")
    print(policy)
    
except Exception as e:
    print(f"❌ Error loading model: {e}")
    print("Trying alternative method...")
    
    try:
        # Method 2: Direct PyTorch load of policy.pth
        print("\n🔧 Loading policy.pth directly...")
        policy_path = f"{MODEL_PATH}/policy.pth"
        state_dict = torch.load(policy_path, map_location='cpu')
        
        print("✅ Policy state dict loaded!")
        print("\n📊 LAYER INFORMATION FROM STATE DICT:")
        print("-" * 50)
        
        # Extract layer dimensions from state dict keys
        for key, tensor in state_dict.items():
            if 'weight' in key and len(tensor.shape) == 2:
                print(f"{key}: {tensor.shape[1]} → {tensor.shape[0]}")
                
        # Look for specific network layers
        actor_layers = [k for k in state_dict.keys() if 'mlp_extractor.policy_net' in k and 'weight' in k]
        value_layers = [k for k in state_dict.keys() if 'mlp_extractor.value_net' in k and 'weight' in k]
        
        if actor_layers:
            print(f"\n🎯 ACTOR NETWORK DIMENSIONS:")
            for layer in sorted(actor_layers):
                shape = state_dict[layer].shape
                print(f"   {layer}: {shape[1]} → {shape[0]}")
                
        if value_layers:
            print(f"\n💰 VALUE NETWORK DIMENSIONS:")
            for layer in sorted(value_layers):
                shape = state_dict[layer].shape
                print(f"   {layer}: {shape[1]} → {shape[0]}")
        
    except Exception as e2:
        print(f"❌ Error with direct load: {e2}")

print(f"\n" + "=" * 60)
print("✅ Network inspection complete!")


🔍 INSPECTING TRAINED MODEL ARCHITECTURE
Loading full model...
✅ Model loaded successfully!
📊 Policy class: ActorCriticPolicy

🧠 NEURAL NETWORK ARCHITECTURE:
----------------------------------------
🎯 Actor Network (Policy):
   Layer 0: 301 → 64
   Layer 2: 64 → 64

💰 Value Network (Critic):
   Layer 0: 301 → 64
   Layer 2: 64 → 64

📋 POLICY NETWORK DETAILS:
   Input dimension: 301
   Output dimension: 30

🏗️ FULL NETWORK STRUCTURE:
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=301, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential