# Using RL to predict Stock Prices

### Basic Data
Using only the basic Open, Min, Max and Close to predict the prices

### Data Preprocessing

In [57]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import wandb

In [58]:
df = pd.read_csv("AAPL.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.100178,0.100614,0.100178,0.100178,469033600,0.0,0.0
1,1980-12-15,0.095388,0.095388,0.094952,0.094952,175884800,0.0,0.0
2,1980-12-16,0.088418,0.088418,0.087983,0.087983,105728000,0.0,0.0
3,1980-12-17,0.09016,0.090596,0.09016,0.09016,86441600,0.0,0.0
4,1980-12-18,0.092774,0.09321,0.092774,0.092774,73449600,0.0,0.0


In [59]:
df = df.drop(['Date', 'Dividends', 'Stock Splits'], axis=1)

In [60]:
df.head()

Unnamed: 0,Open,High,Low,Close,Volume
0,0.100178,0.100614,0.100178,0.100178,469033600
1,0.095388,0.095388,0.094952,0.094952,175884800
2,0.088418,0.088418,0.087983,0.087983,105728000
3,0.09016,0.090596,0.09016,0.09016,86441600
4,0.092774,0.09321,0.092774,0.092774,73449600


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10483 entries, 0 to 10482
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    10483 non-null  float64
 1   High    10483 non-null  float64
 2   Low     10483 non-null  float64
 3   Close   10483 non-null  float64
 4   Volume  10483 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 409.6 KB


In [62]:
df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)

  df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)


In [63]:
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,remove
0,0.100178,0.100614,0.100178,0.100178,469033600,False
1,0.095388,0.095388,0.094952,0.094952,175884800,False
2,0.088418,0.088418,0.087983,0.087983,105728000,False
3,0.09016,0.090596,0.09016,0.09016,86441600,False
4,0.092774,0.09321,0.092774,0.092774,73449600,False


In [64]:
df = df.query("remove == False").reset_index(drop=True)
df.drop(['remove'], axis=1, inplace=True)
df.head()

Unnamed: 0,Open,High,Low,Close,Volume
0,0.100178,0.100614,0.100178,0.100178,469033600
1,0.095388,0.095388,0.094952,0.094952,175884800
2,0.088418,0.088418,0.087983,0.087983,105728000
3,0.09016,0.090596,0.09016,0.09016,86441600
4,0.092774,0.09321,0.092774,0.092774,73449600


In [65]:
df

Unnamed: 0,Open,High,Low,Close,Volume
0,0.100178,0.100614,0.100178,0.100178,469033600
1,0.095388,0.095388,0.094952,0.094952,175884800
2,0.088418,0.088418,0.087983,0.087983,105728000
3,0.090160,0.090596,0.090160,0.090160,86441600
4,0.092774,0.093210,0.092774,0.092774,73449600
...,...,...,...,...,...
10478,141.350006,144.119995,141.080002,142.919998,74064300
10479,143.289993,146.550003,143.279999,146.350006,66253700
10480,145.259995,147.550003,145.000000,147.039993,64493200
10481,145.669998,146.639999,143.779999,144.869995,63141600


In [66]:
# Normalizing the dataset
df['High'] = (df['High'] - df['Open']) /df['Open']
df['Low'] = (df['Low'] - df['Open']) /df['Open']
df['Close'] = (df['Close'] - df['Open']) /df['Open']

In [67]:
df

Unnamed: 0,Open,High,Low,Close,Volume
0,0.100178,0.004348,0.000000,0.000000,469033600
1,0.095388,0.000000,-0.004566,-0.004566,175884800
2,0.088418,0.000000,-0.004926,-0.004926,105728000
3,0.090160,0.004831,0.000000,0.000000,86441600
4,0.092774,0.004694,0.000000,0.000000,73449600
...,...,...,...,...,...
10478,141.350006,0.019597,-0.001910,0.011107,74064300
10479,143.289993,0.022751,-0.000070,0.021355,66253700
10480,145.259995,0.015765,-0.001790,0.012254,64493200
10481,145.669998,0.006659,-0.012975,-0.005492,63141600


### Trading Environment

In [72]:
def preprocess_state(state, df, current_idx, obs_bars=50, add_noise=False):
    # Ensure we have enough data
    start_idx = max(0, current_idx - obs_bars + 1)
    end_idx = current_idx + 1
    window = df.iloc[start_idx:end_idx]

    # Pad with zeros if not enough data (e.g., at the start of the dataset)
    if len(window) < obs_bars:
        padding = obs_bars - len(window)
        padded_data = np.zeros((padding, len(window.columns)))
        padded_data = pd.DataFrame(padded_data, columns=window.columns)
        window = pd.concat([padded_data, window], axis=0)

    # Extract price and volume data
    high = window['High'].values
    low = window['Low'].values
    close = window['Close'].values
    volume = window['Volume'].values

    # Normalize price data (min-max scaling over the window)
    price_min = min(high.min(), low.min(), close.min())
    price_max = max(high.max(), low.max(), close.max())
    if price_max != price_min:
        high = (high - price_min) / (price_max - price_min)
        low = (low - price_min) / (price_max - price_min)
        close = (close - price_min) / (price_max - price_min)
    else:
        high = np.zeros_like(high)
        low = np.zeros_like(low)
        close = np.zeros_like(close)

    # Normalize volume (log scale to handle large variations)
    volume = np.log1p(volume)  # log(1 + volume) to handle zeros
    volume_min = volume.min()
    volume_max = volume.max()
    if volume_max != volume_min:
        volume = (volume - volume_min) / (volume_max - volume_min)
    else:
        volume = np.zeros_like(volume)

    # Compute returns (percentage change in Close price)
    returns = np.zeros(obs_bars)
    returns[1:] = (close[1:] - close[:-1]) / close[:-1]
    returns[0] = 0  # No return for the first bar
    returns = np.clip(returns, -1, 1)  # Clip to avoid extreme values

    # Compute volatility (standard deviation of returns over the window)
    volatility = np.zeros(obs_bars)
    for i in range(obs_bars):
        if i < 1:
            volatility[i] = 0
        else:
            window_returns = returns[max(0, i-14):i+1]  # Use last 14 bars for volatility
            volatility[i] = np.std(window_returns) if len(window_returns) > 1 else 0
    volatility = np.nan_to_num(volatility, 0)  # Replace NaNs with 0
    vol_max = volatility.max()
    vol_min = volatility.min()
    if vol_max != vol_min:
        volatility = (volatility - vol_min) / (vol_max - vol_min)
    else:
        volatility = np.zeros_like(volatility)

    # Extract position and profit (last timestep only, as in your original setup)
    position = state[3, -1]  # Assuming position is in channel 3 (adjust if needed)
    profit = state[4, -1]    # Assuming profit is in channel 4 (adjust if needed)

    # Create position and profit channels (constant across all timesteps)
    position_channel = np.full(obs_bars, position)
    profit_channel = np.full(obs_bars, profit)

    # Stack all channels into the state
    new_state = np.stack([
        high,
        low,
        close,
        volume,
        returns,
        volatility,
        position_channel,
        profit_channel
    ], axis=0)  # Shape: (8, obs_bars)

    # Add noise if specified
    if add_noise:
        new_state += np.random.normal(0, 0.01, size=new_state.shape)

    # Convert to PyTorch tensor
    new_state = torch.from_numpy(new_state).float()
    if len(new_state.shape) == 2:  # Add batch dimension if needed
        new_state = new_state.unsqueeze(0)  # Shape: (1, 8, obs_bars)

    return new_state

def get_action(q_values, num_actions, epsilon):
    if np.random.random() < epsilon:
        # Exploration: choose a random action
        return np.random.randint(0, num_actions)
    else:
        # Exploitation: choose the action with the highest Q-value
        return q_values.argmax().item()

def get_batch_for_nsteps_dqn(replay, batch_size, nsteps, device, gamma=0.99):
    if len(replay) < batch_size + nsteps:
        raise ValueError("Not enough experiences in replay buffer")

    max_idx = len(replay) - nsteps
    indices = np.random.randint(0, max_idx, size=batch_size)
    
    state1_batch = []
    action1_batch = []
    nsteps_next_state_batch = []
    nsteps_reward_batch = []
    nsteps_done_batch = []
    
    for idx in indices:
        state, action, reward, next_state, done = replay[idx]
        state = state.squeeze(0)
        state1_batch.append(state.cpu().numpy())
        action1_batch.append(action)
        
        nstep_reward = reward
        nstep_done = done
        nstep_next_state = next_state
        
        for t in range(1, nsteps):
            if nstep_done:
                break
            next_transition = replay[idx + t]
            reward_t = next_transition[2]
            nstep_done = next_transition[4]
            nstep_next_state = next_transition[3]
            nstep_reward += (gamma ** t) * reward_t
        
        nstep_next_state = nstep_next_state.squeeze(0)
        nsteps_next_state_batch.append(nstep_next_state.cpu().numpy())
        nsteps_reward_batch.append(nstep_reward)
        nsteps_done_batch.append(nstep_done)
    
    state1_batch = torch.tensor(np.array(state1_batch), dtype=torch.float32).to(device)
    action1_batch = torch.tensor(action1_batch, dtype=torch.long).to(device)
    nsteps_next_state_batch = torch.tensor(np.array(nsteps_next_state_batch), dtype=torch.float32).to(device)
    nsteps_reward_batch = torch.tensor(nsteps_reward_batch, dtype=torch.float32).to(device)
    nsteps_done_batch = torch.tensor(nsteps_done_batch, dtype=torch.float32).to(device)
    
    # Log batch-level metrics to W&B
    wandb.log({
        "batch_avg_reward": nsteps_reward_batch.mean().item(),
        "batch_done_ratio": nsteps_done_batch.mean().item()
    })
    
    return state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch

In [73]:
class AAPL_env:
    def __init__(self, df, commission_perc=0.1, obs_bars=50, max_steps=1000):
        """
        Environment for trading AAPL stock using RL.
        
        Args:
            df (pd.DataFrame): Historical data with columns High, Low, Close, Volume.
            commission_perc (float): Commission percentage per trade (default: 0.1%).
            obs_bars (int): Number of historical bars in the observation (default: 50).
            max_steps (int): Maximum steps per episode (default: 1000).
        """
        self.df = df.reset_index(drop=True)  # Ensure index starts at 0
        self.commission_perc = commission_perc
        self.obs_bars = obs_bars
        self.max_steps = max_steps
        self.current_idx = 0
        self.position = 0  # 1: Long, -1: Short, 0: None
        self.entry_price = 0  # Price at which position was opened
        self.profit = 0  # Running profit/loss
        self.step_count = 0
        self.max_idx = len(df) - 1
        self.actions = {"buy": 0, "close": 1, "hold": 2}  # Action mapping

    def reset(self):
        """
        Reset the environment to the initial state.
        
        Returns:
            torch.Tensor: Initial state.
            dict: Info (optional).
        """
        self.current_idx = self.obs_bars - 1  # Start after enough bars for observation
        self.position = 0
        self.entry_price = 0
        self.profit = 0
        self.step_count = 0
        state = torch.zeros((2, self.obs_bars))  # Initial position/profit channels
        state = preprocess_state(state, self.df, self.current_idx, self.obs_bars)
        return state, {}

    def step(self, action):
        """
        Take an action in the environment.
        
        Args:
            action (str): Action to take ("buy", "close", "hold", or "do_nothing").
        
        Returns:
            torch.Tensor: Next state.
            float: Reward.
            bool: Done flag.
            dict: Info (optional).
        """
        if action not in self.actions and action != "do_nothing":
            raise ValueError(f"Invalid action: {action}")

        self.step_count += 1
        self.current_idx += 1

        # Check if episode should end
        done = False
        if self.current_idx >= self.max_idx or self.step_count >= self.max_steps:
            done = True
            if self.position != 0:  # Force close position at end
                action = "close"

        # Get current price (use Close price for simplicity)
        current_price = self.df.iloc[self.current_idx]['Close']

        # Initialize reward
        reward = 0

        # Handle actions
        if action == "do_nothing":
            # Used during reset or initial steps; no reward
            pass

        elif action == "buy" and self.position == 0:
            # Open a long position
            self.position = 1
            self.entry_price = current_price
            commission = current_price * self.commission_perc
            reward -= commission  # Deduct commission as penalty

        elif action == "buy" and self.position == -1:
            # Close short position and open long
            profit = self.entry_price - current_price  # Short profit: sell high, buy low
            commission = current_price * self.commission_perc
            self.profit += profit - commission
            reward = profit - commission
            self.position = 1
            self.entry_price = current_price
            reward -= commission  # Additional commission for new position

        elif action == "close" and self.position != 0:
            # Close existing position
            if self.position == 1:
                profit = current_price - self.entry_price  # Long profit: buy low, sell high
            else:  # position == -1
                profit = self.entry_price - current_price  # Short profit
            commission = current_price * self.commission_perc
            self.profit += profit - commission
            reward = profit - commission
            self.position = 0
            self.entry_price = 0

        elif action == "hold" and self.position != 0:
            # Calculate unrealized profit/loss as reward
            if self.position == 1:
                unrealized_profit = current_price - self.entry_price
            else:  # position == -1
                unrealized_profit = self.entry_price - current_price
            reward = unrealized_profit * 0.01  # Scale down to encourage closing positions
            # Small holding penalty to discourage excessive holding
            reward -= 0.01

        # Update state
        state = torch.zeros((2, self.obs_bars))  # Placeholder for position/profit
        state[0, -1] = self.position
        state[1, -1] = self.profit
        state = preprocess_state(state, self.df, self.current_idx, self.obs_bars)

        # Info for debugging
        info = {
            "current_idx": self.current_idx,
            "position": self.position,
            "profit": self.profit,
            "current_price": current_price
        }

        return state, reward, done, info

    def get_current_price(self):
        """
        Get the current Close price.
        
        Returns:
            float: Current Close price.
        """
        return self.df.iloc[self.current_idx]['Close']

In [74]:
actions = {
    0: "do_nothing",
    1: "buy",
    2: "close"
}

In [15]:
AAPL = AAPL_env(data=df, test=False, obs_bars=50)

In [16]:
state, reward, done = AAPL.step("do_nothing")

In [17]:
state.shape

(5, 50)

### DL Architecture

In [71]:
class DuelingConv1DQNet(nn.Module):
    def __init__(self, input_depth_length, obs_bars, output_shape):
        super(DuelingConv1DQNet, self).__init__()
        
        # Convolutional layers for price data (High, Low, Close)
        self.price_conv = nn.Sequential(
            nn.Conv1d(input_depth_length, 128, kernel_size=5, padding=2),  # in_channels=3
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 128, kernel_size=5, padding=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Flatten()
        )
        
        # Dynamically compute the flattened size
        self.flattened_size = 128 * obs_bars + 2
        
        # State value stream
        self.state_value = nn.Sequential(
            nn.Linear(self.flattened_size, 512),  # +2 for position and profit/loss
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        
        # Advantage stream
        self.advantage = nn.Sequential(
            nn.Linear(self.flattened_size, 512),
            nn.ReLU(),
            nn.Linear(512, output_shape)
        )

    def forward(self, x):
        price_x = x[:, :6, :]  # Extract price channels: High, Low, Close
        info_x = x[:, 6:, -1]  # Extract position and profit/loss from last time step
        
        # Process price data through convolutional layers
        price_features = self.price_conv(price_x)
        
        # Concatenate with position and profit/loss
        features = torch.cat([price_features, info_x], dim=1)
        
        # Compute state value and advantage
        state_val = self.state_value(features)
        advantage = self.advantage(features)
        
        # Combine using Dueling DQN formula
        q_values = state_val + advantage - advantage.mean(dim=1, keepdim=True)
        return q_values

### Training the bot

In [None]:
from collections import deque
import copy

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Hyperparameters
memory_size = 100000
batch_size = 256
obs_bars = 50
gamma = 0.99
lr = 0.0001
sync_freq = 1000
output_shape = len(actions)  # Assumes 'actions' is defined (e.g., ["buy", "close", "hold"])
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.99995
nsteps = 3
total_steps = 50000

In [52]:
wandb.init(
    project="dqn-aapl-trading",
    name="run_50k_steps",
    notes="Second run with 50,000 steps to compare with the first run (1M steps, -310$ reward). Started at 06:49 PM IST on Sunday, June 08, 2025."
)  
config = {
    "learning_rate": lr,
    "batch_size": batch_size,
    "memory_size": memory_size,
    "obs_bars": obs_bars,
    "nsteps": nsteps,
    "epochs": total_steps,
    "epsilon_start": epsilon,
    "epsilon_min": epsilon_min,
    "epsilon_decay": epsilon_decay,
    "gamma": gamma,
    "sync_freq": sync_freq,
    "optimizer": "RMSprop",
    "loss_function": "MSELoss"
}
wandb.config.update(config)

0,1
batch_avg_reward,▅▇█▆▅▃▄▅▄▂▄▂▃▃▇▂▄▂▃▂▁▃▅▆▂▄▃▄▆█▂▆▅█▆▅▆▇▇▆
batch_done_ratio,▃▃▅▄▅▃▃▂▅▁▄▃▅▆▆▅▄▄▅▄▄▅▄▅▇▄▆▃▅▅▆█▅▆▄▆▇▅▅▆
episode,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇███
episode_reward,▁█▅▆▆█▅▆▅▆▆▆▄▄▅▇█▄▁▅▅▅▆▆▇▇█▄▆▅▄▅▇▅██▆▇▅▅
epsilon,███▇▇▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁
mean_episode_reward,▁███▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
mean_reward,█▆▅▃▃▃▂▂▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄
q_loss,▁▃▄▄▄▃▆▆▅▄▃▅▃▃▃█▅▂▂▄▄▆▄▄▂▆▆▃▅▅▄▆▇▄▃▃▃▇▄▆
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇█████

0,1
batch_avg_reward,1.06719
batch_done_ratio,0.5
episode,506.0
episode_reward,3.21527
epsilon,0.86118
mean_episode_reward,2.87705
mean_reward,2.87705
q_loss,10.96239
step,2989.0


In [53]:
# Initialize replay buffer
replay = deque(maxlen=memory_size)

# Initialize networks
Agent_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=obs_bars, output_shape=output_shape)
target_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=obs_bars, output_shape=output_shape)
target_NN.load_state_dict(Agent_NN.state_dict())
Agent_NN.to(device)
target_NN.to(device)

DuelingConv1DQNet(
  (price_conv): Sequential(
    (0): Conv1d(3, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (state_value): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
  (advantage): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [54]:
# Optimizer and loss
optimizer = torch.optim.RMSprop(Agent_NN.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()

# Training metrics
all_rewards_list = []
Q_losses = []
k = 0

In [55]:
# Section 4: Training Loop
while k < total_steps:
    # Initialize environment
    game = AAPL_env(df, commission_perc=0.1, obs_bars=obs_bars)
    state, _, _ = game.step("do_nothing")
    state1 = preprocess_state(state, add_noise=False)  # Shape: (1, 5, 50)
    status = 1
    episode_rewards = []
    
    while status == 1:
        k += 1
        
        # Action selection
        with torch.no_grad():
            qval = Agent_NN(state1.to(device))
            qval_ = qval.cpu().numpy()
        action = get_action(qval_, output_shape, epsilon)
        action_name = actions[action]
        
        # Environment step
        state2, reward, done = game.step(action_name)
        state2 = preprocess_state(state2, add_noise=False)
        exp = (state1, action, reward, state2, done)
        replay.append(exp)
        episode_rewards.append(reward)
        
        # Update epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Training step
        if len(replay) >= batch_size + nsteps:
            state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch = get_batch_for_nsteps_dqn(
                replay=replay, batch_size=batch_size, nsteps=nsteps, device=device, gamma=gamma
            )
            
            # Compute Q-values
            Q1 = Agent_NN(state1_batch)
            Q1_selected = Q1.gather(dim=1, index=action1_batch.unsqueeze(dim=1)).squeeze()
            
            # Compute target Q-values
            with torch.no_grad():
                Q_next_state = target_NN(nsteps_next_state_batch)
                selected_nodes_for_target_network = Q_next_state.max(dim=1)[1]
                best_Q_next_state = Q_next_state.gather(dim=1, index=selected_nodes_for_target_network.unsqueeze(dim=1)).squeeze()
                Y_batch_target_for_nsteps_don = (nsteps_reward_batch + (1 - nsteps_done_batch) * best_Q_next_state).to(device)
            
            # Compute loss
            loss = loss_fn(Q1_selected, Y_batch_target_for_nsteps_don)
            Q_losses.append(loss.item())
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Update target network
            if k % sync_freq == 0:
                target_NN.load_state_dict(Agent_NN.state_dict())
            
            # Log metrics to W&B
            wandb.log({
                "step": k,
                "epsilon": epsilon,
                "q_loss": loss.item(),
                "mean_reward": np.mean(all_rewards_list) if all_rewards_list else 0
            })
            
            print(f"Step {k} | Epsilon: {epsilon:.4f} | Q-Loss: {loss.item():.4f} | Mean Reward: {np.mean(all_rewards_list) if all_rewards_list else 0:.4f}")
        
        state1 = state2
        status = 1 - done
    
    # End of episode
    episode_reward = np.sum(episode_rewards)
    all_rewards_list.append(episode_reward)
    
    # Log episode-level metrics to W&B
    wandb.log({
        "episode": len(all_rewards_list),
        "episode_reward": episode_reward,
        "mean_episode_reward": np.mean(all_rewards_list)
    })
    
    print(f"Episode {len(all_rewards_list)} | Episode Reward: {episode_reward:.4f} | Mean Episode Reward: {np.mean(all_rewards_list):.4f}")

# Section 5: Save Model and Finish W&B Run
torch.save(Agent_NN.state_dict(), "agent_nn_50k.pth")
wandb.save("agent_nn_50k.pth")  # Upload model weights to W&B
wandb.finish()

Episode 1 | Episode Reward: 11.3791 | Mean Episode Reward: 11.3791


Episode 2 | Episode Reward: 2.7701 | Mean Episode Reward: 7.0746
Episode 3 | Episode Reward: 0.2717 | Mean Episode Reward: 4.8070
Episode 4 | Episode Reward: 3.7030 | Mean Episode Reward: 4.5310
Episode 5 | Episode Reward: 4.7511 | Mean Episode Reward: 4.5750
Episode 6 | Episode Reward: 9.2730 | Mean Episode Reward: 5.3580
Episode 7 | Episode Reward: 0.7755 | Mean Episode Reward: 4.7034
Episode 8 | Episode Reward: 6.1154 | Mean Episode Reward: 4.8799
Episode 9 | Episode Reward: 11.3791 | Mean Episode Reward: 5.6020
Episode 10 | Episode Reward: 6.1154 | Mean Episode Reward: 5.6533
Episode 11 | Episode Reward: 11.3791 | Mean Episode Reward: 6.1739
Episode 12 | Episode Reward: 7.6942 | Mean Episode Reward: 6.3006
Episode 13 | Episode Reward: 4.7511 | Mean Episode Reward: 6.1814
Episode 14 | Episode Reward: 4.7511 | Mean Episode Reward: 6.0792
Episode 15 | Episode Reward: -0.6807 | Mean Episode Reward: 5.6286
Episode 16 | Episode Reward: -4.4259 | Mean Episode Reward: 5.0001
Episode 17 | 

0,1
batch_avg_reward,▂▄▅▇▆▇▆▆▇▇▇▆█▇▆▆▆▆▆▇▆▅▄▄▅▆▅▃▃▄▄▃▂▅▃▂▁▂▂▂
batch_done_ratio,▄▄▄▅▄▆█▆▆▇▇▇▇▇▇▇▇▇▇▇▆▅▆▆▅▅▅▅▄▃▂▄▄▄▂▃▁▂▁▂
episode,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇█
episode_reward,▅▅▃▅▅▅▅▅▅▅▅▅▅▄▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▄▅▅▅▅▅▆▁█▄▄
epsilon,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_episode_reward,▁▁▂▂▃▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
mean_reward,▁▂▃▄▄▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████
q_loss,▂▃▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▄█▄▅▅▅
step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇███

0,1
batch_avg_reward,4.21386
batch_done_ratio,0.39453
episode,6801.0
episode_reward,4.49452
epsilon,0.1
mean_episode_reward,10.93168
mean_reward,10.93263
q_loss,32.4506
step,50008.0


### Inference

In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize Model
model = DuelingConv1DQNet(input_depth_length=5, obs_bars=obs_bars, output_shape=output_shape)
model.to(device)

# Load Model Weights
model.load_state_dict(torch.load("agent_nn_50k.pth", map_location=device))
print("Model weights loaded successfully!")

# Set Model to Evaluation Mode
model.eval()

# Initialize Environment
game = AAPL_env(df, commission_perc=0.1, obs_bars=obs_bars)
state, _, _ = game.step("do_nothing")
state = preprocess_state(state, add_noise=False)

# Run Inference Over Entire Dataset
total_reward = 0
total_commission = 0
trade_count = 0
step = 0
has_position = False

max_steps = len(df) - obs_bars

# Current date and time
current_time = "06:37 PM IST on Sunday, June 08, 2025"
print(f"Starting evaluation at {current_time}")

while step < max_steps:
    with torch.no_grad():
        state = state.to(device)
        q_values = model(state)
        q_values = q_values.cpu().numpy()

    action = np.argmax(q_values[0])
    action_name = actions[action]

    # Enforce buy-close logic
    if has_position and action_name == "buy":
        action_name = "close"
        action = actions.index("close")
    elif not has_position and action_name == "close":
        action_name = "buy"
        action = actions.index("buy")

    # Update position status and count trades
    if action_name == "buy":
        has_position = True
        trade_count += 1
    elif action_name == "close":
        has_position = False
        trade_count += 1

    # Take action in the environment
    next_state, reward, done = game.step(action_name)
    next_state = preprocess_state(next_state, add_noise=False)

    # Update state and rewards
    state = next_state
    total_reward += reward
    step += 1

    print(f"Step {step} | Action: {action_name} | Reward: {reward:.4f} | Total Reward: {total_reward:.4f}")

# Estimate commission impact (assuming reward includes commission)
# Commission = 0.1% per trade, assuming average trade value from state
# For simplicity, assume trade value is based on the last Close price in the state
last_close_price = state[0, 2, -1].item()  # Last Close price in the state
average_trade_value = last_close_price * 100  # Assume 100 shares per trade
commission_per_trade = 0.001 * average_trade_value  # 0.1% commission
total_commission = trade_count * commission_per_trade

print(f"Finished evaluation at {current_time}")
print(f"Total Steps: {step} | Total Trades: {trade_count} | Estimated Total Commission: ${total_commission:.2f}")
print(f"Total Reward (after commission): ${total_reward:.2f}")
print(f"Total Reward (before commission, estimated): ${(total_reward + total_commission):.2f}")

Model weights loaded successfully!
Starting evaluation at 06:37 PM IST on Sunday, June 08, 2025
Step 1 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 2 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 3 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 4 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 5 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 6 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 7 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 8 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 9 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 10 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 11 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 12 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 13 | Action: hold | Reward: 0.0000 | Total Reward: 0.0000
Step 14 | Action: buy | Reward: -0.1000 | Total Reward: -0.1000
Step 15 | Action: hold | Rewar