# Using RL to predict Stock Prices

### Basic Data
Using only the basic Open, Min, Max and Close to predict the prices

### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import wandb

In [2]:
df = pd.read_csv("AAPL.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.100178,0.100614,0.100178,0.100178,469033600,0.0,0.0
1,1980-12-15,0.095388,0.095388,0.094952,0.094952,175884800,0.0,0.0
2,1980-12-16,0.088418,0.088418,0.087983,0.087983,105728000,0.0,0.0
3,1980-12-17,0.09016,0.090596,0.09016,0.09016,86441600,0.0,0.0
4,1980-12-18,0.092774,0.09321,0.092774,0.092774,73449600,0.0,0.0


In [3]:
df = df.drop(['Date', 'Volume', 'Dividends', 'Stock Splits'], axis=1)

In [4]:
df.head()

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.09016,0.090596,0.09016,0.09016
4,0.092774,0.09321,0.092774,0.092774


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10483 entries, 0 to 10482
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    10483 non-null  float64
 1   High    10483 non-null  float64
 2   Low     10483 non-null  float64
 3   Close   10483 non-null  float64
dtypes: float64(4)
memory usage: 327.7 KB


In [6]:
df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)

  df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)


In [7]:
df.head()

Unnamed: 0,Open,High,Low,Close,remove
0,0.100178,0.100614,0.100178,0.100178,False
1,0.095388,0.095388,0.094952,0.094952,False
2,0.088418,0.088418,0.087983,0.087983,False
3,0.09016,0.090596,0.09016,0.09016,False
4,0.092774,0.09321,0.092774,0.092774,False


In [8]:
df = df.query("remove == False").reset_index(drop=True)
df.drop(['remove'], axis=1, inplace=True)
df.head()

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.09016,0.090596,0.09016,0.09016
4,0.092774,0.09321,0.092774,0.092774


In [9]:
df

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.090160,0.090596,0.090160,0.090160
4,0.092774,0.093210,0.092774,0.092774
...,...,...,...,...
10450,141.350006,144.119995,141.080002,142.919998
10451,143.289993,146.550003,143.279999,146.350006
10452,145.259995,147.550003,145.000000,147.039993
10453,145.669998,146.639999,143.779999,144.869995


In [10]:
# Normalizing the dataset
df['High'] = (df['High'] - df['Open']) /df['Open']
df['Low'] = (df['Low'] - df['Open']) /df['Open']
df['Close'] = (df['Close'] - df['Open']) /df['Open']

In [11]:
df

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.004348,0.000000,0.000000
1,0.095388,0.000000,-0.004566,-0.004566
2,0.088418,0.000000,-0.004926,-0.004926
3,0.090160,0.004831,0.000000,0.000000
4,0.092774,0.004694,0.000000,0.000000
...,...,...,...,...
10450,141.350006,0.019597,-0.001910,0.011107
10451,143.289993,0.022751,-0.000070,0.021355
10452,145.259995,0.015765,-0.001790,0.012254
10453,145.669998,0.006659,-0.012975,-0.005492


### Trading Environment

In [12]:
class AAPL_env():
    def __init__(self, data, obs_bars = 10, test = False, commission_perc=0.1):
        self.data = data
        self.obs_bars = obs_bars
        self.have_position = False
        self.open_price = 0
        self.test = test
        self.commission_perc = commission_perc
        if test == False:
            self.curr_step = np.random.choice(self.data.High.shape[0] - self.obs_bars*10) + self.obs_bars
        else:
            self.curr_step = self.obs_bars
        
        self.state = self.data[self.curr_step - self.obs_bars : self.curr_step]
    
    def step(self, action):
        reward = 0.0
        done = False
        
        # Handle special "do_nothing" action for initialization
        if action == "do_nothing":
            self.curr_step = self.obs_bars
            self.state = self.data.iloc[self.curr_step - self.obs_bars : self.curr_step]
            return self._get_state(), reward, done
        
        # Current prices (as in your original step method)
        relative_close = self.state["Close"].iloc[-1]
        open_price = self.state["Open"].iloc[-1]
        close = open_price * (1 + relative_close)
        
        # Handle actions (buy, sell, close, hold)
        if action == "buy" and self.have_position == 0:
            self.have_position = 1
            self.open_price = close
            reward -= self.commission_perc
        elif action == "sell" and self.have_position == 0:
            self.have_position = -1
            self.open_price = close
            reward -= self.commission_perc
        elif action == "close" and self.have_position != 0:
            if self.have_position == 1:
                reward += 100.0 * (close - self.open_price) / self.open_price - self.commission_perc
            elif self.have_position == -1:
                reward += 100.0 * (self.open_price - close) / self.open_price - self.commission_perc
            self.have_position = 0
            self.open_price = 0.0
            if not self.test:
                done = True
        # 'hold' or invalid actions: do nothing
        
        # Advance step
        self.curr_step += 1
        self.state = self.data.iloc[self.curr_step - self.obs_bars : self.curr_step]
        
        # Check if episode is done
        if self.curr_step >= len(self.data) - 1:
            done = True
        
        return self._get_state(), reward, done

    def _get_state(self):
        """Helper method to construct the state array."""
        state = np.zeros((5, self.obs_bars), dtype=np.float32)
        state[0] = self.state["High"].to_list()
        state[1] = self.state["Low"].to_list()
        state[2] = self.state["Close"].to_list()
        state[3] = [self.have_position] * self.obs_bars
        if self.have_position != 0:
            relative_close = self.state["Close"].iloc[-1]
            open_price = self.state["Open"].iloc[-1]
            close = open_price * (1 + relative_close)
            if self.have_position == 1:
                state[4] = [(close - self.open_price) / self.open_price] * self.obs_bars
            elif self.have_position == -1:
                state[4] = [(self.open_price - close) / self.open_price] * self.obs_bars
        else:
            state[4] = [0.0] * self.obs_bars
        return state

In [13]:
actions = {
    0: "do_nothing",
    1: "buy",
    2: "close"
}

In [14]:
AAPL = AAPL_env(data=df, test=False, obs_bars=50)

In [15]:
state, reward, done = AAPL.step("do_nothing")

In [16]:
state.shape

(5, 50)

### DL Architecture

In [17]:
class DuelingConv1DQNet(nn.Module):
    def __init__(self, input_depth_length, obs_bars, output_shape):
        super(DuelingConv1DQNet, self).__init__()
        
        # Convolutional layers for price data (High, Low, Close)
        self.price_conv = nn.Sequential(
            nn.Conv1d(3, 128, kernel_size=5, padding=2),  # in_channels=3
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 128, kernel_size=5, padding=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Flatten()
        )
        
        # Dynamically compute the flattened size
        self.flattened_size = 128 * obs_bars + 2
        
        # State value stream
        self.state_value = nn.Sequential(
            nn.Linear(self.flattened_size, 512),  # +2 for position and profit/loss
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        
        # Advantage stream
        self.advantage = nn.Sequential(
            nn.Linear(self.flattened_size, 512),
            nn.ReLU(),
            nn.Linear(512, output_shape)
        )

    def forward(self, x):
        price_x = x[:, :3, :]  # Extract price channels: High, Low, Close
        info_x = x[:, 3:, -1]  # Extract position and profit/loss from last time step
        
        # Process price data through convolutional layers
        price_features = self.price_conv(price_x)
        
        # Concatenate with position and profit/loss
        features = torch.cat([price_features, info_x], dim=1)
        
        # Compute state value and advantage
        state_val = self.state_value(features)
        advantage = self.advantage(features)
        
        # Combine using Dueling DQN formula
        q_values = state_val + advantage - advantage.mean(dim=1, keepdim=True)
        return q_values

### Training the bot

In [18]:
from collections import deque
import copy

In [20]:
def preprocess_state(state, add_noise=False):
    if add_noise:
        state += np.random.normal(0, 0.01, size=state.shape)
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)  # (1, 5, 50)
    return state_tensor

def get_action(q_values, num_actions, epsilon):
    if np.random.random() < epsilon:
        # Exploration: choose a random action
        return np.random.randint(0, num_actions)
    else:
        # Exploitation: choose the action with the highest Q-value
        return q_values.argmax().item()

def get_batch_for_nsteps_dqn(replay, batch_size, nsteps, device, gamma=0.99):
    if len(replay) < batch_size + nsteps:
        raise ValueError("Not enough experiences in replay buffer")

    max_idx = len(replay) - nsteps
    indices = np.random.randint(0, max_idx, size=batch_size)
    
    state1_batch = []
    action1_batch = []
    nsteps_next_state_batch = []
    nsteps_reward_batch = []
    nsteps_done_batch = []
    
    for idx in indices:
        state, action, reward, next_state, done = replay[idx]
        state = state.squeeze(0)
        state1_batch.append(state.cpu().numpy())
        action1_batch.append(action)
        
        nstep_reward = reward
        nstep_done = done
        nstep_next_state = next_state
        
        for t in range(1, nsteps):
            if nstep_done:
                break
            next_transition = replay[idx + t]
            reward_t = next_transition[2]
            nstep_done = next_transition[4]
            nstep_next_state = next_transition[3]
            nstep_reward += (gamma ** t) * reward_t
        
        nstep_next_state = nstep_next_state.squeeze(0)
        nsteps_next_state_batch.append(nstep_next_state.cpu().numpy())
        nsteps_reward_batch.append(nstep_reward)
        nsteps_done_batch.append(nstep_done)
    
    state1_batch = torch.tensor(np.array(state1_batch), dtype=torch.float32).to(device)
    action1_batch = torch.tensor(action1_batch, dtype=torch.long).to(device)
    nsteps_next_state_batch = torch.tensor(np.array(nsteps_next_state_batch), dtype=torch.float32).to(device)
    nsteps_reward_batch = torch.tensor(nsteps_reward_batch, dtype=torch.float32).to(device)
    nsteps_done_batch = torch.tensor(nsteps_done_batch, dtype=torch.float32).to(device)
    
    # Log batch-level metrics to W&B
    wandb.log({
        "batch_avg_reward": nsteps_reward_batch.mean().item(),
        "batch_done_ratio": nsteps_done_batch.mean().item()
    })
    
    return state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Hyperparameters
memory_size = 100000
batch_size = 64
obs_bars = 50
gamma = 0.99
lr = 0.0001
sync_freq = 1000
output_shape = len(actions)  # Assumes 'actions' is defined (e.g., ["buy", "close", "hold"])
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
nsteps = 2

In [23]:
wandb.init(project="dqn-aapl-trading")  
config = {
    "learning_rate": lr,
    "batch_size": batch_size,
    "memory_size": memory_size,
    "obs_bars": obs_bars,
    "nsteps": nsteps,
    "epsilon_start": epsilon,
    "epsilon_min": epsilon_min,
    "epsilon_decay": epsilon_decay,
    "gamma": gamma,
    "sync_freq": sync_freq,
    "optimizer": "RMSprop",
    "loss_function": "MSELoss"
}
wandb.config.update(config)

wandb: Currently logged in as: anshuman-221793101 (anshuman-221793101-vidyavardhini-s-college-of-engineerin) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [24]:
# Initialize replay buffer
replay = deque(maxlen=memory_size)

# Initialize networks
Agent_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=obs_bars, output_shape=output_shape)
target_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=obs_bars, output_shape=output_shape)
target_NN.load_state_dict(Agent_NN.state_dict())
Agent_NN.to(device)
target_NN.to(device)

DuelingConv1DQNet(
  (price_conv): Sequential(
    (0): Conv1d(3, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (state_value): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
  (advantage): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [None]:
# Initialize parameters
optimizer = torch.optim.RMSprop(Agent_NN.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()
all_rewards_list = []
Q_losses = []
obs_bars = 50
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
k = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move networks to device
Agent_NN.to(device)
target_NN.to(device)

DuelingConv1DQNet(
  (price_conv): Sequential(
    (0): Conv1d(3, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (state_value): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
  (advantage): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [25]:
# Optimizer and loss
optimizer = torch.optim.RMSprop(Agent_NN.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()

# Training metrics
all_rewards_list = []
Q_losses = []
k = 0

In [26]:
# Section 4: Training Loop
while k < 10000:
    # Initialize environment
    game = AAPL_env(df, commission_perc=0.1, obs_bars=obs_bars)
    state, _, _ = game.step("do_nothing")
    state1 = preprocess_state(state, add_noise=False)  # Shape: (1, 5, 50)
    status = 1
    episode_rewards = []
    
    while status == 1:
        k += 1
        
        # Action selection
        with torch.no_grad():
            qval = Agent_NN(state1.to(device))
            qval_ = qval.cpu().numpy()
        action = get_action(qval_, output_shape, epsilon)
        action_name = actions[action]
        
        # Environment step
        state2, reward, done = game.step(action_name)
        state2 = preprocess_state(state2, add_noise=False)
        exp = (state1, action, reward, state2, done)
        replay.append(exp)
        episode_rewards.append(reward)
        
        # Update epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Training step
        if len(replay) >= batch_size + nsteps:
            state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch = get_batch_for_nsteps_dqn(
                replay=replay, batch_size=batch_size, nsteps=nsteps, device=device, gamma=gamma
            )
            
            # Compute Q-values
            Q1 = Agent_NN(state1_batch)
            Q1_selected = Q1.gather(dim=1, index=action1_batch.unsqueeze(dim=1)).squeeze()
            
            # Compute target Q-values
            with torch.no_grad():
                Q_next_state = target_NN(nsteps_next_state_batch)
                selected_nodes_for_target_network = Q_next_state.max(dim=1)[1]
                best_Q_next_state = Q_next_state.gather(dim=1, index=selected_nodes_for_target_network.unsqueeze(dim=1)).squeeze()
                Y_batch_target_for_nsteps_don = (nsteps_reward_batch + (1 - nsteps_done_batch) * best_Q_next_state).to(device)
            
            # Compute loss
            loss = loss_fn(Q1_selected, Y_batch_target_for_nsteps_don)
            Q_losses.append(loss.item())
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Update target network
            if k % sync_freq == 0:
                target_NN.load_state_dict(Agent_NN.state_dict())
            
            # Log metrics to W&B
            wandb.log({
                "step": k,
                "epsilon": epsilon,
                "q_loss": loss.item(),
                "mean_reward": np.mean(all_rewards_list) if all_rewards_list else 0
            })
            
            print(f"Step {k} | Epsilon: {epsilon:.4f} | Q-Loss: {loss.item():.4f} | Mean Reward: {np.mean(all_rewards_list) if all_rewards_list else 0:.4f}")
        
        state1 = state2
        status = 1 - done
    
    # End of episode
    episode_reward = np.sum(episode_rewards)
    all_rewards_list.append(episode_reward)
    
    # Log episode-level metrics to W&B
    wandb.log({
        "episode": len(all_rewards_list),
        "episode_reward": episode_reward,
        "mean_episode_reward": np.mean(all_rewards_list)
    })
    
    print(f"Episode {len(all_rewards_list)} | Episode Reward: {episode_reward:.4f} | Mean Episode Reward: {np.mean(all_rewards_list):.4f}")

# Section 5: Save Model and Finish W&B Run
torch.save(Agent_NN.state_dict(), "agent_nn_final.pth")
wandb.save("agent_nn_final.pth")  # Upload model weights to W&B
wandb.finish()

Episode 1 | Episode Reward: -0.2000 | Mean Episode Reward: -0.2000
Episode 2 | Episode Reward: -0.2000 | Mean Episode Reward: -0.2000
Episode 3 | Episode Reward: 11.3791 | Mean Episode Reward: 3.6597
Episode 4 | Episode Reward: 7.6942 | Mean Episode Reward: 4.6683
Episode 5 | Episode Reward: -1.6293 | Mean Episode Reward: 3.4088
Episode 6 | Episode Reward: 6.1154 | Mean Episode Reward: 3.8599
Episode 7 | Episode Reward: -0.2000 | Mean Episode Reward: 3.2799
Episode 8 | Episode Reward: 11.9054 | Mean Episode Reward: 4.3581
Episode 9 | Episode Reward: -0.2000 | Mean Episode Reward: 3.8516
Episode 10 | Episode Reward: 8.7468 | Mean Episode Reward: 4.3412
Episode 11 | Episode Reward: 11.9054 | Mean Episode Reward: 5.0288
Step 66 | Epsilon: 0.7183 | Q-Loss: 17.4469 | Mean Reward: 5.0288
Step 67 | Epsilon: 0.7147 | Q-Loss: 188.0950 | Mean Reward: 5.0288
Step 68 | Epsilon: 0.7112 | Q-Loss: 24.6140 | Mean Reward: 5.0288
Episode 12 | Episode Reward: 3.2153 | Mean Episode Reward: 4.8777
Step 69 

0,1
batch_avg_reward,▄▆▇▇▇██▆█▇▆▇█▇▇▆▆▆▆▃▄▄▄▆▃▃▃▃▃▃▃▃▂▂▁▃▃▂▂▂
batch_done_ratio,▃▅▆▇███▇▇▆▆▇▇▆▆▄▄▄▄▄▄▃▃▃▃▁▃▂▂▂▂▁▂▂▂▁▁▂▁▁
episode,▁▁▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███
episode_reward,▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████████████▁
epsilon,█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
mean_episode_reward,▂▃▁▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇██████
mean_reward,▁▂▂▄▅▇▇█████████████████████████████████
q_loss,█▇▄▄▄▁▂▂▁▂▂▂▁▂▂▁▁▂▁▂▁▁▁▁▁▁▂▂▁▁▂▂▁▁▁▁▁▁▁▁
step,▁▁▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆████

0,1
batch_avg_reward,2.75245
batch_done_ratio,0.39062
episode,1062.0
episode_reward,-0.2
epsilon,0.01
mean_episode_reward,6.80959
mean_reward,6.81619
q_loss,0.77989
step,10371.0
