# Using RL to predict Stock Prices

### Basic Data
Using only the basic Open, Min, Max and Close to predict the prices

### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

In [2]:
df = pd.read_csv("AAPL.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1980-12-12,0.100178,0.100614,0.100178,0.100178,469033600,0.0,0.0
1,1980-12-15,0.095388,0.095388,0.094952,0.094952,175884800,0.0,0.0
2,1980-12-16,0.088418,0.088418,0.087983,0.087983,105728000,0.0,0.0
3,1980-12-17,0.09016,0.090596,0.09016,0.09016,86441600,0.0,0.0
4,1980-12-18,0.092774,0.09321,0.092774,0.092774,73449600,0.0,0.0


In [3]:
df = df.drop(['Date', 'Volume', 'Dividends', 'Stock Splits'], axis=1)

In [4]:
df.head()

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.09016,0.090596,0.09016,0.09016
4,0.092774,0.09321,0.092774,0.092774


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10483 entries, 0 to 10482
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    10483 non-null  float64
 1   High    10483 non-null  float64
 2   Low     10483 non-null  float64
 3   Close   10483 non-null  float64
dtypes: float64(4)
memory usage: 327.7 KB


In [6]:
df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)

  df['remove'] = df.apply(lambda x: all([abs(i - x[0]) < 1e-8 for i in x[1:]]), axis = 1)


In [7]:
df.head()

Unnamed: 0,Open,High,Low,Close,remove
0,0.100178,0.100614,0.100178,0.100178,False
1,0.095388,0.095388,0.094952,0.094952,False
2,0.088418,0.088418,0.087983,0.087983,False
3,0.09016,0.090596,0.09016,0.09016,False
4,0.092774,0.09321,0.092774,0.092774,False


In [8]:
df = df.query("remove == False").reset_index(drop=True)
df.drop(['remove'], axis=1, inplace=True)
df.head()

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.09016,0.090596,0.09016,0.09016
4,0.092774,0.09321,0.092774,0.092774


In [9]:
df

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.100614,0.100178,0.100178
1,0.095388,0.095388,0.094952,0.094952
2,0.088418,0.088418,0.087983,0.087983
3,0.090160,0.090596,0.090160,0.090160
4,0.092774,0.093210,0.092774,0.092774
...,...,...,...,...
10450,141.350006,144.119995,141.080002,142.919998
10451,143.289993,146.550003,143.279999,146.350006
10452,145.259995,147.550003,145.000000,147.039993
10453,145.669998,146.639999,143.779999,144.869995


In [10]:
# Normalizing the dataset
df['High'] = (df['High'] - df['Open']) /df['Open']
df['Low'] = (df['Low'] - df['Open']) /df['Open']
df['Close'] = (df['Close'] - df['Open']) /df['Open']

In [11]:
df

Unnamed: 0,Open,High,Low,Close
0,0.100178,0.004348,0.000000,0.000000
1,0.095388,0.000000,-0.004566,-0.004566
2,0.088418,0.000000,-0.004926,-0.004926
3,0.090160,0.004831,0.000000,0.000000
4,0.092774,0.004694,0.000000,0.000000
...,...,...,...,...
10450,141.350006,0.019597,-0.001910,0.011107
10451,143.289993,0.022751,-0.000070,0.021355
10452,145.259995,0.015765,-0.001790,0.012254
10453,145.669998,0.006659,-0.012975,-0.005492


### Trading Environment

In [24]:
class AAPL_env():
    def __init__(self, data, obs_bars = 10, test = False, commission_perc=0.1):
        self.data = data
        self.obs_bars = obs_bars
        self.have_position = False
        self.open_price = 0
        self.test = test
        self.commission_perc = commission_perc
        if test == False:
            self.curr_step = np.random.choice(self.data.High.shape[0] - self.obs_bars*10) + self.obs_bars
        else:
            self.curr_step = self.obs_bars
        
        self.state = self.data[self.curr_step - self.obs_bars : self.curr_step]
    
    def step(self, action):
        reward = 0.0
        done = False
        
        # Handle special "do_nothing" action for initialization
        if action == "do_nothing":
            self.curr_step = self.obs_bars
            self.state = self.data.iloc[self.curr_step - self.obs_bars : self.curr_step]
            return self._get_state(), reward, done
        
        # Current prices (as in your original step method)
        relative_close = self.state["Close"].iloc[-1]
        open_price = self.state["Open"].iloc[-1]
        close = open_price * (1 + relative_close)
        
        # Handle actions (buy, sell, close, hold)
        if action == "buy" and self.have_position == 0:
            self.have_position = 1
            self.open_price = close
            reward -= self.commission_perc
        elif action == "sell" and self.have_position == 0:
            self.have_position = -1
            self.open_price = close
            reward -= self.commission_perc
        elif action == "close" and self.have_position != 0:
            if self.have_position == 1:
                reward += 100.0 * (close - self.open_price) / self.open_price - self.commission_perc
            elif self.have_position == -1:
                reward += 100.0 * (self.open_price - close) / self.open_price - self.commission_perc
            self.have_position = 0
            self.open_price = 0.0
            if not self.test:
                done = True
        # 'hold' or invalid actions: do nothing
        
        # Advance step
        self.curr_step += 1
        self.state = self.data.iloc[self.curr_step - self.obs_bars : self.curr_step]
        
        # Check if episode is done
        if self.curr_step >= len(self.data) - 1:
            done = True
        
        return self._get_state(), reward, done

    def _get_state(self):
        """Helper method to construct the state array."""
        state = np.zeros((5, self.obs_bars), dtype=np.float32)
        state[0] = self.state["High"].to_list()
        state[1] = self.state["Low"].to_list()
        state[2] = self.state["Close"].to_list()
        state[3] = [self.have_position] * self.obs_bars
        if self.have_position != 0:
            relative_close = self.state["Close"].iloc[-1]
            open_price = self.state["Open"].iloc[-1]
            close = open_price * (1 + relative_close)
            if self.have_position == 1:
                state[4] = [(close - self.open_price) / self.open_price] * self.obs_bars
            elif self.have_position == -1:
                state[4] = [(self.open_price - close) / self.open_price] * self.obs_bars
        else:
            state[4] = [0.0] * self.obs_bars
        return state

In [13]:
actions = {
    0: "do_nothing",
    1: "buy",
    2: "close"
}

In [14]:
AAPL = AAPL_env(data=df, test=False, obs_bars=50)

In [15]:
state, reward, done = AAPL.step("do_nothing")

In [16]:
state.shape

(5, 50)

### DL Architecture

In [61]:
class DuelingConv1DQNet(nn.Module):
    def __init__(self, input_depth_length, obs_bars, output_shape):
        super(DuelingConv1DQNet, self).__init__()
        
        # Convolutional layers for price data (High, Low, Close)
        self.price_conv = nn.Sequential(
            nn.Conv1d(3, 128, kernel_size=5, padding=2),  # in_channels=3
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 128, kernel_size=5, padding=2),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Flatten()
        )
        
        # Dynamically compute the flattened size
        self.flattened_size = 128 * obs_bars + 2
        
        # State value stream
        self.state_value = nn.Sequential(
            nn.Linear(self.flattened_size, 512),  # +2 for position and profit/loss
            nn.ReLU(),
            nn.Linear(512, 1)
        )
        
        # Advantage stream
        self.advantage = nn.Sequential(
            nn.Linear(self.flattened_size, 512),
            nn.ReLU(),
            nn.Linear(512, output_shape)
        )

    def forward(self, x):
        price_x = x[:, :3, :]  # Extract price channels: High, Low, Close
        info_x = x[:, 3:, -1]  # Extract position and profit/loss from last time step
        
        # Process price data through convolutional layers
        price_features = self.price_conv(price_x)
        
        # Concatenate with position and profit/loss
        features = torch.cat([price_features, info_x], dim=1)
        
        # Compute state value and advantage
        state_val = self.state_value(features)
        advantage = self.advantage(features)
        
        # Combine using Dueling DQN formula
        q_values = state_val + advantage - advantage.mean(dim=1, keepdim=True)
        return q_values

### Training the bot

In [44]:
from collections import deque
import copy

In [77]:
def preprocess_state(state, add_noise=False):
    if add_noise:
        state += np.random.normal(0, 0.01, size=state.shape)
    state_tensor = torch.from_numpy(state).float().unsqueeze(0)  # (1, 5, 50)
    return state_tensor

def get_action(q_values, num_actions, epsilon):
    if np.random.random() < epsilon:
        # Exploration: choose a random action
        return np.random.randint(0, num_actions)
    else:
        # Exploitation: choose the action with the highest Q-value
        return q_values.argmax().item()

def get_batch_for_nsteps_dqn(replay, batch_size, nsteps, device, gamma=0.99):
    if len(replay) < batch_size + nsteps:
        raise ValueError("Not enough experiences in replay buffer")

    max_idx = len(replay) - nsteps
    indices = np.random.randint(0, max_idx, size=batch_size)
    
    state1_batch = []
    action1_batch = []
    nsteps_next_state_batch = []
    nsteps_reward_batch = []
    nsteps_done_batch = []
    
    for idx in indices:
        state, action, reward, next_state, done = replay[idx]
        state = state.squeeze(0)
        state1_batch.append(state.cpu().numpy())
        
        action1_batch.append(action)
        
        nstep_reward = reward
        nstep_done = done
        nstep_next_state = next_state
        
        for t in range(1, nsteps):
            if nstep_done:
                break
            next_transition = replay[idx + t]
            reward_t = next_transition[2]
            nstep_done = next_transition[4]
            nstep_next_state = next_transition[3]
            nstep_reward += (gamma ** t) * reward_t
        
        nstep_next_state = nstep_next_state.squeeze(0)
        nsteps_next_state_batch.append(nstep_next_state.cpu().numpy())
        nsteps_reward_batch.append(nstep_reward)
        nsteps_done_batch.append(nstep_done)
    
    # print("action1_batch before tensor:", action1_batch)  # Debug
    state1_batch = torch.tensor(np.array(state1_batch), dtype=torch.float32).to(device)
    action1_batch = torch.tensor(action1_batch, dtype=torch.long).to(device)
    # print("action1_batch after tensor:", action1_batch.shape)  # Debug
    nsteps_next_state_batch = torch.tensor(np.array(nsteps_next_state_batch), dtype=torch.float32).to(device)
    nsteps_reward_batch = torch.tensor(nsteps_reward_batch, dtype=torch.float32).to(device)
    nsteps_done_batch = torch.tensor(nsteps_done_batch, dtype=torch.float32).to(device)
    
    return state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch

In [82]:
# Hyperparameter Initialization
memory_size = 100000
batch_size = 64
replay = deque(maxlen=memory_size)
gamma = 0.99
lr = 0.0001
sync_freq = 1000
output_shape = len(actions)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [83]:
Agent_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=50, output_shape=3)
target_NN = DuelingConv1DQNet(input_depth_length=5, obs_bars=50, output_shape=3)
target_NN.load_state_dict(Agent_NN.state_dict())

<All keys matched successfully>

In [84]:
# Initialize parameters
optimizer = torch.optim.RMSprop(Agent_NN.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()
all_rewards_list = []
Q_losses = []
obs_bars = 50
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
k = 0
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move networks to device
Agent_NN.to(device)
target_NN.to(device)

DuelingConv1DQNet(
  (price_conv): Sequential(
    (0): Conv1d(3, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): Flatten(start_dim=1, end_dim=-1)
  )
  (state_value): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
  (advantage): Sequential(
    (0): Linear(in_features=6402, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=3, bias=True)
  )
)

In [85]:
while k < 1000000:
    game = AAPL_env(df, commission_perc=0.1, obs_bars=obs_bars)
    state, _, _ = game.step("do_nothing")  # Shape: (5, 50)
    state1 = preprocess_state(state, add_noise=False)
    status = 1
    episode_rewards = []
    
    while status == 1:
        k += 1
        
        # Select action using epsilon-greedy policy
        with torch.no_grad():
            qval = Agent_NN(state1.to(device))  # Shape: (1, num_actions)
            qval_ = qval.cpu().numpy()
        action = get_action(qval_, 3, epsilon)
        action_name = actions[action]
        
        # Take a step in the environment
        state2, reward, done = game.step(action_name)
        state2 = preprocess_state(state2, add_noise=False)  # Shape: (1, 3, 50)
        exp = (state1, action, reward, state2, done)
        replay.append(exp)
        episode_rewards.append(reward)
        
        # Update epsilon for exploration-exploitation trade-off
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        
        # Train if enough experiences are in the replay buffer
        if len(replay) >= batch_size + 2:
            state1_batch, action1_batch, nsteps_next_state_batch, nsteps_reward_batch, nsteps_done_batch = get_batch_for_nsteps_dqn(
                replay=replay, batch_size=batch_size, nsteps=2, device=device
            )
            
            Q1 = Agent_NN(state1_batch)
            # print("Q1 shape:", Q1.shape)  # Debug
            # print("action1_batch shape:", action1_batch.shape)  # Debug
            Q1_selected = Q1.gather(dim=1, index=action1_batch.unsqueeze(dim=1)).squeeze()
            # print("Q1_selected shape:", Q1_selected.shape)  # Debug
            
            # Compute target Q-values using the target network
            with torch.no_grad():
                Q_next_state = target_NN(nsteps_next_state_batch)  # Shape: (batch_size, num_actions)
                selected_nodes_for_target_network = Q_next_state.max(dim=1)[1]  # Shape: (batch_size,)
                best_Q_next_state = Q_next_state.gather(dim=1, index=selected_nodes_for_target_network.unsqueeze(dim=1)).squeeze()  # Shape: (batch_size,)
                Y_batch_target_for_nsteps_don = (nsteps_reward_batch + (1 - nsteps_done_batch) * best_Q_next_state).to(device)  # Shape: (batch_size,)
            
            # Compute loss
            loss = loss_fn(Q1_selected, Y_batch_target_for_nsteps_don)
            Q_losses.append(loss.item())
            
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Print progress
            print("Current step:", k)
            print("Mean reward:", np.mean(all_rewards_list) if all_rewards_list else 0)
            print("Loss:", loss.item())
        
        state1 = state2
        status = 1 - done
    
    all_rewards_list.append(np.sum(episode_rewards))

Current step: 66
Mean reward: 2.0382814687736204
Loss: 6.4643144607543945
Current step: 67
Mean reward: 2.0382814687736204
Loss: 117.43931579589844
Current step: 68
Mean reward: 2.169058431600299
Loss: 8.930057525634766
Current step: 69
Mean reward: 2.169058431600299
Loss: 4.177950382232666
Current step: 70
Mean reward: 2.169058431600299
Loss: 4.130468845367432
Current step: 71
Mean reward: 2.169058431600299
Loss: 4.862236022949219
Current step: 72
Mean reward: 2.7215774904670833
Loss: 3.132418632507324
Current step: 73
Mean reward: 2.7215774904670833
Loss: 5.896778583526611
Current step: 74
Mean reward: 2.7215774904670833
Loss: 2.8508214950561523
Current step: 75
Mean reward: 2.7215774904670833
Loss: 4.260993480682373
Current step: 76
Mean reward: 2.7215774904670833
Loss: 6.050257682800293
Current step: 77
Mean reward: 2.4988575331362757
Loss: 5.119801044464111
Current step: 78
Mean reward: 2.4988575331362757
Loss: 5.545325756072998
Current step: 79
Mean reward: 2.8002336059192907
Los

KeyboardInterrupt: 