In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import pandas as pd
from StockTradingRLEnv_Abhi import StockTradingEnv
import matplotlib.pyplot as plt


In [2]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, 24)
        self.fc2 = nn.Linear(24, 48)
        self.fc3 = nn.Linear(48, 24)
        self.fc4 = nn.Linear(24, action_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.softmax(self.fc4(x),dim = -1)
        return x

In [3]:
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim + action_dim, 24)
        self.fc2 = nn.Linear(24, 48)
        self.fc3 = nn.Linear(48, 24)
        self.fc4 = nn.Linear(24, 1)

    def forward(self, x, u):
        x = torch.cat([x, u], 1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [4]:
class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu = mu
        self.theta = theta
        self.sigma = max_sigma
        self.max_sigma = max_sigma
        self.min_sigma = min_sigma
        self.decay_period = decay_period
        self.action_dim = action_dim
        self.reset()

    def reset(self):
        self.states = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        x = self.states
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.states = x + dx
        return self.states

    def get_actions(self, actions, t=0):
        ou_state = self.evolve_state()
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
        return np.clip(actions + ou_state, -1, 1)

In [5]:
class DDPGAgent:
    def __init__(self, state_dim):
        self.action_dim = 3
        self.actor = Actor(state_dim, self.action_dim)
        self.actor_target = Actor(state_dim, self.action_dim)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)

        self.critic = Critic(state_dim, self.action_dim)
        self.critic_target = Critic(state_dim, self.action_dim)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.memory = deque(maxlen=100000)
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.005
        self.noise = OUNoise(self.action_dim)

    def act(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        return self.actor(state).cpu().data.numpy().flatten()

    def replay(self, num_iterations=1000):
        for _ in range(num_iterations):
            if len(self.memory) < self.batch_size:
                continue

            batch = random.sample(self.memory, self.batch_size)
            state, action, reward, next_state, done = map(np.stack, zip(*batch))

            state = torch.FloatTensor(state)
            action = torch.FloatTensor(action)
            reward = torch.FloatTensor(reward)
            next_state = torch.FloatTensor(next_state)
            done = torch.FloatTensor(done)

            target_Q = self.critic_target(next_state, self.actor_target(next_state))
            target_Q = reward + ((1 - done) * self.gamma * target_Q).detach()

            current_Q = self.critic(state, action)
            critic_loss = nn.MSELoss()(current_Q, target_Q)

            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            actor_loss = -self.critic(state, self.actor(state)).mean()

            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def save(self, filename):
        torch.save(self.actor.state_dict(), filename + "_actor.pth")
        torch.save(self.critic.state_dict(), filename + "_critic.pth")

    def load(self, filename):
        self.actor.load_state_dict(torch.load(filename + "_actor.pth"))
        self.critic.load_state_dict(torch.load(filename + "_critic.pth"))


In [6]:
MAX_ACCOUNT_BALANCE = 2147483647
MAX_NUM_SHARES = 2147483647
MAX_SHARE_PRICE = 5000
MAX_STEPS = 2000

INITIAL_ACCOUNT_BALANCE = 10000

In [7]:
df = pd.read_csv('./data/AAPL.csv')
df = df.sort_values('Date')
df.dropna(inplace=True)
df = df.reset_index(drop=True)

env = StockTradingEnv(df, render_mode='human')
state_dim = env.observation_space.shape[1] * env.observation_space.shape[0]
agent = DDPGAgent(state_dim=state_dim)

episodes = 50


In [8]:
net_worths = []

for e in range(episodes):
    state, _ = env.reset()
    state = state.flatten()  # Flatten the state to match the input dimensions of the network
    done = False
    for time in range(MAX_STEPS):
        if(time % 100 == 0):
            print(f"Time: {time} episode: {e+1}/{episodes} score: {env.net_worth}")
        action = agent.act(state)
        next_state, reward, done, truncated, _ = env.step(action)
        next_state = next_state.flatten()  # Flatten the next state
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            agent.update_target_model()
            print(f'Episode {e+1}/{episodes} - Net Worth: {env.net_worth}')
            break
        agent.replay()
    net_worths.append(env.net_worth)
    if (e + 1) % 10 == 0:
        agent.save(f"model_ddpg_{e+1}.pth")
    print(f'Episode {e+1}/{episodes} - Net Worth: {env.net_worth}')

Time: 0 episode: 1/50 score: 10000


  return F.mse_loss(input, target, reduction=self.reduction)


Time: 100 episode: 1/50 score: 10204.748114135058
Time: 200 episode: 1/50 score: 10573.735011525696
Time: 300 episode: 1/50 score: 6490.43189186259
Time: 400 episode: 1/50 score: 6408.98623043653
Time: 500 episode: 1/50 score: 8597.953139594227
Time: 600 episode: 1/50 score: 9267.223135485805
Time: 700 episode: 1/50 score: 11951.762387604362
Time: 800 episode: 1/50 score: 16021.32516158388
Time: 900 episode: 1/50 score: 34671.16859816722
Time: 1000 episode: 1/50 score: 16535.358013688514
Time: 1100 episode: 1/50 score: 29278.423005907393
Time: 1200 episode: 1/50 score: 29603.31776692879
Time: 1300 episode: 1/50 score: 31782.017314274006
Time: 1400 episode: 1/50 score: 36855.935357738796
Time: 1500 episode: 1/50 score: 53306.72880442467
Time: 1600 episode: 1/50 score: 73224.37601186309
Time: 1700 episode: 1/50 score: 66814.62649385909
Time: 1800 episode: 1/50 score: 70606.06224430718
Time: 1900 episode: 1/50 score: 40005.168678548165
Episode 1/50 - Net Worth: 59478.060136873544
Time: 0 

  self.cost_basis = (prev_cost + additional_cost) / \


Time: 100 episode: 2/50 score: 10000.0
Time: 200 episode: 2/50 score: 10000.0
Time: 300 episode: 2/50 score: 10000.0
Time: 400 episode: 2/50 score: 10000.0
Time: 500 episode: 2/50 score: 10000.0
Time: 600 episode: 2/50 score: 10000.0
Time: 700 episode: 2/50 score: 10000.0
Time: 800 episode: 2/50 score: 10000.0
Time: 900 episode: 2/50 score: 10000.0
Time: 1000 episode: 2/50 score: 10000.0
Time: 1100 episode: 2/50 score: 10000.0
Time: 1200 episode: 2/50 score: 10000.0
Time: 1300 episode: 2/50 score: 10000.0
Time: 1400 episode: 2/50 score: 10000.0
Time: 1500 episode: 2/50 score: 10000.0
Time: 1600 episode: 2/50 score: 10000.0
Time: 1700 episode: 2/50 score: 10000.0
Time: 1800 episode: 2/50 score: 10000.0
Time: 1900 episode: 2/50 score: 10000.0
Episode 2/50 - Net Worth: 10000.0
Time: 0 episode: 3/50 score: 10000
Time: 100 episode: 3/50 score: 10000.0
Time: 200 episode: 3/50 score: 10000.0
Time: 300 episode: 3/50 score: 10000.0
Time: 400 episode: 3/50 score: 10000.0
Time: 500 episode: 3/50 

KeyboardInterrupt: 

In [None]:
print(net_worths)
# Plotting the net worth over episodes
plt.plot(range(episodes), net_worths)
plt.xlabel('Episodes')
plt.ylabel('Net Worth')
plt.title('Net Worth over Episodes')
plt.show()


NameError: name 'net_worths' is not defined

In [None]:
# Evaluation
def evaluate(env, agent, episodes=10):
    total_rewards = 0
    for _ in range(episodes):
        state, _ = env.reset()
        state = state.flatten()
        for time in range(MAX_STEPS):
            action = agent.act(state,explore = False)
            next_state, reward, done, truncated, _ = env.step(action)
            next_state = next_state.flatten()
            total_rewards += reward
            state = next_state
    avg_reward = total_rewards / episodes
    print(f"Average Reward over {episodes} episodes: {avg_reward}")

# Load the trained model
agent.load("model_ddpg_50.pth")

# Evaluate the agent
evaluate(env, agent)

Average Reward over 10 episodes: 2560175.0
