In [None]:
import gymnasium as gym
import gym_trading_env
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from copy import copy
from copy import deepcopy
import talib
from gym_trading_env.renderer import Renderer

In [None]:
# Yahoo Finance ticker symbol passed to yf.download below.
nifty50_index_symbol = '^NSEI'

# Date range (inclusive start, `end` as interpreted by yfinance) for the download.
start_date = '2010-01-01'
end_date = '2019-06-30'
# Alternative, much shorter end date (currently disabled).
# end_date = '2012-06-11'

# Fetch the price history for the symbol over the configured range; requires network access.
data = yf.download(nifty50_index_symbol, start=start_date, end=end_date)

In [None]:
# Work on a copy so the in-place edits performed in later cells (rename, new feature
# columns, dropna) never mutate the raw download held in `data`. Re-running this cell
# therefore always restarts the pipeline from clean input.
df = data.copy()

# Recent yfinance releases return two-level columns (price field, ticker) even for a
# single symbol. Collapse them to the price-field level so that `df['close']` etc.
# are Series rather than one-column DataFrames, which the indicator functions need.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

In [None]:
# Lower-case the OHLCV column names in place; the feature cell and the trading
# environment below refer to them as 'open', 'high', 'low', 'close', 'volume'.
ohlcv_columns = ('Open', 'High', 'Low', 'Close', 'Volume')
df.rename(columns={column: column.lower() for column in ohlcv_columns}, inplace=True)

In [None]:
# Technical-indicator features. Columns are created in the same order as before
# (MACD, MACD_signal, MACD_hist, RSI, CCI, ADX), which fixes their order in the frame.
close_prices = df['close']
high_prices = df['high']
low_prices = df['low']

macd_line, macd_signal, macd_hist = talib.MACD(
    close_prices, fastperiod=12, slowperiod=26, signalperiod=9
)
df['feature-MACD'] = macd_line
df['feature-MACD_signal'] = macd_signal
df['feature-MACD_hist'] = macd_hist

df['feature-RSI'] = talib.RSI(close_prices, timeperiod=14)

df['feature-CCI'] = talib.CCI(high_prices, low_prices, close_prices, timeperiod=14)

df['feature-ADX'] = talib.ADX(high_prices, low_prices, close_prices, timeperiod=14)

# Drop every row containing a NaN (in place), e.g. the warm-up rows of the indicators.
df.dropna(inplace=True)


In [None]:
# Display the first rows of the feature frame as a quick sanity check.
df.head()

In [None]:
# Discrete grid of allowed positions: 0.0, 0.1, ..., 1.0 (11 levels). Each level is
# rounded to one decimal so the list holds clean values rather than linspace residue.
action_space = [round(level, 1) for level in np.linspace(0, 1, num=11).tolist()]

In [None]:
# Display the position grid built in the previous cell.
action_space

In [None]:
# Create the "TradingEnv" environment on top of the feature frame. Trading fees and
# borrow interest are not passed (the two keyword arguments below are commented out).
env = gym.make("TradingEnv",
        name= "RL",
        df = df,
        positions =action_space , # 11 allowed positions from 0.0 to 1.0 in 0.1 steps; env.step() is called with an index into this list
        # trading_fees = 0.01/100, # 0.01% per stock buy / sell
        # borrow_interest_rate= 0.0003/100, # 0.0003% per timestep
    )

In [None]:
# Scratch check: display the 7th grid entry rounded to one decimal.
round(action_space[6], 1)

In [None]:
class Actor(nn.Module):
    """Deterministic policy network mapping an observation to a position in [0, 1].

    Architecture: two 128-unit ReLU hidden layers followed by a single output unit.

    The output is squashed with a sigmoid so that it already lies in the range of the
    notebook's position grid (0.0 ... 1.0). The previous tanh head produced values in
    [-1, 1]; every negative output was clipped to 0 when acting, and negative values
    were fed to the critic during the actor update even though the critic is only ever
    trained on actions from [0, 1].

    Args:
        input_shape: Number of features in a single observation vector.
    """

    def __init__(self, input_shape):
        super(Actor, self).__init__()
        # Layer names are kept unchanged so previously saved state_dicts still load.
        self.fc1 = nn.Linear(input_shape, 128)
        self.fc2 = nn.Linear(128, 128)
        self.out = nn.Linear(128, 1)

    def forward(self, state):
        """Return the proposed position for `state`.

        Args:
            state: Float tensor whose last dimension equals `input_shape`.

        Returns:
            Tensor with last dimension 1 and values in [0, 1].
        """
        hidden = torch.relu(self.fc1(state))
        hidden = torch.relu(self.fc2(hidden))
        # Sigmoid (not tanh): keeps the action inside the valid [0, 1] range.
        return torch.sigmoid(self.out(hidden))
    
class Critic(nn.Module):
    """Q-network scoring a (state, action) pair.

    The scalar action and the state vector are embedded by two separate two-layer
    ReLU branches (each ending in 128 features), concatenated into a 256-feature
    vector, passed through two 512-unit ReLU layers and projected to a single
    unbounded Q-value.

    Args:
        input_shape: Number of features in a single observation vector.
    """

    def __init__(self, input_shape):
        super().__init__()
        # Branch embedding the 1-dimensional action.
        self.actor_fc1 = nn.Linear(1, 64)
        self.actor_fc2 = nn.Linear(64, 128)
        # Branch embedding the observation.
        self.critic_fc1 = nn.Linear(input_shape, 64)
        self.critic_fc2 = nn.Linear(64, 128)
        # Joint head over the concatenated 128 + 128 features.
        self.concatenated_1 = nn.Linear(256, 512)
        self.concatenated_2 = nn.Linear(512, 512)
        self.output = nn.Linear(512, 1)

    def forward(self, actor_input, state_input):
        """Return the Q-value estimate for taking `actor_input` in `state_input`.

        Args:
            actor_input: Float tensor whose last dimension is 1 (the action).
            state_input: Float tensor whose last dimension equals `input_shape`.

        Returns:
            Tensor with last dimension 1 holding the Q-value (no output activation).
        """
        action_features = self.actor_fc2(self.actor_fc1(actor_input).relu()).relu()
        state_features = self.critic_fc2(self.critic_fc1(state_input).relu()).relu()

        # State features first, then action features, along the feature axis.
        joint = torch.cat((state_features, action_features), dim=-1)
        hidden = self.concatenated_1(joint).relu()
        hidden = self.concatenated_2(hidden).relu()
        return self.output(hidden)

In [None]:
# Size of the first dimension of the environment's observation space; used as the
# input width of the actor and critic networks.
input_shape = env.observation_space.shape[0]

In [None]:
# Online networks and their target copies. Each target starts as an exact deep copy
# of its online network and is later moved towards it by soft_update().
actor_model = Actor(input_shape)
target_actor = deepcopy(actor_model)
critic_model = Critic(input_shape)
target_critic = deepcopy(critic_model)

# One Adam optimizer per online network; the target networks are never optimized directly.
# NOTE(review): lr=0.01 is used for both networks - confirm this is intended.
actor_optimizer = torch.optim.Adam(actor_model.parameters(), lr=0.01)
critic_optimizer = torch.optim.Adam(critic_model.parameters(), lr=0.01)

# Mean-squared error between the critic's prediction and the TD target.
loss_function = nn.MSELoss()

In [None]:
# Hyperparameters and shared state read by the training loop further down.

# Maximum number of transitions kept in `buffer`; the oldest entry is dropped beyond this.
buffer_size = 1000
# Replay buffer: a plain list of transition tuples appended during rollouts.
buffer = []
# The training loop runs for max_iters * 100 episodes.
max_iters = 10000
# Discount factor applied to the bootstrapped next-state Q-value.
gamma = 0.95
# Probability of taking a random (non-policy) action; decayed once per episode.
epsilon = 1
# Multiplicative decay applied to epsilon after every episode.
eps_decay = 0.995
# The training loop floors epsilon at min_eps / 10. Reassigned to 0.1 in a later cell.
min_eps = 0.05
# Interpolation weight used by soft_update() for the target networks.
tau = 0.01

In [None]:
# Scratch check: reset the environment, take one step with action 5, then display the
# target actor's output for the initial observation truncated to one decimal place.
curr_state, info = env.reset()
env.step(5)
int(target_actor(torch.Tensor(curr_state)).item()*10)/10

In [None]:
def soft_update(target_net, source_net, tau):
    """Move every parameter of `target_net` a fraction `tau` towards `source_net`.

    Performs the in-place update
    ``target = tau * source + (1 - tau) * target`` for each parameter pair, in the
    order yielded by ``parameters()``. Both networks must therefore share the same
    architecture. ``tau = 1`` copies the source outright; ``tau = 0`` is a no-op.

    Args:
        target_net: Module whose parameters are updated in place.
        source_net: Module providing the new values; it is not modified.
        tau: Interpolation weight given to the source parameters.

    Returns:
        None.
    """
    # no_grad() replaces direct `.data` manipulation: the update is excluded from
    # autograd while still going through the regular in-place tensor API.
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            target_param.copy_(tau * source_param + (1.0 - tau) * target_param)

In [None]:
# critic_state_dict = torch.load('ddpg_critic_model_weights.pth', weights_only=True)
# actor_state_dict = torch.load('ddpg_actor_model_weights.pth', weights_only=True)

# target_critic.load_state_dict(critic_state_dict)
# critic_model.load_state_dict(critic_state_dict)
# target_actor.load_state_dict(actor_state_dict)
# actor_model.load_state_dict(actor_state_dict)

In [None]:
# Overrides the earlier min_eps = 0.05. The training loop floors epsilon at min_eps / 10.
min_eps = 0.1

In [None]:
# Per-episode training metrics, appended to by the training loop and plotted afterwards.
# Re-running this cell clears them.
critic_loss_history = []
actor_loss_history = []
rewards_history = []

In [None]:
# DDPG-style training loop.
#
# Each outer iteration (1) plays one full episode with an epsilon-mixed noisy policy and
# stores every transition in `buffer`, (2) samples up to 400 transitions and performs
# one critic and one actor update per sampled transition, and (3) soft-updates both
# target networks.
#
# NOTE(review): actions are selected with `target_actor` (as before), not `actor_model`;
# confirm that acting with the slowly-moving target policy is intended.
# NOTE(review): `buffer` is trimmed to `buffer_size` while an episode is running, so
# after an episode longer than that it only holds the episode's final transitions.
actor_model.train()
critic_model.train()
target_actor.train()
target_critic.train()

# Highest valid index into `action_space` (10 for the 0.0, 0.1, ..., 1.0 grid).
max_action_index = len(action_space) - 1

for i in range(max_iters * 100):
    action_history = []
    action_count = [0, 0, 0]  # [action < 0.5, action == 0.5, action > 0.5]
    done, truncated = False, False
    curr_state, info = env.reset()
    rewards_sum = 0
    count_explore_exploit = [0, 0]  # [random actions, policy actions]

    # ---- 1. Roll out one episode -------------------------------------------------
    while not done and not truncated:
        if np.random.uniform(0, 1) < epsilon:
            # Exploration: sample around the middle of the position range.
            raw_action = np.random.normal(0.5, 0.3)
            count_explore_exploit[0] += 1
        else:
            # Exploitation: policy output plus Gaussian noise. No autograd graph is
            # needed while acting, so inference runs under no_grad().
            with torch.no_grad():
                raw_action = target_actor(torch.Tensor(curr_state)).item()
            raw_action = raw_action + np.random.normal(0, 0.2)
            count_explore_exploit[1] += 1

        # Clip to [0, 1] and truncate onto the grid. The grid index is computed
        # directly instead of a float-equality `list.index` lookup.
        action_index = int(np.clip(raw_action, 0, 1) * max_action_index)
        action = action_space[action_index]

        action_history.append(action)

        next_state, reward, done, truncated, info = env.step(action_index)
        rewards_sum += reward

        if action > 0.5:
            action_count[2] += 1
        elif action < 0.5:
            action_count[0] += 1
        else:
            action_count[1] += 1

        buffer.append((curr_state, action, reward, next_state, done, truncated))
        if len(buffer) > buffer_size:
            buffer.pop(0)

        curr_state = next_state

    rewards_history.append(rewards_sum)

    # ---- 2. Sample transitions and update critic / actor ---------------------------
    sample_indices = np.random.choice(len(buffer), size=min(400, len(buffer)), replace=False)
    train_sample = [buffer[idx] for idx in sample_indices]

    # Decay exploration once per episode; the floor is min_eps / 10.
    epsilon = max(epsilon * eps_decay, min_eps/10)

    total_critic_loss = 0
    total_actor_loss = 0
    # Distinct names are used here so the episode variables (`done`, `curr_state`, ...)
    # are not overwritten by replayed values.
    for state_np, taken_action, step_reward, next_state_np, terminated, _was_truncated in train_sample:
        # Always build float32 tensors; the dtype-preserving `torch.tensor(state)` used
        # previously for the actor update would break on non-float32 observations.
        state_t = torch.Tensor(state_np)
        next_state_t = torch.Tensor(next_state_np)

        # -- critic update --
        with torch.no_grad():
            next_action = target_actor(next_state_t).item()
            # Clip before discretizing so the target critic never sees an action
            # outside the [0, 1] range it is trained on.
            next_action = action_space[int(np.clip(next_action, 0, 1) * max_action_index)]
            next_q_val = target_critic(torch.tensor([next_action], dtype=torch.float32), next_state_t)
            # Do not bootstrap past a terminal transition. A merely truncated episode
            # still bootstraps, since truncation is not a true end of the task.
            not_terminal = 0.0 if terminated else 1.0
            # Rewards are scaled by 100 before entering the TD target.
            target_q_val = float(step_reward) * 100 + gamma * not_terminal * next_q_val
        curr_q_val = critic_model(torch.tensor([taken_action], dtype=torch.float32), state_t)
        critic_loss = loss_function(curr_q_val, target_q_val)
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        total_critic_loss += critic_loss.item()

        # -- actor update: ascend the critic's estimate of the actor's own action --
        curr_action_pred = actor_model(state_t)
        actor_loss = -critic_model(curr_action_pred, state_t).mean()
        actor_optimizer.zero_grad()
        # This also accumulates gradients in the critic; they are discarded by the
        # critic_optimizer.zero_grad() call before the next critic step.
        actor_loss.backward()
        actor_optimizer.step()
        total_actor_loss += actor_loss.item()

    total_critic_loss /= len(train_sample)
    critic_loss_history.append(total_critic_loss)
    total_actor_loss /= len(train_sample)
    actor_loss_history.append(total_actor_loss)

    # ---- 3. Move the target networks towards the online networks ------------------
    soft_update(target_actor, actor_model, tau)
    soft_update(target_critic, critic_model, tau)

    # Alternative (disabled): periodic hard update of the target networks.
    # if i % 10 == 0:
    #     target_actor.load_state_dict(actor_model.state_dict())
    #     target_critic.load_state_dict(critic_model.state_dict())

    print(i, "-> CRITIC LOSS:", total_critic_loss, "-> ACTOR LOSS:", total_actor_loss, "-> REWARD:", rewards_sum, "\n-> Actions:", action_history[:20])
    print("Action Count", action_count, "Exploration COUNT", count_explore_exploit)


In [None]:
# Persist the weights of the target networks (not the online networks). The file names
# match the ones referenced by the commented-out weight-loading cell earlier in the notebook.
torch.save(target_actor.state_dict(), 'ddpg_actor_model_weights.pth')
torch.save(target_critic.state_dict(), 'ddpg_critic_model_weights.pth')

In [None]:
# Mean actor loss (negated critic estimate) of each training episode.
fig, ax = plt.subplots()
ax.plot(actor_loss_history)
ax.set(title="Actor loss per episode", xlabel="Episode", ylabel="Mean actor loss")
plt.show()

In [None]:
# Mean critic (MSE) loss of each training episode.
fig, ax = plt.subplots()
ax.plot(critic_loss_history)
ax.set(title="Critic loss per episode", xlabel="Episode", ylabel="Mean critic loss")
plt.show()

In [None]:
# Sum of environment rewards collected in each training episode.
fig, ax = plt.subplots()
ax.plot(rewards_history)
ax.set(title="Total reward per episode", xlabel="Episode", ylabel="Sum of rewards")
plt.show()

In [None]:
# Greedy evaluation rollout: play one episode with the target actor and no exploration
# noise, then save the episode for the renderer and print the actions taken.
# NOTE(review): this reuses the same `env` object (and therefore the same data frame)
# that was used for training, so the result is an in-sample score.
target_actor.eval()

# Highest valid index into `action_space` (10 for the 0.0, 0.1, ..., 1.0 grid).
max_action_index = len(action_space) - 1

done, truncated = False, False
curr_state, info = env.reset()
total_reward = 0
action_history = []
action_count = [0, 0, 0]  # [action < 0.5, action == 0.5, action > 0.5]

# Pure inference: no autograd graph is needed.
with torch.no_grad():
    while not done and not truncated:
        raw_action = target_actor(torch.Tensor(curr_state)).item()
        # Clip to [0, 1] and truncate onto the grid; the index is computed directly
        # rather than through a float-equality `list.index` lookup.
        action_index = int(np.clip(raw_action, 0, 1) * max_action_index)
        action = action_space[action_index]

        action_history.append(action)

        next_state, reward, done, truncated, info = env.step(action_index)
        total_reward += reward

        if action > 0.5:
            action_count[2] += 1
        elif action < 0.5:
            action_count[0] += 1
        else:
            action_count[1] += 1

        curr_state = next_state

env.unwrapped.save_for_render(dir = "render_logs")
print(action_history)

In [None]:
# Summary of the evaluation rollout: summed reward and the number of steps with
# action below / equal to / above 0.5.
print(total_reward)
print(action_count)

In [None]:
# renderer = Renderer(render_logs_dir="render_logs")
# renderer.run()