In [None]:
import gymnasium as gym
import gym_trading_env
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
from copy import copy
from copy import deepcopy
import talib
from gym_trading_env.renderer import Renderer

In [None]:
# Download daily OHLCV history for the NIFTY 50 index from Yahoo Finance.
nifty50_index_symbol = '^NSEI'

start_date = '2010-01-01'
end_date = '2019-06-30'
# end_date = '2012-06-11'

data = yf.download(nifty50_index_symbol, start=start_date, end=end_date)

# yf.download reports failures by printing and returning an empty frame;
# stop here instead of failing later inside the indicator calculations.
if data.empty:
    raise RuntimeError(
        f"No price data returned for {nifty50_index_symbol} "
        f"between {start_date} and {end_date}"
    )

# Recent yfinance releases return two-level columns even for a single ticker
# (assumed layout: level 0 = price field, level 1 = ticker -- TODO confirm for
# the installed version). Keep only the field level so that later cells get a
# Series, not a one-column DataFrame, from df['close'] and friends.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

In [None]:
# Work on a copy: the following cells rename columns, add feature columns and
# drop rows in place, which would otherwise also mutate the raw download `data`.
df = data.copy()

In [None]:
# Lower-case the OHLCV column names in place; later cells index the frame as
# df['close'], df['high'] and df['low'].
df.rename(
    columns={column: column.lower() for column in ('Open', 'High', 'Low', 'Close', 'Volume')},
    inplace=True,
)

In [None]:
# Technical-indicator features computed with TA-Lib. Every new column name
# starts with "feature-".
close_prices = df['close']
high_prices = df['high']
low_prices = df['low']

# MACD line, signal line and histogram (12 / 26 / 9 periods).
macd_line, macd_signal, macd_hist = talib.MACD(
    close_prices, fastperiod=12, slowperiod=26, signalperiod=9
)
df['feature-MACD'] = macd_line
df['feature-MACD_signal'] = macd_signal
df['feature-MACD_hist'] = macd_hist

# 14-period RSI, CCI and ADX.
df['feature-RSI'] = talib.RSI(close_prices, timeperiod=14)
df['feature-CCI'] = talib.CCI(high_prices, low_prices, close_prices, timeperiod=14)
df['feature-ADX'] = talib.ADX(high_prices, low_prices, close_prices, timeperiod=14)

# Remove every row that still contains a missing value.
df.dropna(inplace=True)


In [None]:
# df.drop(columns = ['open', 'high', 'low', 'Adj Close', 'volume'], inplace = True)

In [None]:
# Preview the first five rows of the prepared frame.
df.head(n=5)

In [None]:
# Trading environment over the NIFTY 50 frame built above.
env = gym.make(
    "TradingEnv",
    # Was "BTCUSD", which mislabelled the ^NSEI data.
    # NOTE(review): presumably only used as a display/log label -- confirm.
    name="NIFTY50",
    df=df,
    # Eleven allowed position values: 0.0, 0.1, ..., 1.0. There are no negative
    # entries, so this list does not include a short (-1) position.
    positions=list(np.linspace(0, 1, 11)),
    # trading_fees = 0.01/100, # 0.01% per stock buy / sell
    # borrow_interest_rate= 0.0003/100, # 0.0003% per timestep
)

In [None]:
class DQN(nn.Module):
    """Two-hidden-layer MLP that maps a state vector to one Q-value per action.

    Args:
        input_shape: Number of features in a single state vector.
        output_shape: Number of discrete actions (one Q-value each).
        hidden_size: Width of both hidden layers (default 128).
    """

    def __init__(self, input_shape, output_shape, hidden_size=128):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_shape, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_shape)

    def forward(self, state):
        """Return the Q-value estimates for `state`.

        Args:
            state: Tensor whose last dimension has `input_shape` features.

        Returns:
            Tensor with the same leading dimensions and `output_shape` values
            in the last dimension.
        """
        hidden = torch.relu(self.fc1(state))
        hidden = torch.relu(self.fc2(hidden))
        # Q-values are unbounded return estimates, so the head stays linear.
        # A softmax here would force them into [0, 1] summing to 1, which can
        # never match TD targets of the form reward + gamma * max Q.
        return self.out(hidden)

In [None]:
# Size of the first dimension of the environment's observation space; used as
# the input width of the Q-network.
observation_shape = env.observation_space.shape
input_shape = observation_shape[0]

In [None]:
# One network output per discrete action. Read the count from the environment
# instead of hard-coding 11 so it cannot drift from the `positions` list.
n_actions = int(env.action_space.n)

# Online network (trained) and target network (periodically synced copy).
dqn_model = DQN(input_shape, n_actions)
target_dqn = deepcopy(dqn_model)

optimizer = torch.optim.Adam(dqn_model.parameters(), lr=0.001)
loss_function = nn.MSELoss()

In [None]:
# --- Replay buffer ---
buffer = []          # transitions appended by the training loop
buffer_size = 1_000  # cap on len(buffer); the oldest entry is dropped first

# --- Training schedule ---
max_iters = 100_000  # number of training-loop iterations (one episode each)
gamma = 0.9          # discount factor used in the TD target

# --- Epsilon-greedy exploration ---
epsilon = 1          # starting probability of taking a random action
eps_decay = 0.995    # epsilon is multiplied by this after every episode
min_eps = 0.1        # lower bound for epsilon

In [None]:
# Draw one random action index from the environment's action space and echo it
# as the cell output; the training loop passes values sampled the same way to
# env.step().
position_index = env.action_space.sample()
position_index

In [None]:
# target_dqn.load_state_dict(torch.load('dqn_model_weights.pth', weights_only=True))
# dqn_model.load_state_dict(torch.load('dqn_model_weights.pth', weights_only=True))

In [None]:
# Per-episode logs filled by the training loop: total reward and mean replay loss.
reward_history, loss_history = [], []

In [None]:
# Tunables of the training loop (previously inline magic numbers).
reward_scale = 100       # multiplier applied to every reward inside the TD target
replay_batch_size = 400  # maximum number of transitions replayed after each episode
target_sync_every = 5    # episodes between copies of the online weights into target_dqn

# Each iteration: (1) play one full episode with an epsilon-greedy policy while
# filling the replay buffer, (2) fit dqn_model on a random sample of the buffer,
# (3) decay epsilon and periodically sync the target network.
for i in range(max_iters):
    action_count = [0, 0, 0]           # [index < 5, index == 5, index > 5]
    exploration_exploitation = [0, 0]  # [random actions, greedy actions]
    action_history = []
    total_loss = 0
    total_reward = 0
    done, truncated = False, False
    curr_state, info = env.reset()

    # ---- 1. Roll out one episode ----
    while not done and not truncated:
        random_num = np.random.uniform(0, 1)
        if random_num < epsilon:
            action_index = env.action_space.sample()
            exploration_exploitation[0] += 1
        else:
            # NOTE(review): greedy actions are taken from target_dqn, not from
            # the online dqn_model -- kept as in the original; confirm intent.
            # no_grad: this forward pass is inference only.
            with torch.no_grad():
                action_index = torch.argmax(target_dqn(torch.Tensor(curr_state))).item()
            exploration_exploitation[1] += 1

        action_history.append(action_index)

        if action_index > 5:
            action_count[2] += 1
        elif action_index == 5:
            action_count[1] += 1
        else:
            action_count[0] += 1

        next_state, reward, done, truncated, info = env.step(action_index)
        total_reward += reward

        buffer.append((curr_state, action_index, reward, next_state, done, truncated))

        # Keep only the most recent `buffer_size` transitions.
        if len(buffer) > buffer_size:
            buffer.pop(0)

        curr_state = next_state

    reward_history.append(total_reward)

    # ---- 2. Replay: one optimizer step per sampled transition ----
    sample_indices = np.random.choice(
        len(buffer), size=min(replay_batch_size, len(buffer)), replace=False
    )
    train_sample = [buffer[idx] for idx in sample_indices]

    # Distinct names so the replay data does not overwrite the rollout variables.
    for state_b, action_b, reward_b, next_state_b, done_b, truncated_b in train_sample:
        curr_q = dqn_model(torch.Tensor(state_b))
        with torch.no_grad():
            # The reward is scaled the same way for every transition. The
            # original one-line conditional applied the scaling only to
            # non-terminal transitions because `x if c else y` binds looser
            # than `+`.
            target_q_val = reward_b * reward_scale
            # Truthiness instead of `is False`, which is an identity check and
            # is never true for non-`bool` flag types such as numpy booleans.
            if not (done_b or truncated_b):
                target_q_val += gamma * torch.max(target_dqn(torch.Tensor(next_state_b))).item()
            # Start from the current prediction so that only the taken action
            # contributes a non-zero error.
            target_q = curr_q.detach().clone()
            target_q[action_b] = float(target_q_val)
        loss = loss_function(curr_q, target_q)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    total_loss /= len(train_sample)
    loss_history.append(total_loss)

    # ---- 3. Schedules ----
    epsilon = max(min_eps, epsilon * eps_decay)

    if i % target_sync_every == 0:
        target_dqn.load_state_dict(dqn_model.state_dict())

    # NOTE(review): 1000 is assumed to be the environment's initial portfolio
    # value -- confirm against the env configuration.
    print(i, "REWARD: ", total_reward, "LOSS: ", total_loss, "portfolio difference: ", info['portfolio_valuation'] - 1000, "ACTION COUNT: ", action_count)
    print("EXPL: ", exploration_exploitation, "ACTION_HISTORY: ", action_history[:25])

In [None]:
# Persist the target network's weights to disk.
torch.save(obj=target_dqn.state_dict(), f='dqn_model_weights.pth')

In [None]:
# Mean replay loss recorded once per training episode.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(loss_history)
loss_ax.set(title="Training loss", xlabel="Episode", ylabel="Mean replay loss");

In [None]:
# Total environment reward collected in each training episode.
reward_fig, reward_ax = plt.subplots()
reward_ax.plot(reward_history)
reward_ax.set(title="Episode reward", xlabel="Episode", ylabel="Total reward");

In [None]:
# Greedy roll-out (no exploration) of target_dqn over one episode, followed by
# writing the episode's render logs.
done, truncated = False, False
curr_state, info = env.reset()
total_reward = 0
action_history = []
action_count = [0, 0, 0]  # [index < 5, index == 5, index > 5]

# Inference only: skip autograd bookkeeping for every forward pass.
with torch.no_grad():
    while not done and not truncated:
        action_index = torch.argmax(target_dqn(torch.Tensor(curr_state))).item()
        action_history.append(action_index)

        # Bucket the action relative to the middle index 5.
        if action_index > 5:
            bucket = 2
        elif action_index == 5:
            bucket = 1
        else:
            bucket = 0
        action_count[bucket] += 1

        next_state, reward, done, truncated, info = env.step(action_index)
        total_reward += reward
        curr_state = next_state

env.unwrapped.save_for_render(dir = "render_logs")

In [None]:
# Evaluation summary, one item per line: total reward, action buckets, full action trace.
print(total_reward, action_count, action_history, sep="\n")

In [None]:
# renderer = Renderer(render_logs_dir="render_logs")
# renderer.run()