In [185]:
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim

class DQN(nn.Module):
    def __init__(self, state_size, action_size, n_assets):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, 24)

        self.logits_layer =  nn.ModuleList()
        self.softmax = nn.Softmax()

        for i in range(n_assets):
            self.logits_layer.append(nn.Linear(24, action_size))


    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        outputs = []
        for i in range(len(self.logits_layer)):
            outputs.append(self.softmax(self.logits_layer[i](x)))
        return torch.stack(outputs)

class DQNAgent:
    def __init__(self, state_size, action_size = 3, n_assets = 2):
        
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # Discount rate
        self.epsilon = 1.0   # Exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        self.network = DQN(state_size, action_size, n_assets)
        self.target_network = DQN(state_size, action_size, n_assets)
    
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.network.parameters(), lr=0.001)
        self.n_assets = n_assets

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        actions = []
        state = torch.from_numpy(state[2]).float().unsqueeze(0)
        q_values = self.network(state)
        for i in range(self.n_assets):
            if np.random.rand() <= self.epsilon:
                actions.append(random.randrange(self.action_size))
            else:
                actions.append(q_values[i].max(1)[1].item())
        return actions 

    def replay(self, batch_size = 10):
        if len(self.memory) < batch_size:
            return
        
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            
            state = torch.from_numpy(state[2]).float().unsqueeze(0)
            next_state = torch.from_numpy(next_state[2]).float().unsqueeze(0)
            q_expected = self.network(state)
            q_expected_target = q_expected.clone().detach()
            q_next = self.network(next_state)       
            
            for i, a in enumerate(action):
                q_target = reward
                if not done:
                    q_target = (reward + self.gamma * q_next[i].detach().max())
                
                q_expected_target[i][0][a] = q_target
            
            loss = self.criterion(q_expected, q_expected_target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# Define your environment and data
# You'll need to replace these placeholders with your actual environment and data



In [189]:
env = TradingEnvironment(tickers = ["BTC-USD", "ETH-USD"], start_date="2012-01-01", end_date="2021-05-01")


state_size = 2  # Example state size
action_size = 3  # Example action size
n_assets = 2
agent = DQNAgent(state_size, action_size, n_assets)
EPISODES = 200
# Training loop
# Replace this with your actual training loop using your data
for episode in range(EPISODES):
    state = env.reset()
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        reward = reward if not done else -10
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, running capital: {}, e: {:.2}"
                  .format(episode, EPISODES, env.balance + env.current_price * env.position, agent.epsilon))
            break
    agent.replay()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
  outputs.append(self.softmax(self.logits_layer[i](x)))


In [126]:
dqn = DQN(state_size=2,action_size=3,n_assets=2)
dqn(torch.tensor([2.0,5.0]))

  outputs.append(self.softmax(self.logits_layer[i](x)))


tensor([[0.3924, 0.2729, 0.3347],
        [0.2977, 0.2972, 0.4051]], grad_fn=<StackBackward0>)

In [160]:
agent = DQNAgent(state_size=2)
agent.act((_, _, np.array([2.0,5.0])))

  outputs.append(self.softmax(self.logits_layer[i](x)))


[1, 0]

In [172]:
import yfinance as yf
import numpy as np

class TradingEnvironment:
    def __init__(self, tickers, start_date, end_date):
        self.tickers = tickers
        self.start_date = start_date
        self.end_date = end_date
        self.data = self._load_data()
        self.reset()

    def _load_data(self):
        
        stocks_data = []

        for ticker in self.tickers:
            stocks_data.append(yf.download(ticker, 
                                 start=self.start_date, 
                                 end=self.end_date))

        return stocks_data

    def reset(self):
        self.current_step = 0
        self.total_steps = len(self.data[0]) - 1
        self.balance = 10000  # Initial account balance
        self.positions = [0 for _ in self.data]# Initial position
        self.current_prices = [self.data[i]['Close'][self.current_step] for i in range(len(self.data))]
        self.done = False
        return self._get_observation()

    def _get_observation(self):
        observation = (self.balance, self.positions, np.array(self.current_prices))
        return observation

    def step(self, actions):
        if self.done:
            raise ValueError("Episode is done, call reset() to start a new episode")

        self.current_step += 1
        if self.current_step > self.total_steps:
            self.done = True
            next_state = None
            reward = 0
            return next_state, reward, self.done

        self.current_prices = [self.data[i]['Close'][self.current_step] for i, _ in enumerate(self.data)]

        for i, action in enumerate(actions):
            if action == 0:  # Buy
                if self.balance >= self.current_prices[i]:
                    self.positions[i] += 1
                    self.balance -= self.current_prices[i]
            elif action == 1:  # Sell
                if self.positions[i] > 0:
                    self.positions[i] -= 1
                    self.balance += self.current_prices[i]
            elif action == 2: # Hold
                pass

        next_state = self._get_observation()

        # Calculate reward
        reward = 0
        for i, _ in enumerate(self.data):
            reward += (self.positions[i] * self.current_prices[i])   # Reward based on account balance
        reward += self.balance - 10000
        return next_state, reward, self.done


In [50]:
env = TradingEnvironment(tickers = ["BTC-USD", "ETH-USD"], start_date="2019-01-01", end_date="2021-05-01")
env.step([0,0])

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


((5901.542892456055, [1, 1], [3943.409423828125, 155.0476837158203]),
 0.0,
 False)

In [92]:
env.step([2,2])

((9703.848777770996, [0, 0], [3632.070556640625, 122.55360412597656]),
 -296.1512222290039,
 False)

In [14]:
env = TradingEnvironment(ticker = "ETH-USD", start_date="2015-01-01", end_date="2021-05-01")


[*********************100%%**********************]  1 of 1 completed


In [179]:
env = TradingEnvironment(tickers = ["BTC-USD", "ETH-USD"], start_date="2019-01-01", end_date="2021-05-01")


state_size = 2  # Example state size
action_size = 3  # Example action size
n_assets = 2
agent = DQNAgent(state_size, action_size, n_assets)
EPISODES = 50
# Training loop
# Replace this with your actual training loop using your data
for episode in range(EPISODES):
    state = env.reset()
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        reward = reward if not done else -10
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, running capital: {}, e: {:.2}"
                  .format(episode, EPISODES, env.balance + env.current_price * env.position, agent.epsilon))
            break
    agent.replay()


[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed

tensor([[[2.6474e-04, 9.9973e-01, 2.3970e-06]],

        [[0.0000e+00, 1.0000e+00, 0.0000e+00]]])
tensor([[[1.3716e+03, 9.9973e-01, 2.3970e-06]],

        [[0.0000e+00, 1.0000e+00, 0.0000e+00]]])



  outputs.append(self.softmax(self.logits_layer[i](x)))


IndexError: index 1 is out of bounds for dimension 0 with size 1

In [12]:
env = TradingEnvironment(ticker = "ETH-USD", start_date="2021-11-01", end_date="2022-08-01")


[*********************100%%**********************]  1 of 1 completed


In [214]:
state = env.reset()

l = []

for time in range(200):
      action = agent.act(state)
      next_state, reward, done = env.step(action)
      state = next_state
      l.append(state[0])



9700.747009277344
9386.066009521484
9078.158020019531
8761.442016601562
8423.811004638672
8757.167999267578
8426.24398803711
8758.638000488281
8411.026000976562
8056.6400146484375
7689.910003662109
7329.509002685547
6948.856994628906
6538.691009521484
6063.779998779297
6530.055999755859
6530.055999755859
7010.4110107421875
7153.2340087890625
7061.898010253906
7509.012023925781
7975.552032470703
8439.001037597656
8904.854034423828
9375.058044433594
8911.777038574219
8483.189025878906
7654.008026123047
7580.533020019531
7107.031005859375
6665.309997558594
5760.628021240234
6794.904052734375
6794.904052734375
7490.720031738281
7490.720031738281
8186.929016113281
8559.632995605469
7410.283966064453
7763.272979736328
8939.976989746094
9761.039978027344
9086.179992675781
9805.567016601562
10499.715026855469
11265.549011230469
12039.385009765625
12802.226989746094
12065.203979492188
12818.795959472656
13536.052978515625
12779.320007324219
13551.960998535156
12667.517028808594
13630.2369995117

  outputs.append(self.softmax(self.logits_layer[i](x)))


In [165]:
env.balance + env.current_price * env.position

10000.0