In [131]:
import time
import copy
import numpy as np
import pandas as pd

import yfinance as yf

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

from plotly import tools
from plotly.graph_objs import *
from plotly.offline import init_notebook_mode, iplot, iplot_mpl

## Data loading and preprocessing

In [132]:
# loading data
start_date='2020-01-01'
freq = "1d"

data = yf.download(["BTC-USD"], 
                   start=start_date,
                   interval=freq)

data.head()

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01,7194.89209,7254.330566,7174.944336,7200.174316,7200.174316,18565664997
2020-01-02,7202.55127,7212.155273,6935.27002,6985.470215,6985.470215,20802083465
2020-01-03,6984.428711,7413.715332,6914.996094,7344.884277,7344.884277,28111481032
2020-01-04,7345.375488,7427.385742,7309.51416,7410.656738,7410.656738,18444271275
2020-01-05,7410.45166,7544.49707,7400.535645,7411.317383,7411.317383,19725074095


In [133]:
# splitting data into train and test
train = data[:'2023-12-31']
test = data['2024-01-01':]

print(f"Number of days in train sample: {len(train):5}")
print(f"Number of days in test sample : {len(test):5}")

Number of days in train sample:  1461
Number of days in test sample :    87


In [134]:
def plot_train_test(train, test, date_split):
    
    data = [
        Candlestick(x=train.index, open=train['Open'], high=train['High'], low=train['Low'], close=train['Close'], name='train'),
        Candlestick(x=test.index, open=test['Open'], high=test['High'], low=test['Low'], close=test['Close'], name='test')
    ]
    layout = {
         'shapes': [
             {'x0': date_split, 'x1': date_split, 'y0': 0, 'y1': 1, 'xref': 'x', 'yref': 'paper', 'line': {'color': 'rgb(0,0,0)', 'width': 1}}
         ],
        'annotations': [
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'left', 'text': ' test data'},
            {'x': date_split, 'y': 1.0, 'xref': 'x', 'yref': 'paper', 'showarrow': False, 'xanchor': 'right', 'text': 'train data '}
        ]
    }
    figure = Figure(data=data, layout=layout)
    iplot(figure)

In [135]:
plot_train_test(train, test, '2024-01-01')

In [136]:
class TradingEnv:
    
    def __init__(self, data, history_t=7, initial_capital=10000):
        self.data = data
        self.history_t = history_t
        self.initial_capital = initial_capital
        self.reset()
        
    def reset(self):
        self.t = 0
        self.done = False
        self.profit = 0
        self.positions = []
        self.position_value = 0
        self.history = [0 for _ in range(self.history_t)]
        return [self.position_value] + self.history # obs
    
    def step(self, act):
        reward = 0
        
        # act = {0: stay, 1: buy, 2: sell}

        # BUY
        if act == 1:
            if self.initial_capital - sum(self.positions) >= self.data.iloc[self.t]['Close']:
                reward = - self.data.iloc[self.t]['Close']
                self.positions.append(self.data.iloc[self.t]['Close'])
            else:
                reward = - 100000
        
        # SELL
        elif act == 2:
            if len(self.positions) == 0:
                reward = -1
            else:
                reward += self.data.iloc[self.t]['Close']*len(self.positions)
                self.profit += reward - sum(self.positions)
                self.positions = []

        # HOLD
        elif act == 0:
            reward = -1
        
        # predict next value
        self.position_value = sum(self.history)/self.history_t

        # update history
        self.history.pop(0)
        self.history.append(self.data.iloc[self.t]['Close'])

        if (self.t==len(self.data)-1):
            self.done=True

        # clipping reward
        if reward > 0:
            reward = 1
        elif reward < 0:
            reward = -1

        # set next time
        self.t += 1

        #print ("t={%d}, done={%str}"%(self.t,self.done))
        return [self.position_value] + self.history, reward, self.done # obs, reward, done
    

In [137]:
env = TradingEnv(train)
print(env.reset())
for _ in range(3):
    pact = np.random.randint(3)
    print(pact)
    print(env.step(pact))

[0, 0, 0, 0, 0, 0, 0, 0]
1
([0.0, 0, 0, 0, 0, 0, 0, 7200.17431640625], -1, False)
2
([1028.5963309151787, 0, 0, 0, 0, 0, 7200.17431640625, 6985.47021484375], 1, False)
1
([2026.5206473214287, 0, 0, 0, 0, 7200.17431640625, 6985.47021484375, 7344.88427734375], -1, False)


In [138]:
# network instead of matrix
# input_dim = length of observations
# output_dim = number of possible actions

# goal : go from observations to a score for each action

class Q_network(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim) -> None:
        super(Q_network, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        out = self.layers(x)
        return out

In [139]:
hidden_size=100
input_size=env.history_t+1
output_size=3
USE_CUDA = False
LR = 0.001

In [140]:
Q = Q_network(input_size, hidden_size, output_size)

In [141]:
Q_ast = copy.deepcopy(Q)

In [142]:
if USE_CUDA:
    Q = Q.cuda()
loss_function = nn.MSELoss()
optimizer = optim.Adam(list(Q.parameters()), lr=LR)

In [143]:
epoch_num = 10
step_max = len(env.data)-1
memory_size = 200
batch_size = 50
gamma = 0.97

In [144]:
obs, reward, done = env.step(5)

In [145]:
memory = []
total_step = 0
total_rewards = []
total_losses = []
epsilon = 1.0
epsilon_decrease = 1e-3
epsilon_min = 0.1
start_reduce_epsilon = 200
train_freq = 10
update_q_freq = 20
gamma = 0.97
show_log_freq = 5

In [146]:
start = time.time()
for epoch in range(epoch_num):

    pobs = env.reset()
    step = 0
    done = False
    total_reward = 0
    total_loss = 0

    while not done and step < step_max:

        # select act
        pact = np.random.randint(3)
        if np.random.rand() > epsilon:
            pact = Q(torch.from_numpy(np.array(pobs, dtype=np.float32).reshape(1, -1)))
            pact = np.argmax(pact.data)
            pact = pact.numpy()

        # act
        obs, reward, done = env.step(pact)

        # add memory
        memory.append((pobs, pact, reward, obs, done))
        if len(memory) > memory_size:
            memory.pop(0)

        # train or update q
        if len(memory) == memory_size:
            if total_step % train_freq == 0:
                shuffled_memory = np.random.permutation(memory)
                memory_idx = range(len(shuffled_memory))
                for i in memory_idx[::batch_size]:
                    batch = np.array(shuffled_memory[i:i+batch_size])
                    b_pobs = np.array(batch[:, 0].tolist(), dtype=np.float32).reshape(batch_size, -1)
                    b_pact = np.array(batch[:, 1].tolist(), dtype=np.int32)
                    b_reward = np.array(batch[:, 2].tolist(), dtype=np.int32)
                    b_obs = np.array(batch[:, 3].tolist(), dtype=np.float32).reshape(batch_size, -1)
                    b_done = np.array(batch[:, 4].tolist(), dtype=np.bool)

                    q = Q(torch.from_numpy(b_pobs))
                    q_ = Q_ast(torch.from_numpy(b_obs))
                    maxq = np.max(q_.data.numpy(),axis=1)
                    target = copy.deepcopy(q.data)
                    for j in range(batch_size):
                        target[j, b_pact[j]] = b_reward[j]+gamma*maxq[j]*(not b_done[j])
                    Q.zero_grad()
                    loss = loss_function(q, target)
                    total_loss += loss.data.item()
                    loss.backward()
                    optimizer.step()
                    
            if total_step % update_q_freq == 0:
                Q_ast = copy.deepcopy(Q)
                
            # epsilon
            if epsilon > epsilon_min and total_step > start_reduce_epsilon:
                epsilon -= epsilon_decrease

            # next step
            total_reward += reward
            pobs = obs
            step += 1
            total_step += 1

        total_rewards.append(total_reward)
        total_losses.append(total_loss)

        if (epoch+1) % show_log_freq == 0:
            log_reward = sum(total_rewards[((epoch+1)-show_log_freq):])/show_log_freq
            log_loss = sum(total_losses[((epoch+1)-show_log_freq):])/show_log_freq
            elapsed_time = time.time()-start
            print('\t'.join(map(str, [epoch+1, epsilon, total_step, log_reward, log_loss, elapsed_time])))
            start = time.time()
            
#return Q, total_losses, total_rewards


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations



5	0.0999999999999992	5643	-589612.4	24472023705.744884	17.032294750213623
5	0.0999999999999992	5644	-589612.8	24472023705.744884	0.0015175342559814453
5	0.0999999999999992	5645	-589613.4	24472023705.744884	0.0007939338684082031
5	0.0999999999999992	5646	-589613.8	24472023705.744884	0.0007860660552978516
5	0.0999999999999992	5647	-589614.4	24472023705.744884	0.0006487369537353516
5	0.0999999999999992	5648	-589615.2	24472023705.744884	0.0006465911865234375
5	0.0999999999999992	5649	-589615.8	24472023705.744884	0.0008451938629150391
5	0.0999999999999992	5650	-589616.6	24472023705.744884	0.0007572174072265625
5	0.0999999999999992	5651	-589617.6	24472024203.335133	0.02105998992919922
5	0.0999999999999992	5652	-589618.8	24472024700.925377	0.0010585784912109375
5	0.0999999999999992	5653	-589620.2	24472025198.515625	0.0006721019744873047
5	0.0999999999999992	5654	-589621.8	24472025696.105873	0.0006420612335205078
5	0.0999999999999992	5655	-589623.6	24472026193.696117	0.0006284713745117188
5	0.

In [147]:
test_env = TradingEnv(test)
pobs = test_env.reset()
test_acts = []
test_rewards = []

for _ in range(len(test_env.data)-1):
    
    pact = Q(torch.from_numpy(np.array(pobs, dtype=np.float32).reshape(1, -1)))
    pact = np.argmax(pact.data)
    test_acts.append(pact.item())
            
    obs, reward, done = test_env.step(pact.numpy())
    test_rewards.append(reward)

    pobs = obs

test_profits = test_env.profit

In [148]:
test_profits

22147.35546875

In [149]:
d = {0: 'Stay', 1: 'Buy', 2: 'Sell'}

for a, r in zip(test_acts, test_rewards):
    print(d[a], r)

Buy -1
Buy -1
Buy -1
Buy -1
Buy -1
Buy -1
Buy -1
Sell 1
Stay -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Stay -1
Stay -1
Sell -1
Sell -1
Sell -1
Stay -1
Sell -1
Stay -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Sell -1
Stay -1
Stay -1
Stay -1
Sell -1
Stay -1
Stay -1
Sell -1
Stay -1
Sell -1
Sell -1
Sell -1
Sell -1
Stay -1
Stay -1
Sell -1
Sell -1
Sell -1
Buy -1
Sell 1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Sell -1
Stay -1
Sell -1
Sell -1
Stay -1
Sell -1
Buy -1
