 # Deep Q Networks

 ## Imports

In [1]:
import gym, random, pickle, os.path, math, glob
import numpy as np

from timeit import default_timer as timer
from datetime import timedelta
from timeit import default_timer as timer

import torch
import torch.optim as optim

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.display import clear_output

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from utils.wrappers import *
from utils.hyperparameters import Config
from utils.plot import plot_all_data
from utils.plot import *

from agents.BaseAgent import BaseAgent
from functions import *
import csv

In [2]:
def save_profit(log_dir, step, profit):
    """Append one ``(step, profit)`` row to ``profit.csv`` inside *log_dir*.

    Args:
        log_dir: Directory that holds (or will hold) ``profit.csv``.
        step: Value written to the first column (the caller passes the frame index).
        profit: Value written to the second column.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by an empty line on Windows.
    with open(os.path.join(log_dir, 'profit.csv'), 'a', newline='') as f:
        csv.writer(f).writerow((step, profit))
        
def plot_sigma(profit_TJ, log_dir, frame_idx, title):
    """Plot mean/std bands of the cumulative per-step series and save the figure.

    Only the last 15 episodes in *profit_TJ* are used. The most recent episode
    is drawn in red on top of the mean and the one- and two-sigma bands.

    Args:
        profit_TJ: Sequence of equally long 1-D arrays, one per episode.
        log_dir: Base output directory.
        frame_idx: Frame index, used in the output file name.
        title: Y-axis label; also selects the ``<title>_log`` sub-folder and
            the file-name prefix.
    """
    episode_len = profit_TJ[0].shape[0]
    recent = np.reshape(profit_TJ, (-1, episode_len))[-15:]
    cumulative = recent.cumsum(axis=1)
    t = np.arange(cumulative.shape[1])

    mean = cumulative.mean(axis=0)
    std = cumulative.std(axis=0)

    fig, ax = plt.subplots(1, figsize=(20, 5))
    ax.plot(t, mean, lw=2, label='mean', color='blue', alpha=0.5)
    ax.plot(t, cumulative[-1], lw=2, label='now', color='red')
    ax.plot(t, np.zeros(len(t)), color='black', alpha=0.5)  # zero reference line
    ax.fill_between(t, mean + std, mean - std, facecolor='blue', alpha=0.3)
    ax.fill_between(t, mean + std * 2, mean - std * 2, facecolor='blue', alpha=0.1)
    ax.set_title('Mean & Std')
    ax.legend(loc='upper left')
    ax.set_xlabel('num steps')
    ax.set_ylabel(title)

    # Make sure the target folder exists so savefig cannot fail on a fresh run.
    out_dir = log_dir + '/' + title + '_log'
    os.makedirs(out_dir, exist_ok=True)
    # Save before showing: some backends release the figure once show() returns.
    fig.savefig(out_dir + '/' + title + '_' + str(frame_idx) + '.png')
    plt.show()
    plt.close(fig)

def run_testing(model, env, action_log_dir=None):
    """Run one greedy (epsilon = 0) episode of *model* on *env*.

    Args:
        model: Agent exposing ``get_action``, ``finish_nstep`` and ``reset_hx``.
        env: Environment exposing ``reset``, ``step``, ``profit_df`` and
            ``writeaction``.
        action_log_dir: Directory handed to ``env.writeaction``. When omitted,
            the notebook-level global ``log_dir`` is used, as before.

    Returns:
        Tuple ``(total_profit, env)`` where ``total_profit`` is
        ``env.profit_df.sum()`` after the episode.
    """
    observation = env.reset()
    while True:
        action = model.get_action(observation, 0)
        observation, _reward, done, _ = env.step(action)
        if done:
            break

    total_profit = env.profit_df.sum()
    print ('Testing: profit: ', total_profit)
    model.finish_nstep()
    model.reset_hx()
    # Falling back to the module-level ``log_dir`` keeps existing two-argument
    # calls working unchanged.
    env.writeaction(log_dir if action_log_dir is None else action_log_dir)
    return total_profit, env

In [3]:
class action_space:
    """Tiny action-space holder that exposes only ``n``."""

    def __init__(self, a):
        # ``a`` is stored untouched; Model reads it back as ``env.action_space.n``.
        self.n = a

class environment():
    """Single-asset trading environment with a gym-like ``reset``/``step`` API.

    Actions index the ``['sell', 'hold', 'buy']`` columns: 0 = short,
    1 = flat, 2 = long. Feature columns are min-max scaled once at
    construction; profits are converted back using the ``open`` range and the
    raw first-day ``open`` value.

    Misspelled identifiers (``transection``, ``holding_prize``,
    ``get_shapre``) are kept because they are part of the public interface.
    """

    def __init__(self, df, log_dir, firstday, lastday, transection=0.001, delay=10, training=1):
        """Store the configuration, copy *df* and normalise its feature columns.

        Args:
            df: Feature table; must contain ``open`` plus the bookkeeping
                columns ``sell``, ``hold``, ``buy``, ``profit``, ``allocation``.
            log_dir: Directory kept on the instance as ``self.log_dir``.
            firstday: First row used for an episode.
            lastday: Last row (exclusive bound of the episode).
            transection: Proportional fee applied to the held price.
            delay: Number of consecutive rows stacked into one observation.
            training: Truthy to sub-sample rows on every ``reset``.
        """
        self.delay = delay
        self.n_features = df.shape[1]

        self.firstday_orig = firstday
        self.lastday_orig = lastday
        self.training = training

        self.action_space = action_space(3)
        self.observation_space = Box(high=5000, low=-5000, shape=(1, self.n_features*delay))
        self.df_orig = df.copy().reset_index(drop=True)

        self.log_dir = log_dir
        self.env_id = None
        self.today = None
        self.position = None
        self.holding_prize = None
        self.profit_df = np.array([])
        self.reward_df = np.array([])
        self.action_df = np.array([])
        self.transection = transection
        self.counter = 0
        self.reward_list = []
        self.profit_list = []
        self.action_list = []
        self.buffer = []
        self.init()

    def init(self):
        """Min-max scale every non-bookkeeping column of ``df_orig``."""
        excluded = ('sell', 'hold', 'buy', 'profit', 'allocation')
        cols = [col for col in self.df_orig.columns if col not in excluded]
        # Copy the raw first-day row so the scaling below can never alter it;
        # step() divides by its unscaled ``open`` value.
        self.head = self.df_orig.iloc[self.firstday_orig].copy()
        self.min_ = self.df_orig[cols].min()
        self.max_ = self.df_orig[cols].max()
        self.range_ = self.max_ - self.min_
        scaled_cols = list(self.range_.index)
        # A constant column has range 0; divide by 1 instead so it becomes all
        # zeros rather than NaN. ``self.range_`` itself is left untouched.
        safe_range = self.range_.replace(0, 1)
        self.df_orig[scaled_cols] = (self.df_orig[scaled_cols] - self.min_) / safe_range

    def get_shapre(self, profit):
        """Return mean/std of the last (up to five) values passed in.

        Returns 0 until two values have been seen, and 0 when the std is 0.
        """
        self.buffer.append(profit)
        if len(self.buffer) <= 1:
            return 0
        if len(self.buffer) > 5:
            self.buffer = self.buffer[1:]
        std = np.std(self.buffer)
        if std == 0:
            return 0
        return np.mean(self.buffer) / std

    def step(self, action):
        """Advance one row and return ``(observation, reward, done, info)``.

        On the terminal step the per-episode logs are archived and
        ``(None, 0, 1, {})`` is returned. ``info`` is always an empty dict.
        """
        self.counter += 1
        info = {}
        if self.today == self.lastday - 1:
            # Episode finished: archive this episode's logs.
            self.reward_list.append(self.reward_df)
            self.profit_list.append(self.profit_df)
            self.action_list.append(self.action_df)
            return None, 0, 1, info

        done = 0
        if self.today == self.lastday - 2:
            # Force "hold" on the last tradable row so any open position is closed.
            action = 1
        old_position = self.position
        new_position = action
        old_holding = self.holding_prize
        today_open = self.df.loc[self.today, 'open']
        next_open = self.df.loc[self.today + 1, 'open']

        # A fee is due whenever an open (non-flat) position is left, including
        # direct short<->long reversals.
        transection_fee = 0
        if old_position != 1 and old_position != new_position:
            transection_fee = self.holding_prize * self.transection

        # Profit is realised only when an open position is closed or reversed.
        if old_position == 1 or old_position == new_position:
            profit = 0
        else:
            profit = (next_open - old_holding) * (old_position - 1) - transection_fee
        # Undo the min-max scaling and express relative to the first-day open.
        profit = profit * self.range_.open / self.head.open

        # Track the entry price of the position held after this step.
        if new_position == 1:
            self.holding_prize = 0
        elif old_position != new_position:
            self.holding_prize = next_open

        # Log-return style reward for the position held into the next row.
        # NOTE(review): prices are min-max scaled, so a scaled open of 0 makes
        # the ratio 0 or infinite here — confirm how that should be handled.
        if new_position == 1:
            reward = 0
        elif new_position == 0:
            reward = np.log(today_open / (next_open + transection_fee))
        else:
            reward = np.log((next_open - transection_fee) / today_open)

        self.log_PRA(profit, reward, action)
        self.set_PRA(profit, self.holding_prize, action)
        self.position = action
        observation = self.get_state_seq(self.today + 1)
        self.today += 1
        return observation, reward, done, info

    def get_state_seq(self, step):
        """Return the ``delay`` rows ending at *step*, flattened to shape (1, -1)."""
        step += 1
        if step < self.delay and self.training:
            # Not enough sampled rows yet: pad the front with the rows that
            # precede ``firstday_orig`` in the full table.
            import pandas as pd  # local import; DataFrame.append no longer exists in pandas 2.x
            head = self.delay - step
            header = self.df_orig[self.firstday_orig - head:self.firstday_orig]
            tailer = self.df[:self.delay - head]
            state = pd.concat([header, tailer]).values.flatten()
        else:
            state = self.df.iloc[step - self.delay:step].values.flatten()
        return np.expand_dims(state, 0)

    def set_PRA(self, profit, allocation, action):
        """Write profit, allocation and the one-hot action into the next row."""
        self.df.loc[self.today + 1, 'profit'] = profit
        self.df.loc[self.today + 1, 'allocation'] = allocation
        one_hot = [0, 0, 0]
        one_hot[action] = 1
        self.df.loc[self.today + 1, ['sell', 'hold', 'buy']] = one_hot

    def log_PRA(self, profit, reward, action):
        """Append profit, reward and action to the current-episode logs."""
        self.profit_df = np.append(self.profit_df, profit)
        self.reward_df = np.append(self.reward_df, reward)
        self.action_df = np.append(self.action_df, action)

    def reset(self):
        """Start a new episode (flat position) and return the first observation."""
        self.sub_df()
        self.today = self.firstday
        self.position = 1
        self.holding_prize = 0
        self.buffer = []

        self.profit_df = np.array([])
        self.reward_df = np.array([])
        self.action_df = np.array([])
        return self.get_state_seq(self.today)

    def close(self):
        """No resources to release."""
        pass

    def writeaction(self, log_dir):
        """Append the last archived episode's actions as one row of ``action.csv``."""
        # newline='' avoids blank rows between records on Windows.
        with open('%s/%s.csv' % (log_dir, 'action'), 'a', newline='') as f:
            csv.writer(f).writerow(self.action_list[-1])

    def sub_df(self):
        """Select the rows used for the next episode.

        In training mode a quarter of the window is drawn (with replacement,
        then sorted) and re-indexed from 0; otherwise the full table and the
        original day bounds are used.
        """
        if self.training:
            n_samples = int((self.lastday_orig - self.firstday_orig) / 4)
            sub_inx = np.random.choice(range(self.firstday_orig + 1, self.lastday_orig), n_samples)
            sub_inx.sort()
            self.df = self.df_orig.iloc[sub_inx].copy().reset_index(drop=True)
            self.firstday = 0
            self.lastday = self.df.shape[0]
        else:
            self.df = self.df_orig
            self.firstday = self.firstday_orig
            self.lastday = self.lastday_orig

In [5]:
#torch.cuda.set_device(1)

 ## Hyperparameters

In [6]:
config = Config()

# Use the GPU when torch can see one, otherwise the CPU.
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Epsilon-greedy exploration: exponential decay from start to final.
config.epsilon_start    = 1.0
config.epsilon_final    = 0.1
config.epsilon_decay    = 30000


def _epsilon_by_frame(frame_idx):
    """Return the exploration rate for *frame_idx* (reads config at call time)."""
    return config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)


config.epsilon_by_frame = _epsilon_by_frame

# Discount factor and optimiser learning rate.
config.GAMMA = 0.99
config.LR    = 1e-4

# Replay memory and target-network refresh.
config.TARGET_NET_UPDATE_FREQ = 1000
config.EXP_REPLAY_SIZE        = 25000
config.BATCH_SIZE             = 32

# Learning control: warm-up frames, total frames, update interval.
config.LEARN_START = 10000
config.MAX_FRAMES  = 250000
config.UPDATE_FREQ = 1

# Data logging.
config.ACTION_SELECTION_COUNT_FREQUENCY = 1000


 ## Replay Memory

In [7]:
class ExperienceReplayMemory:
    """Fixed-capacity replay buffer with uniform random sampling.

    Once full, the oldest transition is overwritten in place (ring buffer),
    so ``push`` stays O(1) instead of shifting the whole list on every call.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self._next = 0  # slot holding the oldest transition once the buffer is full

    def push(self, transition):
        """Store *transition*, evicting the oldest entry when at capacity."""
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        elif self.capacity > 0:
            self.memory[self._next] = transition
            self._next = (self._next + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``(transitions, None, None)``; the two ``None`` values fill
        the indices/weights slots that callers unpack.

        Raises:
            ValueError: If *batch_size* exceeds the number of stored items.
        """
        return random.sample(self.memory, batch_size), None, None

    def __len__(self):
        return len(self.memory)


 ## Network Declaration

In [8]:
class DQN(nn.Module):
    """Fully connected Q-network: three 256-unit ReLU layers and a linear head."""

    def __init__(self, input_shape, num_actions):
        super().__init__()

        # Only the second entry of the observation shape is the feature width.
        self.input_shape = input_shape[1]
        self.num_actions = num_actions

        width = 256
        self.fc1 = self._linear(self.input_shape, width)
        self.fc2 = self._linear(width, width)
        self.fc3 = self._linear(width, width)
        self.out = self._linear(width, num_actions)

    @staticmethod
    def _linear(n_in, n_out):
        """Build a Linear layer and redraw its weights from N(0, 0.1)."""
        layer = nn.Linear(n_in, n_out)
        layer.weight.data.normal_(0, 0.1)
        return layer

    def forward(self, x):
        """Return one Q-value per action for every row of *x*."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))
        return self.out(x)

 ## Agent

In [9]:
class Model(BaseAgent):
    """DQN agent: online and target Q-networks, replay memory, one-step TD updates.

    Logging helpers such as ``save_td`` and ``save_sigma_param_magnitudes`` are
    not defined here — presumably inherited from ``BaseAgent``; verify there.
    """

    def __init__(self, static_policy=False, env=None, config=None, log_dir='/tmp/gym'):
        """Build networks, optimiser and replay memory from *config* and *env*.

        Args:
            static_policy: When true, networks are put in eval mode and
                ``update`` becomes a no-op.
            env: Environment providing ``observation_space.shape`` and
                ``action_space.n``.
            config: Hyperparameter container (device, GAMMA, LR, ...).
            log_dir: Forwarded to ``BaseAgent``.
        """
        super(Model, self).__init__(config=config, env=env, log_dir=log_dir)
        self.device = config.device

        self.gamma = config.GAMMA
        self.lr = config.LR
        self.target_net_update_freq = config.TARGET_NET_UPDATE_FREQ
        self.experience_replay_size = config.EXP_REPLAY_SIZE
        self.batch_size = config.BATCH_SIZE
        self.learn_start = config.LEARN_START
        self.update_freq = config.UPDATE_FREQ

        self.static_policy = static_policy
        self.num_feats = env.observation_space.shape
        self.num_actions = env.action_space.n
        self.env = env

        self.declare_networks()

        # Start the target network as an exact copy of the online network.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=1e-8)

        # move to correct device
        self.model = self.model.to(self.device)
        self.target_model.to(self.device)

        if self.static_policy:
            self.model.eval()
            self.target_model.eval()
        else:
            self.model.train()
            self.target_model.train()

        self.update_count = 0

        self.declare_memory()

    def declare_networks(self):
        """Create the online and target Q-networks."""
        self.model = DQN(self.num_feats, self.num_actions)
        self.target_model = DQN(self.num_feats, self.num_actions)

    def declare_memory(self):
        """Create the replay memory."""
        self.memory = ExperienceReplayMemory(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_):
        """Store one transition; ``s_`` is ``None`` for a terminal step."""
        self.memory.push((s, a, r, s_))

    def prep_minibatch(self):
        """Sample a minibatch and convert it to tensors on ``self.device``.

        Returns:
            ``(batch_state, batch_action, batch_reward, non_final_next_states,
            non_final_mask, empty_next_state_values, indices, weights)``.
            ``non_final_next_states`` is ``None`` and
            ``empty_next_state_values`` is true when every sampled transition
            is terminal.
        """
        # random transition batch is taken from experience replay memory
        transitions, indices, weights = self.memory.sample(self.batch_size)

        batch_state, batch_action, batch_reward, batch_next_state = zip(*transitions)

        shape = (-1,)+self.num_feats

        # Stack with numpy first: building a tensor from a sequence of
        # ndarrays element by element is very slow.
        batch_state = torch.tensor(np.array(batch_state), device=self.device, dtype=torch.float).view(shape)
        batch_action = torch.tensor(batch_action, device=self.device, dtype=torch.long).squeeze().view(-1, 1)
        batch_reward = torch.tensor(batch_reward, device=self.device, dtype=torch.float).squeeze().view(-1, 1)

        # Boolean mask: uint8 index masks are deprecated in current PyTorch.
        non_final_mask = torch.tensor([s is not None for s in batch_next_state], device=self.device, dtype=torch.bool)
        non_final = [s for s in batch_next_state if s is not None]
        # Explicit emptiness check instead of a bare ``except`` so genuine
        # conversion errors are no longer swallowed.
        if non_final:
            non_final_next_states = torch.tensor(np.array(non_final), device=self.device, dtype=torch.float).view(shape)
            empty_next_state_values = False
        else:
            non_final_next_states = None
            empty_next_state_values = True

        return batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights

    def compute_loss(self, batch_vars):
        """Return the mean of ``0.5 * TD-error**2`` over the minibatch."""
        batch_state, batch_action, batch_reward, non_final_next_states, non_final_mask, empty_next_state_values, indices, weights = batch_vars

        # estimate
        current_q_values = self.model(batch_state).squeeze().gather(1, batch_action)

        # target (terminal transitions keep a next-state value of 0)
        with torch.no_grad():
            max_next_q_values = torch.zeros(self.batch_size, device=self.device, dtype=torch.float).unsqueeze(dim=1)
            if not empty_next_state_values:
                max_next_action = self.get_max_next_state_action(non_final_next_states)
                max_next_q_values[non_final_mask] = self.target_model(non_final_next_states).squeeze().gather(1, max_next_action)
            expected_q_values = batch_reward + self.gamma*max_next_q_values

        diff = (expected_q_values - current_q_values)
        loss = self.MSE(diff)
        loss = loss.mean()

        return loss

    def update(self, s, a, r, s_, frame=0):
        """Store the transition and, once warm-up is over, run one optimiser step.

        Returns ``None`` in all cases.
        """
        if self.static_policy:
            return None

        self.append_to_replay(s, a, r, s_)

        if frame < self.learn_start or frame % self.update_freq != 0:
            return None

        batch_vars = self.prep_minibatch()

        loss = self.compute_loss(batch_vars)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.model.parameters():
            # Element-wise gradient clipping; skip parameters without a gradient.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.update_target_model()
        self.save_td(loss.item(), frame)
        self.save_sigma_param_magnitudes(frame)

    def get_action(self, s, eps=0.1):
        """Epsilon-greedy action for observation *s* (always greedy if static)."""
        with torch.no_grad():
            if np.random.random() >= eps or self.static_policy:
                # Add a leading batch axis, equivalent to wrapping ``s`` in a list.
                X = torch.tensor(np.expand_dims(np.asarray(s), 0), device=self.device, dtype=torch.float)
                a = self.model(X).max(2)[1].view(1, 1)
                return a.item()
            else:
                return np.random.randint(0, self.num_actions)

    def update_target_model(self):
        """Copy online weights into the target net every ``target_net_update_freq`` calls."""
        self.update_count+=1
        self.update_count = self.update_count % self.target_net_update_freq
        if self.update_count == 0:
            self.target_model.load_state_dict(self.model.state_dict())

    def get_max_next_state_action(self, next_states):
        """Return the target network's greedy action index per next state."""
        return self.target_model(next_states).max(dim=2)[1].view(-1, 1)

    def finish_nstep(self):
        """No-op hook kept for interface compatibility."""
        pass

    def reset_hx(self):
        """No-op hook kept for interface compatibility."""
        pass

    def MSE(self, x):
        """Element-wise ``0.5 * x**2``."""
        return 0.5 * x.pow(2)


 ## Training Loop

In [10]:
import pandas as pd
from functions import *
# Load the market data table. ``load_data`` is not defined in this notebook —
# presumably it comes from the star import of ``functions``; verify there.
# Later cells read the ``assetName`` and ``date`` columns of the result.
data_0 = load_data('../NAS10_mkt.csv')

In [11]:
#cols = ['open', 'high', 'low', 'close', 'volume', 'sell', 'hold', 'buy', 'profit', 'allocation']

In [12]:
# Output layout: <exp_result_path>/<project_name>/<asset>/<sub_folder>.
exp_result_path = 'exp_'
project_name = '0_DQN'
# One experiment is run per distinct asset name in the data.
assets = data_0.assetName.unique()
# plot_sigma writes into 'profit_log' and 'reward_log'.
sub_folder = ['exp_plot', 'saved_agents', 'profit_log', 'reward_log']
# ``mkdir`` is not defined in this notebook — presumably a project helper that
# creates the folder tree; verify in ``functions``.
mkdir(exp_result_path, project_name, assets, sub_folder)

FileNotFoundError: [WinError 3] 系統找不到指定的路徑。: 'exp_/0_DQN/Tesla, Inc. /exp_plot'

In [None]:
# Train/test day bounds are taken from the 'Apple Inc.' rows only and then
# reused for every asset in the training loop.
# NOTE(review): these are index *labels* of data_0, while environment resets
# its index and uses them as row positions — confirm they line up for every
# asset, not just the first one in the table.
df = data_0[data_0.assetName=='Apple Inc.']
train_firstday = df[df.date>=('2014-01-01')].index[0]
train_lastday = df[df.date<=('2017-12-31')].index[-1]

test_firstday = df[df.date>=('2018-01-01')].index[0]
test_lastday = df[df.date<=('2018-12-31')].index[-1]
# Columns handed to the environment. 'sell', 'hold', 'buy', 'profit' and
# 'allocation' are bookkeeping columns that environment excludes from scaling.
cols =['open', 'high', 'low', 'close', 'volume', 'month', 'day', 'dayofweek', 'sell',
       'hold', 'buy', 'profit', 'allocation', 'SMA', 'macd', 'macdsignal',
       'macdhist', 'RSI', 'WILLR', 'STOCH_D', 'STOCH_K', 'ROCP']
# Number of consecutive rows stacked into one observation.
delay = 50

In [None]:
# Train one agent per asset, evaluating periodically on the test window.
for asset in assets:
    start = timer()

    log_dir = '%s/%s/%s' % (exp_result_path, project_name, asset)
    env_id = '0_DQN'

    # Rows of this asset, passed through TI before the feature columns are picked.
    data = data_0[data_0.assetName == asset]
    data = TI(data)

    env = environment(data[cols], log_dir, train_firstday, train_lastday, delay=delay, training=False)
    env2 = environment(data[cols], '', test_firstday, test_lastday, delay=delay, training=False)
    env.env_id = env_id

    model = Model(env=env, config=config, log_dir=log_dir)

    episode_reward = 0
    observation = env.reset()
    for frame_idx in range(1, config.MAX_FRAMES + 1):
        epsilon = config.epsilon_by_frame(frame_idx)

        action = model.get_action(observation, epsilon)
        model.save_action(action, frame_idx)  # log action selection

        prev_observation = observation
        observation, reward, done, _ = env.step(action)
        if done:
            # A terminal transition is stored with no next state.
            observation = None
        model.update(prev_observation, action, reward, observation, frame_idx)
        episode_reward += reward

        if done:
            model.finish_nstep()
            model.reset_hx()
            observation = env.reset()
            model.save_reward(env_id, frame_idx, episode_reward)
            episode_reward = 0

        at_plot_step = frame_idx % 10000 == 0
        if at_plot_step:
            model.save_w()
            try:
                clear_output(True)
                plot_all_data(log_dir, env_id, '0_DQN', config.MAX_FRAMES, bin_size=(10, 100, 100, 1), smooth=1, time=timedelta(seconds=int(timer()-start)), ipynb=True)
            except IOError:
                # Plotting is best-effort; a missing log file must not stop training.
                pass

        # Evaluate every 1000 frames once an eighth of the frame budget is used.
        if frame_idx % 1000 == 0 and frame_idx > config.MAX_FRAMES/8:
            Total_profit, env2 = run_testing(model, env2)
            save_profit(log_dir, frame_idx, env2.profit_df.sum())
            if at_plot_step:
                plot_sigma(env2.reward_list, log_dir, frame_idx, 'reward')
                plot_sigma(env2.profit_list, log_dir, frame_idx, 'profit')
# Final summary plot; uses the log_dir and timer of the last asset processed.
plot_all_data(log_dir, env_id, '0_DQN', config.MAX_FRAMES, bin_size=(10, 100, 100, 1), smooth=1, time=timedelta(seconds=int(timer()-start)), ipynb=True)