In [24]:
# db connection

import pymysql
from sqlalchemy import create_engine
import keyring
import platform
import numpy as np

user = 'root'
pw = keyring.get_password('macmini_db', user)
host = '192.168.219.106' if platform.system() == 'Windows' else '127.0.0.1'
port = 3306
db = 'stock'


# # connect DB
# engine = create_engine(f'mysql+pymysql://{self.user}:{self.pw}@{self.host}:{self.port}/{self.db}')

# con = pymysql.connect(
#     user=user,
#     passwd=pw,
#     host=host,
#     db=db,
#     charset='utf8'
# )
        
# mycursor = con.cursor()

# COLUMNS

In [25]:
# base data
COLUMNS_STOCK_DATA = ['date', 'open', 'high', 'low', 'close', 'volume']
COLUMNS_MARKET_DATA = [
    'kospi', 'kosdaq', 'dow', 'snp',
    'nikkei', 'hangseng', 'ftse', 'cac',
    'dax', 'enronext', 'vix', 
    'shanghai',
]
COLUMNS_BOND_DATA = [
    'bond_us_13w', 'bond_us_5y', 'bond_us_10y',
    'bond_us_30y', 'bond_kr_3y'
]
COLUMNS_EXCHANGE_DATA = [
    'usd_eur', 'usd_gbr', 'usd_jpy', 'usd_cnh', 'usd_kor',
]
COLUMNS_VALUE_DATA = [
    'per', 'pbr', 'roe', 'dy', 'pcr', 'psr', 'marekt_cap',
    'roa',
]
COLUMNS_FACTOR_DATA = [
    'beta', 'value', 'moment', 'qaulity', 'volatility',
]
COLUMNS_TECH_DATA = [
    'upperbb', 'lowerbb', 'bb_pb', 'bb_width', 'macd',
    'rsi', 'mfi', 'ii', 'buy_strength', 'sell_strength'
]
COLUMNS_SECTOR_DATA = [
    'xlk', 'xlv', 'xly', 'xlp',
    'xle', 'xlf', 'xli', 'xlb',
    'xlre', 'xlu',
]
COLUMNS_COMMODITY_DATA = [
    'oil', 'gold',
]

# moving average data
COLUMNS_STOCK_ROLLING_DATA = [
    'close_ma5_ratio', 'close_ma10_ratio', 'close_ma20_ratio',
    'close_ma60_ratio', 'close_ma120_ratio', 'close_ma240_ratio',
    'volume_ma5_ratio', 'volume_ma10_ratio', 'volumne_ma20_ratio',
    'volume_ma60_ratio', 'volume_ma120_ratio', 'volume_ma240_ratio',
]
COLUMNS_MARKET_ROLLING_DATA = [
    'market_kospi_ma5_ratio', 'market_kospi_ma10_ratio', 'market_kospi_ma20_ratio',
    'market_kospi_ma60_ratio', 'market_kospi_ma120_ratio', 'market_kospi_ma120_ratio',
    'market_kosdaq_ma5_ratio', 'market_kosdaq_ma10_ratio', 'market_kosdaq_ma20_ratio',
    'market_kosdqp_ma60_ratio', 'market_kosdaq_ma120_ratio', 'market_kosdaq_ma240_ratio',
    'market_dow_ma5_ratio', 'market_dow_ma10_ratio', 'market_dow_ma20_ratio',
    'market_dow_ma60_ratio', 'market_dow_ma120_ratio', 'market_dow_ma240_ratio',
    'market_snp_ma5_ratio', 'market_snp_ma10_ratio', 'market_snp_ma20_ratio',
    'market_snp_ma60_ratio', 'market_snp_ma120_ratio', 'market_snp_ma240_ratio',
    'market_nikkei_ma5_ratio', 'market_nikkei_ma10_ratio', 'market_nikkei_ma20_ratio',
    'market_nikkei_ma60_ratio', 'market_nikkei_ma120_ratio', 'market_ma240_ratio',
    'market_hangseng_ma5_ratio', 'market_hangseng_ma10_ratio', 'market_hangseng_ma20_ratio',
    'market_hangseng_ma60_ratio', 'market_hangseng_ma120_ratio', 'market_hangseng_ma240_ratio',
    'market_ftse_ma5_ratio', 'market_ftse_ma10_ratio', 'market_ftse_ma20_ratio',
    'market_ftse_ma60_ratio', 'market_ftse_ma120_ratio', 'market_ftse_ma240_ratio',
    'market_cac_ma5_ratio', 'market_cac_ma10_ratio', 'market_cac_ma20_ratio',
    'market_cac_ma60_ratio', 'market_cac_ma120_ratio', 'market_cac_ma240_ratio',
    'market_dax_ma5_ratio', 'market_dax_ma10_ratio', 'market_dax_ma20_ratio',
    'market_dax_ma60_ratio', 'market_dax_ma120_ratio', 'market_dax_ma240_ratio',
    'market_euronext_ma5_ratio', 'market_euronext_mas10_ratio', 'market_euronext_ma20_ratio',
    'market_euronext_ma60_ratio', 'market_euronext_ma120_ratio', 'marekt_euronext_ma240_ratio',
    'market_vix_ma5_ratio', 'market_vix_ma10_ratio', 'market_vix_ma20_ratio',
    'market_vix_ma60_ratio', 'market_vix_ma120_ratio', 'market_vix_ma240_raito',
    'market_shanghai_ma5_ratio', 'market_shanghai_ma10_ratio', 'market_shanghai_ma20_ratio',
    'market_shanghai_ma60_ratio', 'market_shanghai_ma120_ratio', 'market_shanghai_ma240_ratio',
]
COLUMNS_BOND_ROLLING_DATA = [
    'bond_us_13w_ma5_ratio', 'bond_us_13w_ma10_ratio', 'bond_us_13w_ma20_ratio',
    'bond_us_13w_ma60_ratio', 'bond_us_13w_ma120_ratio', 'bond_us_12w_ma240_ratio',
    'bond_us_5y_ma5_ratio', 'bond_us_5y_ma10_ratio', 'bond_us_5y_ma20_ratio',
    'bond_us_5y_ma60_ratio', 'bond_us_5y_ma120_ratio', 'bond_us_5y_ma240_ratio',
    'bond_us_10y_ma5_ratio', 'bond_us_10y_ma10_ratio', 'bond_us_10y_ma20_ratio',
    'bond_us_10y_ma60_ratio', 'bond_us_10y_ma120_ratio', 'bond_us_10y_ma240_ratio',
    'bond_us_30y_ma5_ratio', 'bond_us_30y_ma10_ratio', 'bond_us_30y_ma20_ratio',
    'bond_us_30y_ma60_ratio', 'bond_us_30y_ma120_ratio', 'bond_us_30y_ma240_ratio',
    'bond_kr_3y_ma5_ratio', 'bond_kr_3y_ma10_ratio', 'bond_kr_3y_ma20_ratio',
    'bond_kr_3y_ma60_ratio', 'bond_kr_3y_ma120_ratio', 'bond_kr_3y_ma240_ratio',
]
COLUMNS_SECTOR_ROLLING_DATA = [
    'sector_xlk_ma5_ratio', 'sector_xlk_ma10_ratio', 'sector_xlk_ma20_ratio',
    'sector_xlk_ma60_ratio', 'sector_xlk_ma120_ratio', 'sector_xlk_ma240_ratio',
    'sector_xlv_ma5_ratio', 'sector_xlv_ma10_ratio', 'sector_xlv_ma20_ratio',
    'sector_xlv_ma60_ratio', 'sector_xlv_ma120_ratio', 'sector_xlv_ma240_ratio',
    'sector_xly_ma5_ratio', 'sector_xly_ma10_ratio', 'sector_xly_ma20_ratio',
    'sector_xly_ma60_ratio', 'sector_xly_ma120_ratio', 'sector_xly_ma240_ratio',
    'sector_xlp_ma5_ratio', 'sector_xlp_ma10_ratio', 'sector_xlp_ma20_ratio',
    'sector_xlp_ma60_ratio', 'sector_xlp_ma120_ratio', 'sector_xlp_ma240_ratio',
    'sector_xle_ma5_ratio', 'sector_xle_ma10_ratio', 'sector_xle_ma20_ratio',
    'sector_xle_ma60_ratio', 'sector_xle_ma120_ratio', 'sector_xle_ma240_ratio',
    'sector_xlf_ma5_ratio', 'sector_xlf_ma10_ratio', 'sector_xlf_ma20_ratio',
    'sector_xlf_ma60_ratio', 'sector_xlf_ma120_ratio', 'sector_xlf_ma240_ratio',
    'sector_xli_ma5_ratio', 'sector_xli_ma10_ratio', 'sector_xli_ma20_ratio',
    'sector_xli_ma60_ratio', 'sector_xli_ma120_ratio', 'sector_xli_ma240_ratio',
    'sector_xlb_ma5_ratio', 'sector_xlb_ma10_ratio', 'sector_xlb_ma20_ratio',
    'sector_xlb_ma60_raito', 'sector_xlb_ma120_ratio', 'sector_xlb_ma240_ratio',
    'sector_xlre_ma5_ratio', 'sector_xlre_ma10_ratio', 'sector_xlre_ma20_ratio',
    'sector_xlre_ma60_ratio', 'sector_xlre_ma120_ratio', 'sector_xlre_ma240_ratio',
    'sector_xlu_ma5_ratio', 'sector_xlu_ma10_ratio', 'sector_xlu_ma20_ratio',
    'sector_xlu_ma60_ratio', 'sector_xlu_ma120_ratio', 'sector_xlu_ma240_ratio',
]

COLUMNS_COMMODITY_ROLLING_DATA = [
    'commodity_oil_ma5_ratio', 'commodity_oil_ma10_ratio', 'commodiy_oil_ma20_ratio',
    'commodity_oil_ma60_ratio', 'commodity_oil_ma120_ratio', 'commodity_oil_ma240_ratio',
    'commodity_gold_ma5_ratio', 'commodity_gold_ma10_ratio', 'commodity_gold_ma20_ratio',
    'commodity_gold_ma60_ratio', 'commodity_gold_ma120_ratio', 'commodity_gold_ma240_ratio',
    
]

# ratio data
COLUMNS_STOCK_RATIO_DATA = [
    'open_close_ratio', 'open_prev_close_ratio', 'high_close_ratio', 'low_close_ratio',
    'close_prev_close_ratio', 'volume_prev_volume_ratio',
]


# get prices funcitons

In [26]:
import pandas as pd
import pymysql
from sqlalchemy import create_engine


# get us stock price of a specific ticker
def get_prices_from_ticker(ticker, fro=None, to=None):

    # connect DB
    engine = create_engine(f'mysql+pymysql://{user}:{pw}@{host}:{port}/{db}')

    con = pymysql.connect(
        user=user,
        passwd=pw,
        host=host,
        db=db,
        charset='utf8'
    )
            
    mycursor = con.cursor()
    
    if fro is not None:
        if to is not None:               
            query = f""" 
                    SELECT * FROM price_global
                    WHERE ticker = {ticker}
                    AND date BETWEEN {fro} AND {to} 
                    """
        else:
            query = f""" 
                    SELECT * FROM price_global
                    WHERE ticker = {ticker}
                    AND date >= {fro} 
                    """
    
    else:
        if to is not None:
            query = f""" 
                    SELECT * FROM price_global
                    WHERE ticker = {ticker}
                    AND date <= {to} 
                    """
        else:
            query = f""" 
                    SELECT * FROM price_global
                    WHERE ticker = '{ticker}'
                    """
            
    print(query)
    prices = pd.read_sql(query, con=engine)
    con.close()
    engine.dispose()
    return prices

In [27]:
df = get_prices_from_ticker('AAPL')

 
                    SELECT * FROM price_global
                    WHERE ticker = 'AAPL'
                    


In [28]:
df

Unnamed: 0,date,high,low,open,close,volume,adj_close,ticker
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.099192,469033600.0,AAPL
1,1980-12-15,0.122210,0.122210,0.121652,0.121652,0.094017,175884800.0,AAPL
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087117,105728000.0,AAPL
3,1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089273,86441600.0,AAPL
4,1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091861,73449600.0,AAPL
...,...,...,...,...,...,...,...,...
10900,2024-03-11,172.940002,174.380005,172.050003,172.750000,172.750000,60139500.0,AAPL
10901,2024-03-12,173.149994,174.029999,171.009995,173.229996,173.229996,59825400.0,AAPL
10902,2024-03-13,172.770004,173.190002,170.759995,171.130005,171.130005,52488700.0,AAPL
10903,2024-03-14,172.910004,174.309998,172.050003,173.000000,173.000000,72913500.0,AAPL


# preprocess function

In [29]:
COLUMNS_STOCK_RATIO_DATA = [
    'open_close_ratio', 'open_prev_close_ratio', 'high_close_ratio', 'low_close_ratio',
    'close_prev_close_ratio', 'volume_prev_volume_ratio',
]

def preprocess(data):
    
    # moving average
    windows = [5, 10, 20, 60, 120, 240]
    for window in windows:
        data[f'close_ma{window}'] = data['close'].rolling(window).mean()
        data[f'volume_ma{window}'] = data['volume'].rolling(window).mean()
        data[f'close_ma{window}_ratio'] = (data['close'] - data[f'close_ma{window}']) / data[f'close_ma{window}']
        data[f'volume_ma{window}_ratio'] = (data['volume'] - data[f'volume_ma{window}']) / data[f'volume_ma{window}']
        data['open_close_ratio'] = (data['open'].values - data['close'].values) / data['close'].values
        data['open_prev_close_ratio'] = np.zeros(len(data))
        data.loc[1:, 'open_prev_close_ratio'] = (data['open'][1:].values - data['close'][:-1].values) / data['close'][:-1].values
        data['high_close_ratio'] = (data['high'].values - data['close'].values) / data['close'].values
        data['low_close_ratio'] = (data['low'].values - data['close'].values) / data['close'].values
        data['close_prev_close_ratio'] = np.zeros(len(data))
        data.loc[1:, 'close_prev_close_ratio'] = (data['close'][1:].values - data['close'][:-1].values) / data['close'][:-1].values 
        data['volume_prev_volume_ratio'] = np.zeros(len(data))
        data.loc[1:, 'volume_prev_volume_ratio'] = (
            # if volume is 0, change it into non zero value exploring previous volume continuously
            (data['volume'][1:].values - data['volume'][:-1].values) / data['volume'][:-1].replace(to_replace=0, method='ffill').replace(to_replace=0, method='bfill').values
        )
    
    # Bollinger band
    data['middle_bb'] = data['close'].rolling(20).mean()
    data['upper_bb'] = data['middle_bb'] + 2 * data['close'].rolling(20).std()
    data['lower_bb'] = data['middle_bb'] - 2 * data['close'].rolling(20).std()
    data['bb_pb'] = (data['close'] - data['lower_bb']) / (data['upper_bb'] - data['lower_bb'])
    data['bb_width'] = (data['upper_bb'] - data['lower_bb']) / data['middle_bb']
    
    # MACD
    macd_short, macd_long, macd_signal = 12, 26, 9
    data['ema_short'] = data['close'].ewm(macd_short).mean()
    data['ema_long'] = data['close'].ewm(macd_long).mean()
    data['macd'] = data['ema_short'] - data['ema_long']
    data['macd_signal'] = data['macd'].ewm(macd_signal).mean()
    data['macd_oscillator'] = data['macd'] - data['macd_signal']
    
    # RSI
    data['close_change'] = data['close'].diff()
    data['close_up'] = np.where(data['close_change']>=0, df['close_change'], 0)
    # data['close_up'] = data['close_change'].apply(lambda x: x if x >= 0 else 0)
    data['close_down'] = np.where(data['close_change'] < 0, df['close_change'].abs(), 0)
    # data['close_down] = data['close_change'].apply(lambda x: -x if x < 0 else 0)
    data['rs'] = data['close_up'].ewm(alpha=1/14, min_periods=14).mean() / data['close_down'].ewm(alpha=1/14, min_periods=14).mean()
    data['rsi'] = 100 - (100 / (1 + data['rs']))
    
    
    
    
    return data

In [30]:
df_adj = preprocess(df)

In [31]:
df_adj = df_adj[30:].reset_index(drop=True)

In [32]:
df_adj.iloc[:, 4]

0          0.142857
1          0.138393
2          0.133371
3          0.126116
4          0.118862
            ...    
10870    172.750000
10871    173.229996
10872    171.130005
10873    173.000000
10874    172.619995
Name: close, Length: 10875, dtype: float64

In [33]:
df_adj

Unnamed: 0,date,high,low,open,close,volume,adj_close,ticker,close_ma5,volume_ma5,...,ema_short,ema_long,macd,macd_signal,macd_oscillator,close_change,close_up,close_down,rs,rsi
0,1981-01-27,0.143973,0.143973,0.142857,0.142857,0.110405,23699200.0,AAPL,0.144977,0.112044,...,0.142455,0.141406,0.001048,0.001137,-0.000089,-0.001116,0.000000,0.001116,1.099457,52.368628
1,1981-01-28,0.138951,0.138951,0.138393,0.138393,0.106955,28156800.0,AAPL,0.143638,0.111009,...,0.142116,0.141247,0.000869,0.001109,-0.000240,-0.004464,0.000000,0.004464,0.893069,47.175730
2,1981-01-29,0.133929,0.133929,0.133371,0.133371,0.103074,43904000.0,AAPL,0.140960,0.108939,...,0.141392,0.140838,0.000554,0.001052,-0.000498,-0.005022,0.000000,0.005022,0.727594,42.116042
3,1981-01-30,0.127232,0.127232,0.126116,0.126116,0.097467,46188800.0,AAPL,0.136942,0.105834,...,0.140134,0.140083,0.000051,0.000949,-0.000898,-0.007255,0.000000,0.007255,0.564786,36.093507
4,1981-02-02,0.119420,0.119420,0.118862,0.118862,0.091861,23766400.0,AAPL,0.131920,0.101953,...,0.138392,0.139011,-0.000619,0.000788,-0.001407,-0.007254,0.000000,0.007254,0.455127,31.277493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10870,2024-03-11,172.940002,174.380005,172.050003,172.750000,172.750000,60139500.0,AAPL,170.343997,170.343997,...,179.364069,182.966690,-3.602621,-1.591875,-2.010746,2.020004,2.020004,0.000000,0.514266,33.961404
10871,2024-03-12,173.149994,174.029999,171.009995,173.229996,173.229996,59825400.0,AAPL,170.965997,170.965997,...,178.892217,182.606071,-3.713854,-1.804073,-1.909782,0.479996,0.479996,0.000000,0.547323,35.372263
10872,2024-03-13,172.770004,173.190002,170.759995,171.130005,171.130005,52488700.0,AAPL,171.367999,171.367999,...,178.295124,182.181032,-3.885908,-2.012256,-1.873652,-2.099991,0.000000,2.099991,0.473565,32.137364
10873,2024-03-14,172.910004,174.309998,172.050003,173.000000,173.000000,72913500.0,AAPL,172.167999,172.167999,...,177.887807,181.840994,-3.953187,-2.206349,-1.746838,1.869995,1.869995,0.000000,0.602798,37.609120


# Environment

In [34]:
# environment

class Environment:
    ''' 
    Attribute
    ---------
    - chart_data : stock price chart data
    - observation : current observation
    - idx : current postion of chart data
    
    Functions
    --------
    - reset() : initialize idx and observation
    - observe() : move idx into next postion and get a new observation
    - get_price() : get close price of current observation
    '''
    
    def __init__(self, chart_data=None):
        self.PRICE_IDX = 4  # index postion of close price
        self.chart_data = chart_data
        self.observation = None
        self.idx = -1
        
    def reset(self):
        self.observation = None
        self.idx = -1
        
    def observe(self):
        # if there is no more idx, return None
        if len(self.chart_data) > self.idx + 1:
            self.idx += 1
            self.observation = self.chart_data.iloc[self.idx]
            return self.observation
        return None
    
    def get_price(self):
        # return close price
        if self.observation is not None:
            return self.observation[self.PRICE_IDX]
        return None
        

In [35]:
a = Environment(df_adj)
a.reset()
a.observe()
a.observe()
a.get_price()

0.13839299976825714

# utility functions

In [36]:
# utility functions
import time
import datetime
import numpy as np

# str format on date, time
FORMAT_DATE = '%Y%m%d'
FORMAT_DATETIME = '%Y%m%d%H%M%S'

def get_today_str():
    today = datetime.datetime.combine(
        datetime.date.today(), datetime.datetime.min.time()
    )
    today_str = today.strftime(FORMAT_DATE)
    return today_str

def get_time_str():
    return datetime.datetime.fromtimestamp(
        int(time.time())
    ).strftime(FORMAT_DATETIME)
    
def sigmoid(x):
    x = max(min(x, 10), -10)
    return 1. / (1. + np.exp(-x))

In [37]:
get_today_str()

'20240319'

In [38]:
get_time_str()

'20240319002734'

# Agent

In [39]:
# agent
import numpy as np

class Agent:
    ''' 
    Attributes
    --------
    - enviroment : instance of environment
    - initial_balance : initial capital balance
    - min_trading_price : minimum trading price
    - max_trading_price : maximum trading price
    - balance : cash balance
    - num_stocks : obtained stocks
    - portfolio_value : value of portfolios (balance + price * num_stocks)
    - num_buy : number of buying
    - num_sell : number of selling
    - num_hold : number of holding
    - ratio_hold : ratio of holding stocks
    - profitloss : current profit or loss
    - avg_buy_price : average price of a stock bought
    
    Functions
    --------
    - reset() : initialize an agent
    - set_balance() : initialize initial balance
    - get_states() : get the state of an agent
    - decide_action() : exploration or exploitation behavior according to the policy net
    - validate_action() : validate the behavior
    - decide_trading_unit() : decide how many stocks are sold or bought
    - act() : act the behavior 
    '''
    
    # agent states dimension
    ## (ration_hold, profit-loss ratio, current price to avg_buy_price change ratio)
    STATE_DIM = 3
    
    # trading charge and tax
    TRADING_CHARGE = 0.00015    # trading charge 0.015%
    TRADING_TAX = 0.02          # trading tax = 0.2%
    
    # action space
    ACTION_BUY = 0      # buy
    ACTION_SELL = 1     # sell
    ACTION_HOLD = 2     # hold
    
    # get probabilities from neural nets
    ACTIONS = [ACTION_BUY, ACTION_SELL, ACTION_HOLD]
    NUM_ACTIONS = len(ACTIONS)      # output number from nueral nets
    
    def __init__(self, environment, initial_balance, min_trading_price, max_trading_price):
        # get current price form the enviroment
        self.environment = environment
        self.initial_balance = initial_balance  # initial balance
        
        # minimum and maximum buying prices
        self.min_trading_price = min_trading_price
        self.max_trading_price = max_trading_price
        
        # attributes for Agent class
        self.balance = initial_balance      # current balance
        self.num_stocks = 0                 # number of obtained stocks
        
        # value of portfolio : balance + num_stocks * {current stock price} 
        self.portfolio_value = 0
        self.num_buy = 0        # number of buying
        self.num_sell = 0       # number of selling
        self.num_hold = 0       # number of holding
        
        # the state of Agent class
        self.ratio_hold = 0     
        self.profitloss = 0     # profit-loss ratio
        self.avg_buy_price = 0 
        
    def reset(self):
        self.balance = self.initial_balance
        self.num_stocks = 0
        self.portfolio_value = self.initial_balance
        self.num_buy = 0
        self.num_sell = 0
        self.num_hold = 0
        self.ratio_hold = 0
        self.profitloss = 0
        self.avg_buy_price = 0
        
        
    def set_balance(self, balance):
        # set initial balance
        self.initial_balance = balance
        
    def get_states(self):
        # ratio_hold = num_stocks / (portfolio_value / price)
        self.ratio_hold = self.num_stocks * self.environment.get_price() / self.portfolio_value
        
        return (
            self.ratio_hold,
            self.profitloss,    # profitloss = (portfolio_value / initial_balance) - 1
            (self.environment.get_price() / self.avg_buy_price) - 1 if self.avg_buy_price > 0 else 0 
        )
        
    def decide_action(self, pred_value, pred_policy, epsilon):
        # act randomly according with epsilon probability, act according to neural nets otherwise.
        confidence = 0
        
        # if there is a pred_policy, follow it, otherwise, follow pred_value
        pred = pred_policy
        if pred is None:
            pred = pred_value
        
        if pred is None:
            # there are no predictions, explore
            epsilon = 1
        else:
            # values are equal, explore
            maxpred = np.max(pred)
            if (pred == maxpred).all():
                epsilon = 1
            
            # the difference between buying and selling (e.g., 0.05), explore
            if pred_policy is not None:
                if np.max(pred_policy) - np.min(pred_policy) < 0.05:
                    epsilon = 1
                    
        # decide exploration
        if np.random.rand() < epsilon:
            exploration = True
            action = np.random.randint(self.NUM_ACTIONS)
        else:
            exploration = False
            action = np.argmax(pred)
            
        confidence = .5
        if pred_policy is not None:
            confidence = pred[action]
        elif pred_value is not None:
            confidence = sigmoid(pred[action])
            
        return action, confidence, exploration
    
    def validate_action(self, action):
        # validate decided action
        if action == Agent.ACTION_BUY:
            # check if at least one stock can be bought
            if self.balance < self.environment.get_price() * (1 + self.TRADING_CHARGE):
                return False
        elif action == Agent.ACTION_SELL:
            # check if there are obtained stocks
            if self.num_stocks <= 0:
                return False
        return True
    
    def decide_trading_unit(self, confidence):
        # adjust number of buying or selling according to confidence level
        if np.isnan(confidence):
            return self.min_trading_price
        
        # set buying price range between self.min_trading_price + added_trading_priceRK [min_trading_price, max_trading_price]
        # in case that confidence > 1 causes the price over max_trading_price, we set min() so that the value cannot have larger value than self.max_trading_price - self.min_trading_price
        # in case that confidence < 0, we set max() so that added_trading_price cannot have negative value.
        added_trading_price = max(min(
            int(confidence * (self.max_trading_price - self.min_trading_price)),
            self.max_trading_price - self.min_trading_price
        ), 0)
        
        trading_price = self.min_trading_price + added_trading_price
        
        return max(int(trading_price / self.environment.get_price()), 1)
    
    def act(self, action, confidence):
        ''' 
        Arguments
        ---------
        - action : decided behavior based on exploratio or exploitation (0 or 1)
        - confidence : sofmax probability derived from neural nets
        '''
        
        if not self.validate_action(action):
            action = Agent.ACTION_HOLD
            
        # get the price from the environment
        curr_price = self.environment.get_price()
        
        # buy
        if action == Agent.ACTION_BUY:
            # how many stocks
            trading_unit = self.decide_trading_unit(confidence)
            balance = (
                self.balance - curr_price * (1 + self.TRADING_CHARGE) * trading_unit
            )
            
            # if lacks of money, buy maximum units within the amount of money what we have
            if balance < 0:
                trading_unit = min(
                    int(self.balance / (curr_price * (1 + self.TRADING_CHARGE))),
                    int(self.max_trading_price / curr_price)
                )
                
            # total amount of money with trading charge
            invest_amount = curr_price * (1 + self.TRADING_CHARGE) * trading_unit
            if invest_amount > 0:
                self.avg_buy_price = (self.avg_buy_price * self.num_stocks + curr_price * trading_unit) / (self.num_stocks + trading_unit)
                self.balance -= invest_amount
                self.num_stocks += trading_unit
                self.num_buy += 1
                
        # sell
        elif action == Agent.ACTION_SELL:
            # how many stocks
            trading_unit = self.decide_trading_unit(confidence)
            # if lacks of obtained stocks, sell as many as possible
            trading_unit = min(trading_unit, self.num_stocks)
            # sell
            invest_amount = curr_price * (
                1 - (self.TRADING_TAX + self.TRADING_CHARGE)
            ) * trading_unit
            
            if invest_amount > 0:
                # update avg_buy_price
                self.avg_buy_price = (self.avg_buy_price * self.num_stocks - curr_price * trading_unit) / (self.num_stocks - trading_unit)
                self.num_stocks -= trading_unit
                self.balance += invest_amount
                self.num_sell += 1
                
        # hold
        elif action == Agent.ACTION_HOLD:
            self.num_hold += 1
            
        # update portfolio value
        self.portfolio_value = self.balance + curr_price * self.num_stocks
        self.profitloss = self.portfolio_value / self.initial_balance - 1
        return self.profitloss

# Visualizer

In [40]:
# visualizer

import threading
import numpy as np 
import matplotlib.pyplot as plt
import datetime
plt.switch_backend('agg')

from mplfinance.original_flavor import candlestick_ohlc

lock = threading.Lock()

class Visulaizer:
    ''' 
    Attributes
    ---------
    - fig : Figure class instance playing a canvas role
    - axes : Axes class instance for plotting chart
    - title : title for plot
    
    Functions
    --------
    - prepare() : initialize a figure and print daily chart
    - plot() : print all chart except daily chart
    - save() : save a figure as an image file
    - clear() : initial all the chart except daily chart
    
    Returns
    --------
    - Figure title : parameters, epochs, and exploration rate
    - Axes 1 : daily ohlc chart
    - Axes 2 : obtained stocks and agent action
    - Axes 3 : neural net output
    - Axes 4 : policy net output and exploration chart
    - Axes 5 : Portfolio value and learning point chart
    '''
    
    COLORS = ['r', 'b', 'g']
    
    def __init__(self):
        self.canvas = None
        self.fig = None
        self.axes = None
        self.title = ''
        self.x = []
        self.xticks = []
        self.xlabels = []
        
    def prepare(self, chart_data, title):
        self.title = title
        with lock:
            # initialize canva and prepare drawing 5 charts
            self.fig, self.axes = plt.subplots(
                nrows=5, ncols=1, facecolor='w', sharex=True
            )
            
            for ax in self.axes:
                # deactivate the marks unnecessary
                ax.get_xaxis().get_major_formatter().set_scientific(False)
                ax.get_yaxis().get_major_formatter().set_scientific(False)
                # y axis 위치 오른쪽으로 변경
                ax.yaxis.tick_right()
                
            # chart 1. daily ohlc chart
            self.axes[0].set_ylabel('Env.')     # y label
            x = np.arange(len(chart_data))
            # set two dimensional matrix ordering by open, high, low, close 
            ohlc = np.hstack((
                x.reshape(-1, 1), np.array(chart_data)[:, 1:-1]
            ))
            
            # red for positive, blue for negative
            candlestick_ohlc(self.axes[0], ohlc, colorup='r', colordown='b')
            
            # visualize volume
            ax = self.axes[0].twinx()
            volume = np.array(chart_data)[:, -1].tolist()
            ax.bar(x, volume, color='b', alpha=0.3)
            
            # set x-axis
            self.x = np.arange(len(chart_data['date']))
            self.xticks = chart_data.index[[0, -1]]
            self.xlabels = chart_data.iloc[[0, -1]]['date']
            
    def plot(self, epoch_str=None, num_epochs=None, epsilon=None,
             action_list=None, actions=None, num_stocks=None,
             outvals_value=[], outvals_policy=[], exps=None,
             initial_balance=None, pvs=None):
        ''' 
        Attributes
        ---------
        - epoch_str : epoch for Figure title
        - num_epochs : total epoch
        - epsilon : exploration rate
        - action_list : action list that an Agent can take
        - actions: : action array that an Agent took
        - num_stocks : obtained stock number array
        - outvals_value : value network output array
        - outvals_policy : policy network output array
        - exps : whether it is explored
        - initial_balance : initial balance
        - pvs : portfolio value array
        '''
        
        # conver numpy array because matplotlib() takes numpy() array
        
        with lock:
            # action, num_stocks, outvals_value, outvals_policy, pvs have same size
            # we create array for x-axis as same as action array
            actions = np.array(actions)     # action array for an agent aciton
            # output array for value network
            outvals_value = np.array(outvals_value)
            # output array for policy network
            outvals_policy = np.array(outvals_policy)
            # array for initial balance
            pvs_base = np.zeros(len(actions)) + initial_balance
            
            # chart 2. Agent states (action, num_stocks)
            for action, color in zip(action_list, self.COLORS):
                for i in self.x[actions == action]:
                    # background color : red = buying, blue = selling
                    self.axes[1].axvline(i, color=color, alpha=0.1)
            self.axes[1].plot(self.x, num_stocks, '-k')     # number of stocks
            
            # char 3. Value network
            if len(outvals_value) > 0:
                max_actions = np.argmax(outvals_value, axis=1)
                for action, color in zip(action_list, self.COLORS):
                    # background
                    for idx in self.x:
                        if max_actions[idx] == action:
                            self.axes[2].axvline(idx, color=color, alpha=0.1)
                    # plot value network
                    ## red for buying, blue for selling, green for holding
                    ## if there is no holding in predicting, no green is plotted
                    self.axes[2].plot(self.x, outvals_value[:, action], color=color, linestyle='-')
                    
            # chart 4. policy network
            # yellow for exploration
            for exp_idx in exps:
                self.axes[3].axvline(exp_idx, color='y')
            # background for action
            _outvals = outvals_policy if len(outvals_policy) > 0 else outvals_value
            for idx, outval in zip(self.x, _outvals):
                color = 'white'
                if np.isnan(outval.max()):
                    continue
                # in the no exploration point, red for buying, blue for selling
                if outval.argmax() == Agent.ACTION_BUY:
                    color = self.COLORS[0]
                elif outval.argmax() == Agent.ACTION_SELL:
                    color = self.COLORS[1]
                elif outval.argmax() == Agent.ACTION_HOLD:
                    color = self.COLORS[2]
                self.axes[3].axvline(idx, color=color, alpha=0.1)
                
            # plot policy network output
            # policy network for buying is red, policy network for selling is blue
            # if red line is above the blue line, an agent buys, otherwise sells
            if len(outvals_policy) > 0:
                for action, color in zip(action_list, self.COLORS):
                    self.axes[3].plot(
                        self.x, outvals_policy[:, action],
                        color=color, linstyle='-'
                    )
                    
            # chart 5. Portfolio value
            # horizontal straight line for initial balance
            self.axes[4].axhline(
                initial_balance, linestyle='-', color='gray'
            )
            # the part above initial balance is plotted as red
            self.axes[4].fill_between(
                self.x, pvs, pvs_base,
                where=pvs > pvs_base, facecolor='r', alpha=0.1
            )
            # the part below initial balance is plotted as blue
            self.axes[4].fill_between(
                self.x, pvs, pvs_base,
                where=pvs < pvs_base, facecolor='b', alpha=0.1
            )
            self.axes[4].plot(self.x, pvs, '-k')
            self.axes[4].xaxis.set_ticks(self.xticks)
            self.axes[4].xaxis.set_ticklabels(self.xlabels)
            
            # epoch and exploration rate
            self.fig.suptitle(f'{self.title}\nEPOCH:{epoch_str}/{num_epochs} EPSILON:{epsilon:.2f}')
            # adjust canvas layout
            self.fig.tight_layout()
            self.fig.subplots_adjust(top=0.85)
            
    # initialize visualization
    def clear(self, xlim):
        with lock:
            _axes = self.axes.tolist()
            # initialize chart except unchanged values
            for ax in _axes[1:]:
                ax.cla()        # erase charts
                ax.relim()      # initialize limit
                ax.autoscale()  # rescale
            
            # reset y label
            self.axes[1].set_ylabel('Agent')
            self.axes[2].set_ylabel('V')
            self.axes[3].set_ylabel('P')
            self.axes[4].set_ylabel('PV')
            for ax in _axes:
                ax.set_xlim(xlim)       # reset x-axis limit
                ax.get_xaxis().get_major_formatter().set_scientific(False)
                ax.get_yaxis().get_major_formatter().set_scientific(False)
                # set x-axix interval equally
                ax.ticklabel_format(useOffset=False)
                
    # save results
    def save(self, path):
        with lock:
            self.fig.savefig(path)
            


# Networks

In [41]:
# networks
import threading
import abc
import numpy as np
import torch
import torch.nn.functional as F 


In [42]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [43]:
class Network:
    ''' 
    Common network class
    and DNN, LSTMNetwork, CNN
    
    Attributes
    ---------
    - input_dim 
    - output_dim
    - lr : learning rate for network
    - shared_network : upper part of network shared with various networks
    - activation : action function for output layer e.g) 'linear', 'sigmoid', 'tanh', 'softmax' 
    - loss : loss function for network
    - model : final network model
    
    Functions
    --------
    - predict() : calculate behavioral value and probabilities 
    - train_on_batch() : generate dataset for batch learning
    - save_model() : save network model as a file
    - load_model() : load network model from a file
    - get_shared_network() : function for generating shared network
    '''
    
    # thread lock for A3C
    lock = threading.Lock()
    
    def __init__(self, input_dim=0, output_dim=0, num_steps=1, lr=0.001,
                 share_network=None, activation='sigmoid', loss='mse'):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_steps = num_steps
        self.lr = lr
        self.shared_network = share_network
        self.activation = activation
        self.loss = loss
        
        # data dimension for network
        # CNN, LSTMNework has 3 dimension -> inp = (num_steps, input_dim), DNN -> inp = (input_dim, )
        inp = None
        # if hasattr(self, 'num_steps):
        #   inp = (self.num_steps, input_dim)
        # else:
        #   inp = (self.input_dim, )
        if self.num_steps > 1:
            inp = (self.num_steps, input_dim)
        else:
            inp = (self.input_dim, )
        
        # using shared network
        self.head = None
        if self.shared_network is None:
            self.head = self.get_network_head(inp, self.output_dim)
        else:
            self.head = self.shared_network
            
        # not using shared network
        # self.head = self.get_network_head(inp, self.output_dim)
        
        # network model
        ## create head network model
        self.model = torch.nn.Sequential(self.head)
        if self.activation == 'linear':
            pass
        elif self.activation == 'relu':
            self.model.add_module('activation', torch.nn.ReLU())
        elif self.activation == 'leaky_relu':
            self.model.add_module('activation', torch.nn.LeakyReLU())
        elif self.activation == 'sigmoid':
            self.model.add_module('activation', torch.nn.Sigmoid())
        elif self.activation == 'tanh':
            self.model.add_module('activation', torch.nn.Tanh())
        elif self.activation == 'softmax':
            self.model.add_module('activation', torch.nn.Tanh())
        self.model.apply(Network.init_weights)
        self.model.to(device)
        
        # optimizer
        # self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.lr)
        self.optimizer = torch.optim.NAdam(self.model.parameters(), lr=self.lr)
        
        # loss function
        self.criterion = None
        if loss == 'mse':
            self.criterion = torch.nn.MSELoss()
        elif loss == 'binary_crossentropy':
            self.criterion = torch.nn.BCELoss()
            
    def predict(self, sample):
        # return prediction buy, sell and hold on sample data
        # value network : behavioral value on sample data, policy network : behavioral probabilities on sample data
        with self.lock:
            # transform evaluation mode : deactivate layers such as dropout used only in learning process
            self.model.eval()
            with torch.no_grad():
                x = torch.from_numpy(sample).float().to(device)
                pred = self.model(x).detatch().cpu().numpy()
                pred = pred.flatten()
            return pred
        
    def train_on_batch(self, x, y):
        if self.num_steps > 1:
            x = np.array(x).reshape((-1, self.num_steps, self.input_dim))
        else:
            x = np.array(x).reshape((-1, self.input_dim))
        loss = 0.
        with self.lock:
            # transform learning mode
            self.model.train()
            _x = torch.from_numpy(x).float().to(device)
            _y = torch.from_numpy(y).float().to(device)
            y_pred = self.model(_x)
            _loss = self.criterion(y_pred, _y)
            self.optimizer.zero_grad()
            _loss.backward()
            self.optimizer.step()
            loss += _loss.item()
        return loss
    
    def train_on_batch_for_ppo(self, x, y, a, eps, K):
        if self.num_steps > 1:
            x = np.array(x).reshape((-1, self.num_steps, self.input_dim))
        else:
            x = np.array(x).reshape((-1, self.input_dim))
        loss = 0.
        with self.lock:
            self.model.train()
            _x = torch.from_numpy(x).float().to(device)
            _y = torch.from_numpy(y).float().to(device)
            probs = F.softmax(_y, dim=1)
            for _ in range(K):
                y_pred = self.model(_x)
                probs_pred = F.softmax(y_pred, dim=1)
                rto = torch.exp(torch.log(probs[:, a]) - torch.log(probs_pred[:, a]))
                rto_adv = rto * _y[:, a]
                clp_adv = torch.clamp(rto, 1 - eps, 1 + eps) * +y[:, a]
                _loss = -torch.min(rto_adv, clp_adv).mean()
                self.optimizer.zero_grad()
                _loss.backward()
                self.optimizer.step()
                loss += _loss.item()
        return loss
    
    @classmethod
    def get_shared_network(cls, net='dnn', num_steps=1, input_dim=0, output_dim=0):
        if net == 'dnn':
            return DNN.get_network_head((input_dim,), output_dim)
        elif net == 'lstm':
            return LSTMNetwork.get_network_head((num_steps, input_dim), output_dim)
        elif net == 'cnn':
            return CNN.get_network_head((num_steps, input_dim), output_dim)
        
    @abc.abstractmethod
    def get_network_head(inp, output_dim):
        pass
    
    @staticmethod
    def init_weight(m):
        # initialize weights based on normal distribution
        if isinstance(m, torch.nn.Linear) or isinstance(m, torch.nn.Conv1d):
            torch.nn.init.normal_(m.weight, std=0.01)
        elif isinstance(m, torch.nn.LSTM):
            for weights in m.all_weights:
                for weight in weights:
                    torch.nn.init.normal_(weight, std=0.01)
                    
    def save_model(self, model_path):
        if model_path is not None and self.model is not None:
            torch.save(self.model, model_path)
            
    def load_model(self, model_path):
        if model_path is not None:
            self.model = torch.load(model_path)
            
class DNN(Network):
    @staticmethod
    def get_network_head(inp, output_dim):
        return torch.nn.Sequential(
            torch.nn.BatchNorm1d(inp[0]),
            torch.nn.Linear(inp[0], 256),
            torch.nn.BatchNorm1d(256),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(256, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(128, 64),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(64, 32),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(32, output_dim),
        )
        
    # def train_on_batch(self, x, y):
    #   x = np.array(x).reshape((-1, self.input_dim))
    #   return super().train_on_batch(x, y)
    
    def predict(self, sample):
        sample = np.array(sample).reshape((1, self.input_dim))
        return super().predict(sample)
    
class LSTMNetwork(Network):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    @staticmethod
    def get_network_head(inp, output_dim):
        return torch.nn.Sequential(
            torch.nn.BatchNorm1d(inp[0]),
            LSTMModule(inp[1], 128, batch_fist=True, use_last_only=True),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(128, 64),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(64, 32),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(32, output_dim),
        )
        
    # def train_on_batch(self, x, y):
    #     x = np.array(x).reshape((-1, self.num_steps, self.input_dim))
    #     return super().train_on_batch(x, y)
    
    def predict(self, sample):
        sample = np.array(sample).reshape((-1, self.num_steps, self.input_dim))
        return super().predict(sample)
    
class LSTMModule(torch.nn.LSTM):
    def __init__(self, *args, use_last_only=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.use_last_only = use_last_only
        
    def forward(self, x):
        output, (h_n, _) = super().forward(x)
        if self.use_last_only:
            return h_n[-1]
        return output
    
class CNN(Network):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    @staticmethod
    def get_network_head(inp, output_dim):
        kernel_size = 2
        return torch.nn.Sequential(
            torch.nn.BatchNorm1d(inp[0]),
            torch.nn.Conv1d(inp[0], 1, kernel_size),
            torch.nn.BatchNorm1d(1),
            torch.nn.Flatten(),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(inp[1] - (kernel_size - 1), 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(128, 64),
            torch.nn.BatchNorm1d(64),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(64, 32),
            torch.nn.BatchNorm1d(32),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(32, output_dim),
        )

    # def train_on_batch(self, x, y):
    #     x = np.array(x).reshape((-1, self.num_steps, self.input_dim))
    #     return super().train_on_batch(x, y)
    
    def predict(self, sample):
        sample = np.array(sample).reshape((1, self.num_steps, self.input_dim))
        return super().predict(sample)      
    

# Settings

In [44]:
# import os
# import locale
# import platform

# logger name
LOGGER_NAME = 'rltrader'

# # path setting
# # parent path
# BASE_DIR = os.environ.get('RLTRADER_BASE', os.path.abspath(os.path.join(__file__, os.path.pardir)))

# # locale setting
# if 'Lines' in platform.system() or 'Darwin' in platform.system():
#     locale.setlocale(locale.LC_ALL, 'ko_KR.UTF-8')
# elif 'Windows' in platform.system():
#     locale.setlocale(locale.LC_ALL, '')

# Learners

In [45]:
# learners

import os 
import logging
import abc
import collections
import threading
import time
import json
import numpy as np
from tqdm import tqdm

logger = logging.getLogger(LOGGER_NAME)

# DQNLearner
class ReinforcementLearner:
    ''' 
    Attributes
    ---------
    - stock_code 
    - chart_data
    - environment
    - agent
    - training_data
    - value_network
    - policy_network
    
    Functions
    --------
    - init_value_network()
    - init_policy_network()
    - build_sample() : get sample from environment instance
    - update_network() : update value network and policy network
    - fit() : request learn value and policy network
    - visualize() : visualize epoch info
    - fun() : reinforcement learning
    - save_models() : save value and policy network
    '''
    
    __metaclass__ = abc.ABCMeta
    lock = threading.Lock()
    
    def __init__(self, rl_method='rl', stock_code=None,
                 chart_data=None, training_data=None,
                 min_trading_price=10, max_trading_price=10000,
                 net='dnn', num_steps=1, lr=0.0005,
                 discount_factor=0.9, num_epochs=1000,
                 balance=100000, start_epsilon=1, 
                 value_network=None, policy_network=None,
                 output_path='', reuse_models=True, gen_output=True):
        ''' 
        Attributes
        --------
        - rl_method : reinforcement learning method / 'dqn' : DQNLearner, 'pg' : PolicyGradient, 'ac' : ActorCriticLearner, 'a2c' : A2CLearner, 'a3c' : A3CLearner
        - stock_code
        - chart_data 
        - training_data
        - min_trading_price, max_trading_price
        - net : neural network / 'dnn', 'lstm', 'cnn'
        - n_steps : sample step size for LSTM, CNN
        - lr : learning rate
        - discount_factor : state-action value discount factor
        - num_epochs : total training epochs
        - balance : initial balance
        - start_epsilon : initial exploration rate
        - value_network, policy_network
        - output_path : save path for model
        - reuse_models 
        '''
        
        # check arguments
        assert min_trading_price > 0
        assert max_trading_price > 0
        assert max_trading_price >= min_trading_price
        assert num_steps > 0
        assert lr > 0
        
        # setting reinforcement learning
        self.rl_method = rl_method
        self.discount_factor = discount_factor
        self.num_epochs = num_epochs
        self.start_epsilon = start_epsilon
        
        # set Environment
        self.stock_code = stock_code
        self.chart_data = chart_data
        self.environment = Environment(chart_data)
        
        # set Agent
        self.agent = Agent(self.environment, balance, min_trading_price, max_trading_price)
        
        
        # training data
        self.trainig_data = training_data
        self.sample = None
        self.trainig_data_idx = -1
        
        # size of vector = vector for training data size + agent states size
        self.num_features = self.agent.STATE_DIM
        if self.trainig_data is not None:
            self.num_features += self.trainig_data.shape[1]
            
        # set neural network
        self.net = net
        self.num_steps = num_steps
        self.lr = lr
        self.value_network = value_network
        self.policy_network = policy_network
        self.reuse_models = reuse_models
        
        # set Visualizer
        self.visualizer = Visulaizer()
        
        # memory
        self.memory_sample = []     # training data sample
        self.memory_action = []     # actions that were taken
        self.memory_reward = []     # rewards that were got
        self.memory_value = []      # prediction values of actions
        self.memory_policy = []     # prediction probabilities of actions
        self.memory_pv = []         # portfolio value
        self.memory_num_stocks = [] # number of stocks obtained
        self.memory_exp_idx = []    # exploration position
        
        # epoch info.
        self.loss = 0.          # loss during epoch
        self.itr_cnt = 0        # number of profit
        self.exploration_cnt = 0    # count of exploration
        self.batch_size = 0     # training epoch
        
        # log
        self.output_path = output_path
        self.gen_output = gen_output
        
    def init_value_network(self, shared_network=None, activation='linear', loss='mse'):
        if self.net == 'dnn':
            self.value_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, share_network=shared_network,
                activation=activation, loss=loss
            )
        elif self.net == 'lstm':
            self.value_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss
            )
        elif self.net == 'cnn':
            self.value_network = CNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss
            )
        if self.reuse_models and os.path.exists(self.value_network_path):
            self.value_network.load_model(model_path=self.value_network_path)
    
    def init_policy_network(self, shared_network=None, activation='sigmoid', loss='binary_crossentropy'):
        
        if self.net == 'dnn':
            self.policy_network = DNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, share_network=shared_network,
                activation=activation, loss=loss
            )
            
        elif self.net == 'lstm':
            self.policy_network = LSTMNetwork(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss
            )
            
        elif self.net == 'cnn':
            self.policy_network = CNN(
                input_dim=self.num_features,
                output_dim=self.agent.NUM_ACTIONS,
                lr=self.lr, num_steps=self.num_steps,
                shared_network=shared_network,
                activation=activation, loss=loss
            )
            
        if self.reuse_models and os.path.exists(self.policy_network_path):
            self.policy_network.load_model(model_path=self.policy_network_path)
            
    def reset(self):
        self.sample = None
        self.trainig_data_idx = -1
        
        # reset environment
        self.environment.reset()
        
        # reset agent
        self.agent.reset()
        
        # reset visualizer
        self.visualizer.clear([0, len(self.chart_data)])
        
        # intialize memories
        self.memory_sample = []
        self.memory_action = []
        self.memory_reward = []
        self.memory_value = []
        self.memory_policy = []
        self.memory_pv = []
        self.memory_num_stocks = []
        self.memory_exp_idx = []
        
        # intialize epoch info
        self.loss = 0.
        self.itr_cnt = 0
        self.exploration_cnt = 0
        self.batch_size = 0
        
    def build_sample(self):
        # get data from the next index
        self.environment.observe()
        # 47 values in the sample + next agent states = 50 values
        if len(self.trainig_data) > self.trainig_data_idx + 1:
            self.trainig_data_idx += 1
            self.sample = self.trainig_data[self.trainig_data_idx].tolist()
            self.sample.extend(self.agent.get_states())
            return self.sample
        return None
    
    # abstrac method
    @abc.abstractmethod
    def get_batch(self):
        pass
    
    # after create training batch data, call train_on_batch() method for training value network and policy network
    # value network : DQNLearner, ActorCriticLearner, A2CLearner
    # policy network : PolicyGrdient, ActorCriticLearner, A2CLearner
    # After training, save loss instance and return the sum of value network loss and policy network loss in case tha both network are used
    def fit(self):
        # create batch fata
        x, y_value, y_polcy = self.get_batch()
        # init loss
        self.loss = None
        if len(x) > 0:
            loss = 0
            if y_value is not None:
                # update value network
                loss += self.value_network.train_on_batch(x, y_value)
            if y_polcy is not None:
                # update policy network
                loss += self.value_network.train_on_batch(x, y_polcy)
            self.loss = loss
            
    # after end of an epoch, visualize it.
    # In case of LSTM, CNN, agent's action, number of stocks, value network output, policy network output, value of portfolio are (num_steps - 1) less than environment observation and fill dummy variables
    def visualize(self, epoch_str, num_epochs, epsilon):
        self.memory_action = [Agent.ACTION_HOLD] * (self.num_steps - 1) + self.memory_action
        self.memory_num_stocks = [0] * (self.num_steps - 1) + self.memory_num_stocks
        if self.value_network is not None:
            self.memory_value = [np.array([np.nan] * len(Agent.ACTIONS))] * (self.num_steps - 1) + self.memory_value
        if self.policy_network is not None:
            self.memory_policy = [np.array([np.nan] * len(Agent.ACTIONS))]  * (self.num_steps - 1) + self.memory_policy
            
        self.memory_pv = [self.agent.initial_balance] * (self.num_steps - 1) + self.memory_pv
        
        self.visualizer.plot(
            epoch_str=epoch_str, num_epochs=num_epochs,
            epsilon=epsilon, action_list=Agent.ACTIONS,
            actions=self.memory_action,
            num_stocks=self.memory_num_stocks,
            outvals_value=self.memory_value,
            outvals_policy=self.memory_policy,
            exps=self.memory_exp_idx,
            initial_balance=self.agent.initial_balance,
            pvs=self.memory_pv,
        )
        self.visualizer.save(os.path.join(self.epoch_summary_dir, f'epoch_summary_{epoch_str}.png'))
        
            
        
    
    
     
        