### 01. High Price Predict Model with GRU

In [1]:
# 시간 측정 시작
import time
start = time.time()

# 라이브러리
import pandas as pd
kospi = pd.read_csv('../data/KOSPI.csv')

# KOSPI에 상장된 종목 코드
kospi['종목코드'] = kospi['종목코드'].apply(lambda x : str(x).rjust(6, '0'))
code_list = kospi['종목코드'].unique()

# 라이브러리
import numpy as np

# 진행과정 Check
check = 0

# 결과 저장 데이터프레임
result_df = pd.DataFrame()

# 코드 별 진행
for code in code_list[:1] : 

    # Hyper Parameter
    tr_name = 'TR_SCHART'
    term = 'D'
    start_date = '20180510'   # 학습 기간 시작점
    end_date = '20200229'     # 학습 기간 종료점
    Lookup = '9999' 
    
    # url
    url = 'http://ssecd.roboadvisor.co.kr:9999/' + tr_name + '?0=' + code + '&1=' + term + '&2=1' +\
          '&3=' + start_date + '&4=' + end_date + '&5=' + Lookup

    # url open
    from urllib.request import urlopen
    url_page = urlopen(url)

    # json 파일로 받아오기
    import json
    url_data = json.loads(url_page.read())

    # json을 데이터프레임으로 바꾸기
    import pandas as pd
    data = pd.DataFrame(url_data)
    
    # Data Column명 변경
    data.columns = ['date','time','open','high','low','close','price_ccr','volume_ccr',
                    'rock','volume','volume_price']
    data = data[['date', 'time', 'open', 'high', 'low', 'close', 'volume', 'volume_price']]
    
    # 순서 뒤집기
    data = data[::-1]
    data = data.set_index('date')
    
    # 필요한 변수만 추출
    data = data[['open', 'high', 'low', 'close', 'volume', 'volume_price']]

    # 만약에 데이터가 비어있다면 그 종목은 PASS
    if data["open"][0] == '' : 
        continue
    
    # 변수 속성 정수로 바꾸기
    for column in data.columns : 
        data[column] = data[column].apply(lambda x : int(x))
        
    # 거래량과 거래가격을 모두 로그 변환
    data['volume'] = data['volume'].apply(lambda x : np.log(x+1))
    data['volume_price'] = data['volume_price'].apply(lambda x : np.log(x+1))
        
    # NaN이 들어가 있는 행 제거
    data = data.dropna()
    
    print(data)
    
    # 제네레이터 생성
    def generator(data, lookback, delay, min_index, max_index, shuffle=False, batch_size=128, step=6):
        if max_index is None :
            max_index = len(data) - delay - 1
        i = min_index + lookback
        while 1 :
            if shuffle :
                rows = np.random.randint(min_index + lookback, max_index, size=batch_size)
            else :
                if i + batch_size >= max_index :
                    i = min_index + lookback

                rows = np.arange(i, min(i + batch_size, max_index))
                i += len(rows)
            samples = np.zeros((len(rows), lookback // step, data.shape[-1]))
            targets = np.zeros((len(rows),))
            for j, row in enumerate(rows) :
                indices = range(rows[j] - lookback, rows[j], step)
                samples[j] = data[indices]
                targets[j] = data[rows[j] + delay][1]

            yield samples, targets
    

    # 각 데이터셋별 제네레이터 생성
    lookback = 5              # 뒤돌아갈 지점
    step = 1                  # 데이터 뽑을 스텝
    delay = 1                 # 타겟 위치
    batch_size = 34           # 배치 사이즈

    train_gen = generator(data, lookback=lookback, delay=delay, min_index=0, max_index=350,
                          shuffle=True, step=step, batch_size=batch_size)

    val_gen = generator(data, lookback=lookback, delay=delay, min_index=351, max_index=400,
                        step=step, batch_size=batch_size)

    test_gen = generator(data, lookback=lookback, delay=delay, min_index=401, max_index=None,
                         step=step, batch_size=batch_size)

    # 전체 검증 세트를 순회하기 위해 val_gen에서 추출할 횟수
    val_steps = (300 - 251 - lookback) // batch_size

    # 전체 테스트 세트를 순회하기 위해 test_gen에서 추출할 횟수
    test_steps = (len(data) - 401 - lookback) // batch_size
        
    # GRU Model Build
    from keras.models import Sequential
    from keras import layers
    from keras.optimizers import RMSprop

    model = Sequential()
    
    model.add(layers.GRU(32,
                         dropout=0.1,
                         recurrent_dropout=0.5,
                         return_sequences=True,
                         input_shape=(None, data.shape[-1])))
    
    model.add(layers.GRU(64, activation='relu',
                         dropout=0.1, 
                         recurrent_dropout=0.5))
    
    model.add(layers.Dense(1))

    model.compile(optimizer=RMSprop(), loss='mae')
    
    history = model.fit_generator(train_gen,
                                  steps_per_epoch=500,
                                  epochs=20,
                                  validation_data=val_gen,
                                  validation_steps=val_steps)

    # Test Data 생성하기
    start_date, end_date = '20200301', '20200318'
    url = 'http://ssecd.roboadvisor.co.kr:9999/' + tr_name + '?0=' + code + '&1=' + term + '&2=1' +\
          '&3=' + start_date + '&4=' + end_date + '&5=' + Lookup
    url_page = urlopen(url)
    url_data = json.loads(url_page.read())
    data = pd.DataFrame(url_data)
    data.columns = ['date','time','open','high','low','close','price_ccr','volume_ccr',
                    'rock','volume','volume_price']
    data = data[['date', 'time', 'open', 'high', 'low', 'close', 'volume', 'volume_price']]
    data = data[::-1]
    data = data.set_index('date')
    data = data[['open', 'high', 'low', 'close', 'volume', 'volume_price']]
    for column in data.columns : 
        data[column] = data[column].apply(lambda x : int(x))
    data['volume'] = data['volume'].apply(lambda x : np.log(x+1))
    data['volume_price'] = data['volume_price'].apply(lambda x : np.log(x+1))
    for column in ['open', 'high', 'low', 'close', 'volume', 'volume_price'] : 
        for i in range(1,5,1) : 
            data['{}_shift_{}'.format(column,i)] = data[column].shift(i)
    data['y'] = data['high'].shift(-1)
        
    data = data[-1:]
    
    data = data[data.columns[:-1]]

    # 필요한것들
    result = dict({'종목' : code,
                   '어제 날짜' : data.index.values[0],
                   '어제 종가' : int(data['close'].values),
                   '오늘 고가' : int(round(reg.predict(data)[0])),
                   '오늘 고가 변동폭 예측' : 100*(int(round(reg.predict(data)[0]))-int(data['close'].values))/\
                                     int(data['close'].values)})
    result_df = result_df.append(result, ignore_index=True)
    
    # 각 종목별 결과 출력
    if check % 100 == 0 : 
        print('{:4} / {:4} 종목 ====> {:7.2f}% 완료  {:.4f}'.format(check+1,2144,100*check/2144,time.time()-start))
    check += 1

# 전체 완료
print('{:4} / {:4} 종목 ====> {:7.2f}% 완료  {:.4f}'.format(check + 1,2144,100*check/2144,time.time()-start))

# 시간측정 완료
print("\n\nall time :", time.time() - start)

          open  high   low  close     volume  volume_price
date                                                      
20180510  2950  3240  2910   3170  13.978534     19.718490
20180511  3170  3170  3020   3075  13.066605     18.798701
20180514  3070  3270  3065   3205  13.545661     19.314548
20180515  3240  3240  3150   3235  12.628270     18.395767
20180516  3245  4050  3160   3760  15.128300     21.043196
...        ...   ...   ...    ...        ...           ...
20200224  2675  2715  2565   2570  11.804647     17.366498
20200225  2590  2750  2575   2740  11.547443     17.139146
20200226  2710  2710  2625   2645  11.778132     17.359022
20200227  2645  2655  2525   2570  12.102583     17.652842
20200228  2500  2535  2345   2390  13.081106     18.570373

[443 rows x 6 columns]


Using TensorFlow backend.


Epoch 1/20


KeyError: "None of [Int64Index([23, 24, 25, 26, 27], dtype='int64')] are in the [columns]"