In [1]:
# Library
from urllib.request import urlopen
import pandas as pd
from bs4 import BeautifulSoup
import webbrowser
import numpy as np
import re

# DataFrame
train_dataset = pd.DataFrame()
test_dataset = pd.DataFrame()

# url connecting
url ='https://lab.donutz.co/krx/products'
resultXML = urlopen(url)
result = resultXML.read()
soup = BeautifulSoup(result,'html.parser')
soup_string = str(soup)
regex = re.compile(r'(\d{6})')
codes = regex.findall(soup_string)
codes = [i for i in codes]
start_date = '20170101'
end_date = '20191231'

# 진행과정 계산
COUNT = 0
    
for code in codes[:100] : 
    
    # Crawling
    url ='https://lab.donutz.co/krx/products/' + code + '?sdate=' + start_date + '&edate=' + end_date
    resultXML = urlopen(url)
    result = resultXML.read()
    soup = BeautifulSoup(result,'html.parser')
    soup_string = str(soup)

    # 특수문자 제거
    def clean(data):
        text = re.sub('[a-zA-Z-=+#/\?^$@*\"※~&%ㆍ!_』:\\‘|\(\)\[\]\<\>`\'{}…》]', '', data)
        return text
    text = re.split('[,]+',clean(soup_string))
    information = []
    for i in text:
        information.append(i)
        
    # 데이터프레임으로 변환
    data = pd.DataFrame(np.array(information).reshape(-1,15))
    data.columns = ['date','open','high','low','close','trading_volume',\
                    'score','index','probability','l1','l2','l3','l4','lgap','lrate']
    
    # Target 생성
    data['Target'] = data['close'].shift(-1)
    data = data.dropna()
    
    # 날짜기준으로 재정렬
    data = data.sort_values(by=['date'])

    # 날짜 인덱싱
    data = data.set_index('date')
    
    # 데이터 실수화
    data['open'] = data['open'].apply(lambda x : np.float(x))
    data['high'] = data['high'].apply(lambda x : np.float(x))
    data['low'] = data['low'].apply(lambda x : np.float(x))
    data['close'] = data['close'].apply(lambda x : np.float(x))
    data['trading_volume'] = data['trading_volume'].apply(lambda x : np.float(x))
    data['score'] = data['score'].apply(lambda x : np.float(x))
    data['index'] = data['index'].apply(lambda x : np.float(x))
    data['probability'] = data['probability'].apply(lambda x : np.float(x))
    data['l1'] = data['l1'].apply(lambda x : np.float(x))
    data['l2'] = data['l2'].apply(lambda x : np.float(x))
    data['l3'] = data['l3'].apply(lambda x : np.float(x))
    data['l4'] = data['l4'].apply(lambda x : np.float(x))
    data['lgap'] = data['lgap'].apply(lambda x : np.float(x))
    data['lrate'] = data['lrate'].apply(lambda x : np.float(x))
    data['Target'] = data['Target'].apply(lambda x : np.float(x))
            
    # Rate로 피쳐 변경
    data['open_rate'] = (data['open'] - data['open'].shift(1)) / data['open'].shift(1)
    data['high_rate'] = (data['high'] - data['high'].shift(1)) / data['high'].shift(1)
    data['low_rate'] = (data['low'] - data['low'].shift(1)) / data['low'].shift(1)
    data['close_rate'] = (data['close'] - data['close'].shift(1)) / data['close'].shift(1)
    data['trading_volume_rate'] = (data['trading_volume'] - data['trading_volume'].shift(1))
    data['score_rate'] = (data['score'] - data['score'].shift(1)) / data['score'].shift(1)
    data['index_rate'] = (data['index'] - data['index'].shift(1)) / data['index'].shift(1)
    data['probability_rate'] = (data['probability'] - data['probability'].shift(1))
    data['l1_rate'] = (data['l1'] - data['l1'].shift(1)) / data['l1'].shift(1)
    data['l2_rate'] = (data['l2'] - data['l2'].shift(1)) / data['l2'].shift(1)
    data['l3_rate'] = (data['l3'] - data['l3'].shift(1)) / data['l3'].shift(1)
    data['l4_rate'] = (data['l4'] - data['l4'].shift(1)) / data['l4'].shift(1)
    data['lgap_rate'] = (data['lgap'] - data['lgap'].shift(1))
    data['lrate_rate'] = (data['lrate'] - data['lrate'].shift(1))

    # Target 형태 변경
    data['prediction'] = 100 * (data['Target'] - data['close']) / data['close']
    
    # 결측값 제거
    data = data.dropna()
    
    # 기존변수 제거
    data = data[['open_rate', 'high_rate', 'low_rate', 'close_rate', 'trading_volume_rate',\
                 'score_rate', 'index_rate', 'probability_rate', 'l1_rate', 'l2_rate',\
                  'l3_rate', 'l4_rate', 'lgap_rate', 'lrate_rate', 'prediction']]
    
    # Train Test Split
    train = data['20170101':'20190930']
    test = data['20191001':'20191231']    
    
    # 데이터 저장
    train_dataset = train_dataset.append(train)
    test_dataset = test_dataset.append(test)
    
print("Train Dataset = {:,} obs" .format(len(train_dataset)))
print("Test Dataset = {:,} obs" .format(len(test_dataset)))

Train Dataset = 67,000 obs
Test Dataset = 6,099 obs


In [2]:
# Train
train_X = train_dataset[train_dataset.columns[:-1]]
train_Y = train_dataset[train_dataset.columns[-1:]]

# Test
test_X = test_dataset[test_dataset.columns[:-1]]
test_Y = test_dataset[test_dataset.columns[-1:]]

In [3]:
# Library
import tensorflow as tf

# Model Build
model = tf.keras.Sequential([
                             tf.keras.layers.Dense(units=52, activation='relu', input_shape=(14,)),
                             tf.keras.layers.Dense(units=39, activation='relu'),
                             tf.keras.layers.Dense(units=26, activation='relu'),
                             tf.keras.layers.Dense(units=1)
])

# Model Compile
model.compile(optimizer=tf.keras.optimizers.Adam(lr=.07), loss='mse')
model.summary()

# Model Learning
history = model.fit(train_X, train_Y, epochs=40, batch_size=1024, validation_split=0.25)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 52)                780       
_________________________________________________________________
dense_1 (Dense)              (None, 39)                2067      
_________________________________________________________________
dense_2 (Dense)              (None, 26)                1040      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 27        
Total params: 3,914
Trainable params: 3,914
Non-trainable params: 0
_________________________________________________________________
Train on 50250 samples, validate on 16750 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 1

In [4]:
# loss 시각화
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()

<Figure size 640x480 with 1 Axes>

In [5]:
test_Y['model_predicton'] = model.predict(test_X).reshape(-1)
test_Y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,prediction,model_predicton
date,Unnamed: 1_level_1,Unnamed: 2_level_1
20191001,-5.185185,-0.260597
20191002,1.562500,-0.260597
20191004,-0.128205,-0.260597
20191007,0.513479,-0.260597
20191008,0.127714,-0.260597
...,...,...
20191220,-1.175214,-0.260597
20191223,-0.540541,-0.260597
20191224,0.326087,-0.260597
20191226,1.408451,-0.260597
