### 01. Data Loading

In [1]:
# Target Code
code_1 = 66570
code_2 = '066570'

In [2]:
# Read Dataset
import pandas as pd
dataset = pd.read_csv('./total_price.csv')

# Del Unnamed: 0
del dataset['Unnamed: 0']

# l1~l4 line Setting
for step in ['l1', 'l2', 'l3', 'l4'] : 
    dataset[step] = dataset[step].apply(lambda x : str(x)).apply(lambda x : x[1:]).apply(lambda x : float(x))

# Warning Message
import warnings
warnings.filterwarnings(action='ignore') 

# Filter code
data = dataset[dataset['code'] == code_1]

# Target Rate Create
data['target'] = data['close'].shift(-1)
data = data.dropna(axis=0)

# Data Head
data

Unnamed: 0,date,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,code,target
1099535,20160622,55600,55600,54500,54900,501106,2.044,1.571,51.176,55800.0,55250.0,54700.0,54150.0,1650.0,3,66570,55200.0
1099536,20160623,54800,55600,54400,55200,436187,2.056,1.857,57.523,56000.0,55400.0,54800.0,54200.0,1800.0,3,66570,52300.0
1099537,20160624,55900,55900,52000,52300,1291330,2.361,0.857,22.353,56050.0,54100.0,52150.0,50200.0,5850.0,11,66570,52000.0
1099538,20160627,50900,52600,50600,52000,832384,4.046,2.000,70.000,53300.0,52300.0,51300.0,50300.0,3000.0,6,66570,53000.0
1099539,20160628,51900,53400,51600,53000,557900,4.193,3.000,70.000,54100.0,53200.0,52300.0,51400.0,2700.0,5,66570,53700.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1100431,20200218,67900,70000,67700,68900,1333785,0.000,0.077,20.588,70600.0,69450.0,68300.0,67150.0,3450.0,5,66570,68800.0
1100432,20200219,69300,69400,68000,68800,635128,0.000,0.077,20.588,69800.0,69100.0,68400.0,67700.0,2100.0,3,66570,67200.0
1100433,20200220,68700,69000,66700,67200,674987,2.608,0.667,10.769,69250.0,68100.0,66950.0,65800.0,3450.0,5,66570,66100.0
1100434,20200221,66000,67000,65800,66100,529652,1.412,1.222,39.901,67150.0,66550.0,65950.0,65350.0,1800.0,3,66570,63400.0


### 02. Data Handling

In [3]:
# Input & Target Split
X = data[data.columns[1:-2]]
y = data[data.columns[-1]]

# Train & Valid Split
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=1234)

# Dataset For LightGBM
import lightgbm as lgb
train_ds = lgb.Dataset(X_train, label=y_train) 
valid_ds = lgb.Dataset(X_valid, label=y_valid) 

### 03. Model

In [4]:
# Hypter Parameter Setting
params = {'learning_rate': 0.01, 
          'max_depth': 6, 
          'boosting' : 'gbdt', 
          'objective' : 'regression', 
          'metric' : 'mse', 
          'is_training_metric' : True, 
          'num_leaves' : 144, 
          'feature_fraction' : 0.9, 
          'bagging_fraction' : 0.7, 
          'bagging_freq': 5, 
          'seed' : 1234}

# Model Learning
model = lgb.train(params, train_ds, 1000, valid_ds, verbose_eval=100, early_stopping_rounds=100)

# Train Predict
predict_train = model.predict(X_train)
predict_X_valid = model.predict(X_valid)

# mse and r^2
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_valid, predict_X_valid)
r2 = r2_score(y_valid, predict_X_valid)
print('\n\nMean squared error         :', mse)
print('Root Mean squared error    :', mse**.5)
print('R2 score                   :', r2)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 4.39794e+07
[200]	valid_0's l2: 8.6278e+06
[300]	valid_0's l2: 3.73775e+06
[400]	valid_0's l2: 2.99314e+06
[500]	valid_0's l2: 2.86169e+06
[600]	valid_0's l2: 2.82859e+06
[700]	valid_0's l2: 2.82793e+06
Early stopping, best iteration is:
[682]	valid_0's l2: 2.81203e+06


Mean squared error         : 2812032.4278049837
Root Mean squared error    : 1676.9115742355002
R2 score                   : 0.9906927608688906


### 04. Validation Predict

In [5]:
# Predict rate
import numpy as np
aaa = 100 * (np.array(y_valid) - np.array(X_valid.close)) / np.array(X_valid.close)
bbb = 100 * (model.predict(X_valid) - np.array(X_valid.close)) / np.array(X_valid.close)

# DataFrame Result
predict = pd.DataFrame({'오늘 실제 종가' : np.array(X_valid.close),
                        '내일 실제 종가' : np.array(y_valid), 
                        '내일 예측 종가' : model.predict(X_valid),
                        '실제 내일 변동(%)' : aaa,
                        '모델 예상 내일 변동(%)' : bbb})

# Result
round(predict, 2)

Unnamed: 0,오늘 실제 종가,내일 실제 종가,내일 예측 종가,실제 내일 변동(%),모델 예상 내일 변동(%)
0,87400,87900.0,86458.18,0.57,-1.08
1,70500,67800.0,69999.69,-3.83,-0.71
2,53600,51800.0,53314.72,-3.36,-0.53
3,67700,69100.0,67127.12,2.07,-0.85
4,67900,66500.0,67903.24,-2.06,0.00
...,...,...,...,...,...
86,100500,99900.0,101365.02,-0.60,0.86
87,69900,71400.0,70320.10,2.15,0.60
88,98000,97900.0,96809.68,-0.10,-1.21
89,48400,50900.0,48252.88,5.17,-0.30


### 05. Test Crawling and Predict

In [8]:
# ssl Setting
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Test Date
start_date = '200301'
end_date = '200309'

# url and html
url = "https://lab.donutz.co/krx/products/" + code_2 + "?sdate=" + start_date + "&edate=" + end_date
html = urlopen(url)  

# Crawling
bsObject = BeautifulSoup(html, "html.parser") 
soup_string = str(bsObject)

# 데이터 정규식
def clean(data):
    text = re.sub('[a-zA-Z-=+#/\?^$@*\"※~&%ㆍ!_』:\\‘|\(\)\[\]\<\>`\'{}…》]', '', data)
    return text

# Text Split
text = re.split('[,]+',clean(soup_string))
information = []
for i in text:
    information.append(i)
    
# Test Dataset Handling
test_data = pd.DataFrame(np.array(information).reshape(-1,15))
test_data.columns = ['date','open','high','low','close','trading_volume','score','index','probability','l1','l2','l3','l4','lgap','lrate']
test_data = test_data.set_index('date')
test_data['open'] = test_data['open'].apply(lambda x : float(x))
test_data['high'] = test_data['high'].apply(lambda x : float(x))
test_data['low'] = test_data['low'].apply(lambda x : float(x))
test_data['close'] = test_data['close'].apply(lambda x : float(x))
test_data['trading_volume'] = test_data['trading_volume'].apply(lambda x : float(x))
test_data['score'] = test_data['score'].apply(lambda x : float(x))
test_data['index'] = test_data['index'].apply(lambda x : float(x))
test_data['probability'] = test_data['probability'].apply(lambda x : float(x))
test_data['l1'] = test_data['l1'].apply(lambda x : str(x)).apply(lambda x : x[1:]).apply(lambda x : float(x))
test_data['l2'] = test_data['l2'].apply(lambda x : str(x)).apply(lambda x : x[1:]).apply(lambda x : float(x))
test_data['l3'] = test_data['l3'].apply(lambda x : str(x)).apply(lambda x : x[1:]).apply(lambda x : float(x))
test_data['l4'] = test_data['l4'].apply(lambda x : str(x)).apply(lambda x : x[1:]).apply(lambda x : float(x))
test_data['lgap'] = test_data['lgap'].apply(lambda x : float(x))
test_data['lrate'] = test_data['lrate'].apply(lambda x : float(x))

# DataFrame Setting
X_test = pd.DataFrame(test_data)

# Test Data Head
X_test  

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
20200302,60300.0,61200.0,59600.0,60100.0,700278.0,12600.0,21.0,79.773756,61450.0,60650.0,59850.0,59050.0,2400.0,4.0
20200303,61500.0,61700.0,60000.0,60200.0,584352.0,12200.0,20.0,79.750623,61800.0,60950.0,60100.0,59250.0,2550.0,4.0
20200304,60100.0,61200.0,60100.0,60600.0,518796.0,12200.0,20.0,79.750623,61450.0,60900.0,60350.0,59800.0,1650.0,3.0
20200305,61300.0,61900.0,60800.0,61500.0,571762.0,12200.0,20.0,79.750623,62250.0,61700.0,61150.0,60600.0,1650.0,3.0
20200306,60900.0,61100.0,60200.0,60300.0,541454.0,12200.0,20.0,79.750623,61150.0,60700.0,60250.0,59800.0,1350.0,2.0


In [9]:
# Predict rate

ccc = 100 * (np.array(X_test.close.shift(-1)) - np.array(X_test.close)) / np.array(X_test.close)
ddd = 100 * (model.predict(X_test) - np.array(X_test.close)) / np.array(X_test.close)

# DataFrame Result
predict = pd.DataFrame({'오늘 실제 종가' : np.array(X_test.close),
                        '내일 실제 종가' : np.array(X_test.close.shift(-1)),
                        '내일 예측 종가' : model.predict(X_test),
                        '실제 내일 변동(%)' : ccc,
                        '모델 예상 내일 변동(%)' : ddd})
# Result
round(predict, 2)

Unnamed: 0,오늘 실제 종가,내일 실제 종가,내일 예측 종가,실제 내일 변동(%),모델 예상 내일 변동(%)
0,60100.0,60200.0,60574.19,0.17,0.79
1,60200.0,60600.0,60969.72,0.66,1.28
2,60600.0,61500.0,60550.76,1.49,-0.08
3,61500.0,60300.0,62155.74,-1.95,1.07
4,60300.0,,61018.34,,1.19
