### 01. Data Loading

In [1]:
# Read Dataset
# The CSV carries a column named 'Unnamed: 0', which is dropped right after
# loading (a KeyError is raised if the column is missing).
import pandas as pd
dataset = pd.read_csv('./total_price.csv').drop(columns=['Unnamed: 0'])

# l1~l4 line Setting
# Each value is turned into text, its first character is dropped, and the
# remainder is parsed as a float.
# NOTE(review): presumably the leading character is an artifact of how the
# file was produced -- confirm against the data source.
for line_col in ['l1', 'l2', 'l3', 'l4']:
    dataset[line_col] = dataset[line_col].map(lambda value: float(str(value)[1:]))

# Target Code
# code_1 (integer) filters the 'code' column below; code_2 (zero-padded
# string) is the form used when building the request URL later on.
code_1 = 47310
code_2 = '047310'

# Warning Message
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas / LightGBM; consider narrowing it.
import warnings
warnings.filterwarnings(action='ignore')

# Filter code + Target Rate Create
# `.assign` builds a new frame, so the target column is never written onto a
# slice of `dataset`. The target is the next row's close; `dropna` then removes
# every row that has a missing value in any column (this includes the final
# row, whose target is undefined).
data = (
    dataset[dataset['code'] == code_1]
    .assign(target=lambda frame: frame['close'].shift(-1))
    .dropna(axis=0)
)

# Data Head
data

Unnamed: 0,date,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,code,target
906507,20160622,4125,4180,4115,4155,236179,7.630,1.000,30.000,4200.0,4167.5,4135.0,4102.5,97.5,2,47310,4000.0
906508,20160623,4155,4155,3990,4000,488671,10.508,1.000,30.000,4160.0,4077.5,3995.0,3912.5,247.5,6,47310,3705.0
906509,20160624,4060,4105,3655,3705,915963,8.029,3.286,71.522,4130.0,3905.0,3680.0,3455.0,675.0,18,47310,3815.0
906510,20160627,3510,3850,3510,3815,479407,9.203,2.667,77.671,4002.5,3832.5,3662.5,3492.5,510.0,13,47310,3840.0
906511,20160628,3750,3885,3725,3840,370515,9.203,2.667,77.671,3942.5,3862.5,3782.5,3702.5,240.0,6,47310,4080.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
907403,20200218,10500,10500,10050,10150,747270,10.662,5.833,77.145,10550.0,10325.0,10100.0,9875.0,675.0,7,47310,10100.0
907404,20200219,10250,10300,9820,10100,966653,15.091,3.400,72.038,10440.0,10200.0,9960.0,9720.0,720.0,7,47310,10050.0
907405,20200220,10200,10400,9960,10050,957884,13.266,4.625,75.534,10445.0,10225.0,10005.0,9785.0,660.0,7,47310,9830.0
907406,20200221,9840,10100,9810,9830,704262,6150.000,67.000,79.978,10110.0,9965.0,9820.0,9675.0,435.0,4,47310,9510.0


### 02. Data Handling

In [2]:
# Input & Target Split
# Features: every column except the first one and the last two (selected by
# position). Label: the last column.
X = data[data.columns[1:-2]]
y = data[data.columns[-1]]

# Train & Valid Split
# NOTE(review): train_test_split shuffles rows by default. The rows look
# chronological, so validation rows end up interleaved with training rows in
# time -- consider an ordered split (shuffle=False). Left unchanged here so
# the reported numbers stay comparable.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=1234)

# Dataset For LightGBM
# The validation set is tied to the training set via `reference` so both are
# built against the same training Dataset, as the LightGBM docs recommend.
import lightgbm as lgb
train_ds = lgb.Dataset(X_train, label=y_train)
valid_ds = lgb.Dataset(X_valid, label=y_valid, reference=train_ds)

### 03. Model

In [3]:
# Hyperparameter settings
params = {'learning_rate': 0.01,
          'max_depth': 6,
          'boosting' : 'gbdt',
          'objective' : 'regression',
          'metric' : 'mse',
          'is_training_metric' : True,
          'num_leaves' : 144,
          'feature_fraction' : 0.9,
          'bagging_fraction' : 0.7,
          'bagging_freq': 5,
          'seed' : 1234}

# Model Learning
# Early stopping (100 rounds) and logging (every 100 rounds) are passed as
# callbacks: the `early_stopping_rounds` / `verbose_eval` keyword arguments
# were removed from lgb.train in LightGBM 4.0. Older releases expose the
# logging callback under the name `print_evaluation`, hence the fallback.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
model = lgb.train(
    params,
    train_ds,
    num_boost_round=1000,
    valid_sets=[valid_ds],
    callbacks=[lgb.early_stopping(stopping_rounds=100), log_every(period=100)],
)

# Train Predict
predict_train = model.predict(X_train)
predict_X_valid = model.predict(X_valid)

# mse and r^2
# NOTE(review): these scores are computed on the same validation set that
# drives early stopping, so they are likely optimistic.
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_valid, predict_X_valid)
r2 = r2_score(y_valid, predict_X_valid)
print('\n\nMean squared error         :', mse)
print('Root Mean squared error    :', mse**.5)
print('R2 score                   :', r2)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 770558
[200]	valid_0's l2: 148221
[300]	valid_0's l2: 65128.6
[400]	valid_0's l2: 54718.9
[500]	valid_0's l2: 54468.7
Early stopping, best iteration is:
[425]	valid_0's l2: 53913.1


Mean squared error         : 53913.08394192688
Root Mean squared error    : 232.19191187878806
R2 score                   : 0.9900236716584183


### 04. Validation Predict

In [4]:
# Predict rate
# Inference and the array conversions are done once and reused below.
import numpy as np
valid_close = np.array(X_valid.close)
valid_next_close = np.array(y_valid)
valid_next_close_pred = model.predict(X_valid)

# Percentage change from the row's close to the actual / predicted target.
aaa = 100 * (valid_next_close - valid_close) / valid_close
bbb = 100 * (valid_next_close_pred - valid_close) / valid_close

# DataFrame Result
# Column labels, in order: today's actual close, tomorrow's actual close,
# tomorrow's predicted close, actual next-day change (%), model-predicted
# next-day change (%).
predict = pd.DataFrame({'오늘 실제 종가' : valid_close,
                        '내일 실제 종가' : valid_next_close,
                        '내일 예측 종가' : valid_next_close_pred,
                        '실제 내일 변동(%)' : aaa,
                        '모델 예상 내일 변동(%)' : bbb})

# Result
predict.round(2)

Unnamed: 0,오늘 실제 종가,내일 실제 종가,내일 예측 종가,실제 내일 변동(%),모델 예상 내일 변동(%)
0,6000,5640.0,5930.76,-6.00,-1.15
1,5230,5450.0,5234.43,4.21,0.08
2,3985,3965.0,4029.06,-0.50,1.11
3,5840,5730.0,5912.81,-1.88,1.25
4,4770,4780.0,4778.30,0.21,0.17
...,...,...,...,...,...
86,5550,5450.0,5559.82,-1.80,0.18
87,7480,7460.0,7592.26,-0.27,1.50
88,5740,5770.0,5789.55,0.52,0.86
89,4405,4275.0,4513.19,-2.95,2.46


### 05. Test Crawling and Predict

In [5]:
# ssl Setting
# NOTE(review): SECURITY -- this replaces the process-wide default HTTPS
# context with an unverified one, so certificate checks are skipped for every
# urllib HTTPS request made after this line. Kept to preserve behaviour;
# prefer fixing the certificate chain or limiting this to a single request.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Test Date
# NOTE(review): these look like YYMMDD strings -- confirm the format the
# endpoint expects.
start_date = '200226'
end_date = '200305'

# url and html
url = "https://lab.donutz.co/krx/products/" + code_2 + "?sdate=" + start_date + "&edate=" + end_date
# Read the body inside a context manager so the HTTP response is always closed.
with urlopen(url) as response:
    html = response.read()

# Crawling
bsObject = BeautifulSoup(html, "html.parser")
soup_string = str(bsObject)

# Data-cleaning regular expression
# Matches ASCII letters plus a fixed set of punctuation / symbol characters.
# Digits, '.', ',' and whitespace are not in the set, so they survive cleaning.
# Written as a raw string so no backslash is misread as a string escape.
_STRIP_CHARS = re.compile(r"""[a-zA-Z\-=+#/?^$@*"※~&%ㆍ!_』:‘|()\[\]<>`'{}…》]""")

def clean(data):
    """Return ``data`` with every character matched by ``_STRIP_CHARS`` removed.

    Args:
        data: Text to clean (a ``str``).

    Returns:
        A new string containing only the characters that were not matched.
        Note that '-' is in the removed set, so a leading minus sign is lost.
    """
    return _STRIP_CHARS.sub('', data)

# Text Split
# The cleaned page text is split on runs of commas into a flat list of fields.
text = re.split('[,]+', clean(soup_string))
information = list(text)

# Test Dataset Handling
# The flat field list is reshaped into rows of 15 fields, in this column order.
test_columns = ['date','open','high','low','close','trading_volume','score','index','probability','l1','l2','l3','l4','lgap','lrate']
if len(information) % len(test_columns) != 0:
    # Fail with a readable message instead of numpy's generic reshape error.
    raise ValueError(
        'expected a multiple of %d fields after cleaning, got %d'
        % (len(test_columns), len(information))
    )
test_data = pd.DataFrame(np.array(information).reshape(-1, len(test_columns)))
test_data.columns = test_columns

# Every column except 'date' is parsed as a float. For l1~l4 the first
# character of the text is dropped before parsing.
# NOTE(review): `clean` removes letters but keeps digits, so the dropped
# character is presumably the digit left over from the field's own name
# ('l1' -> '1') -- confirm against the raw page.
prefixed_columns = ['l1', 'l2', 'l3', 'l4']
for column in test_columns[1:]:
    if column in prefixed_columns:
        test_data[column] = test_data[column].map(lambda value: float(str(value)[1:]))
    else:
        test_data[column] = test_data[column].map(float)

# DataFrame Setting
test = pd.DataFrame(test_data)

# Input Setting
# Drop the first column ('date'); the rest are the model inputs.
X_test = test[test.columns[1:]]

# Test Data Head
X_test

Unnamed: 0,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate
0,9410.0,9640.0,9350.0,9510.0,572010.0,6080.0,65.0,79.976337,9720.0,9575.0,9430.0,9285.0,435.0,5.0
1,9520.0,9760.0,8880.0,8990.0,1131178.0,7150.0,15.0,79.557522,9815.0,9375.0,8935.0,8495.0,1320.0,15.0
2,8680.0,8850.0,8470.0,8520.0,1001769.0,6430.0,73.0,79.981238,8875.0,8685.0,8495.0,8305.0,570.0,7.0
3,8520.0,9050.0,8420.0,9050.0,756360.0,8420.0,15.0,79.557522,9365.0,9050.0,8735.0,8420.0,945.0,10.0
4,9400.0,9410.0,8960.0,9070.0,668249.0,8420.0,15.0,79.557522,9465.0,9240.0,9015.0,8790.0,675.0,7.0
5,8950.0,9290.0,8930.0,9240.0,503286.0,8620.0,15.0,79.557522,9445.0,9265.0,9085.0,8905.0,540.0,6.0
6,9400.0,9400.0,9150.0,9320.0,562283.0,6460.0,73.0,79.981238,9485.0,9360.0,9235.0,9110.0,375.0,4.0


In [6]:
# Predict rate
# Select the model inputs (all columns but the first) once, and run inference
# once, instead of rebuilding both for every expression below.
test_features = test[test.columns[1:]]
test_close = np.array(test_features.close)
# Next row's close; the last row has no successor, so its value is NaN.
test_next_close = np.array(test_features.close.shift(-1))
test_next_close_pred = model.predict(test_features)

# Percentage change from the row's close to the actual / predicted next close.
ccc = 100 * (test_next_close - test_close) / test_close
ddd = 100 * (test_next_close_pred - test_close) / test_close

# DataFrame Result
# Column labels, in order: today's actual close, tomorrow's actual close,
# tomorrow's predicted close, actual next-day change (%), model-predicted
# next-day change (%).
predict = pd.DataFrame({'오늘 실제 종가' : test_close,
                        '내일 실제 종가' : test_next_close,
                        '내일 예측 종가' : test_next_close_pred,
                        '실제 내일 변동(%)' : ccc,
                        '모델 예상 내일 변동(%)' : ddd})
# Result
predict.round(2)

Unnamed: 0,오늘 실제 종가,내일 실제 종가,내일 예측 종가,실제 내일 변동(%),모델 예상 내일 변동(%)
0,9510.0,8990.0,9559.52,-5.47,0.52
1,8990.0,8520.0,9096.06,-5.23,1.18
2,8520.0,9050.0,8716.58,6.22,2.31
3,9050.0,9070.0,8942.54,0.22,-1.19
4,9070.0,9240.0,9092.25,1.87,0.25
5,9240.0,9320.0,9074.31,0.87,-1.79
6,9320.0,,9461.25,,1.52
