In [2]:
# Seed
# Fix the NumPy and TensorFlow global random seeds (both set to 1234).
from numpy.random import seed
seed(1234)
# NOTE(review): a top-level `set_random_seed` is the TensorFlow 1.x spelling;
# TensorFlow 2.x exposes `tf.random.set_seed` instead — confirm the installed version.
from tensorflow import set_random_seed
set_random_seed(1234)

In [3]:
# Warning Message
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings, so problems in later cells can go unnoticed.
import warnings
warnings.filterwarnings(action='ignore') 

In [4]:
# Data Loading
# Read the raw price table from a path relative to the working directory.
import pandas as pd
data = pd.read_csv("./data/df_final.csv")

In [5]:
# Keep only the columns used downstream: date, OHLC prices and the stock code.
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not write into a slice of the raw table (pandas would
# otherwise flag them with SettingWithCopyWarning).
data = data[["date", "open", "high", "low", "close", "code"]].copy()

In [6]:
# Left-pad every stock code with zeros to a width of six characters.
data["code"] = data["code"].map("{:0>6}".format)

In [7]:
# Cast every stock code to str.
data["code"] = data["code"].map(str)

In [8]:
# Number of observations (rows) per stock code, under a three-line banner.
print("=" * 80, "{:=^74}".format(" 각 종목별 obs 개수 "), "=" * 80, sep="\n")
print(data["code"].value_counts())

019210    733
059090    733
010600    733
150840    733
037950    733
         ... 
287310    494
287330    494
287320    494
570024    494
269620    493
Name: code, Length: 2539, dtype: int64


In [9]:
# Stock codes whose history is complete over the three-year window.
# A code counts as complete when it has as many rows as the best-covered code,
# so the selection follows the data instead of a hard-coded slice length (2323).
print("="*80)
print("{:=^67}" .format(" 3년간 데이터가 완전한 종목 코드 733obs "))
print("="*80)
code_counts = data["code"].value_counts()
# Boolean masking keeps value_counts' ordering, so the codes come out in the
# same order as the previous positional slice.
FULL_DATA_CODE = code_counts[code_counts == code_counts.max()].index
print(FULL_DATA_CODE)

Index(['019210', '059090', '010600', '150840', '037950', '123010', '065690',
       '006890', '171010', '004380',
       ...
       '036460', '140710', '065710', '070590', '160580', '114260', '170790',
       '057680', '068050', '090460'],
      dtype='object', length=2323)


In [10]:
# Restrict the table to the stocks whose three-year history is complete.
print("=" * 80, "{:=^61}".format(" 3년간의 데이터가 완전한 종목만 추출한 데이터 "), "=" * 80, sep="\n")
has_full_history = data["code"].isin(FULL_DATA_CODE)
data = data[has_full_history]
print(data)


               date   open   high    low  close    code
0        2017-01-02  79905  81585  79170  79800  012320
1        2017-01-03  80535  80535  79170  79800  012320
2        2017-01-04  80535  80955  76965  77175  012320
3        2017-01-05  77070  78855  75600  78225  012320
4        2017-01-06  78225  79800  77280  77385  012320
...             ...    ...    ...    ...    ...     ...
1759974  2019-12-23  14050  15000  13950  14300  215600
1759975  2019-12-24  14400  14400  13500  13650  215600
1759976  2019-12-26  13700  14050  13500  13900  215600
1759977  2019-12-27  13900  14300  13900  14050  215600
1759978  2019-12-30  14250  14750  14150  14550  215600

[1702759 rows x 6 columns]


In [11]:
# Build the modelling table: one (close, close5) pair per row and stock, where
# close5 is the same stock's closing price five rows earlier (rows are assumed
# to be in date order within each stock). Each stock is min-max scaled on its own.
from sklearn.preprocessing import MinMaxScaler

# Split the table by code once instead of re-scanning every row for every code.
frames_by_code = {code: frame for code, frame in data.groupby("code", sort=False)}

scaled_frames = []
for code in FULL_DATA_CODE:
    stock = frames_by_code[code]
    # Only the 5-day lag is used as a feature, so it is the only lag built here
    # (the 1-4 day lags were created and dropped again; they only affected which
    # rows survive dropna() right after a missing close).
    # assign() returns a new frame, so `data` itself is never modified.
    pairs = stock.assign(close5=stock["close"].shift(5)).dropna()[["close", "close5"]]
    scaled = MinMaxScaler().fit_transform(pairs)
    scaled_frames.append(pd.DataFrame(scaled, columns=["close", "close5"]))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated table on every iteration.
# Each stock keeps its own 0..n-1 index, as before.
if scaled_frames:
    FINAL_DATA = pd.concat(scaled_frames)
else:
    FINAL_DATA = pd.DataFrame(columns=["close", "close5"])

print("="*80)
print("{:=^66}" .format(" SHIFT 생성 : 5일 전 종가로 5일 후 종가를 예측 "))
print("="*80)
print("DATA Length : {}obs" .format(len(FINAL_DATA)))
print(FINAL_DATA[0:10])

DATA Length : 1691144obs
      close    close5
0  0.155096  0.149926
1  0.153619  0.158050
2  0.160266  0.152880
3  0.162482  0.159527
4  0.163959  0.158050
5  0.151403  0.155096
6  0.147710  0.153619
7  0.150665  0.160266
8  0.149926  0.162482
9  0.136632  0.163959


In [12]:
# Persist the scaled modelling table to disk.
# NOTE(review): to_csv writes the index as an extra unnamed column by default;
# the reload cells select ['close', 'close5'] again, which discards it.
FINAL_DATA.to_csv("./data/ALLCODE_DATA.csv")

In [13]:
# Reload the table written by the previous cell.
data = pd.read_csv("./data/ALLCODE_DATA.csv")

In [14]:
# Keep only the two modelling columns: close and close5.
data = data[['close', 'close5']]

In [15]:
# Train / test split: 10% of the rows are held out, with a fixed random_state.
# The feature is the lagged close (close5); the target is the current close.
# NOTE(review): train_test_split shuffles by default, so rows from all stocks and
# dates are mixed between the two sets — confirm this is the intended evaluation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data[["close5"]],
    data[["close"]],
    test_size=0.10,
    random_state=42,
)

In [16]:
# Fit a linear regression of close on close5 using the training rows.
from sklearn.linear_model import LinearRegression
reg = LinearRegression(n_jobs=-1)
reg = reg.fit(X_train, y_train)

In [17]:
# Report the regression's score (R^2) on the training and held-out rows.
print("=" * 80, "{:=^80}".format(" Linear Regression "), "=" * 80, sep="\n")
train_r2 = reg.score(X_train, y_train)
test_r2 = reg.score(X_test, y_test)
print("Train Score : {}".format(train_r2))
print("Test Score : {}".format(test_r2))

Train Score : 0.9276316613166372
Test Score : 0.9257146303532147


In [18]:
# RMSE (square root of the mean squared error) on the training and held-out rows.
import numpy as np
from sklearn.metrics import mean_squared_error
for template, features, target in (
    ("Train RMSE : {}", X_train, y_train),
    ("Test RMSE : {}", X_test, y_test),
):
    mse = mean_squared_error(target, reg.predict(features))
    rmse = mse ** 0.5
    print(template.format(rmse))


Train RMSE : 0.06561498432988283
Test RMSE : 0.0663062607476601


In [19]:
# Data Loading
# Re-read the raw price table; this replaces the scaled table held in `data`.
data = pd.read_csv("./data/df_final.csv")

In [20]:
# Keep only the columns used downstream: date, OHLC prices and the stock code.
# .copy() makes this an independent frame, so the column assignments in the
# following cells do not write into a slice of the raw table (pandas would
# otherwise flag them with SettingWithCopyWarning).
data = data[["date", "open", "high", "low", "close", "code"]].copy()

In [21]:
# Left-pad every stock code with zeros to a width of six characters.
data["code"] = data["code"].map("{:0>6}".format)

In [22]:
# Cast every stock code to str.
data["code"] = data["code"].map(str)

In [23]:
# Keep only the rows of SK hynix (code 000660).
print("="*80)
print("{:=^76}" .format(" SK 하이닉스 "))
print("="*80)
# .copy() detaches the filtered rows from the full table, so the lag columns
# added in the next cell are written to an independent frame rather than to a
# slice (which pandas flags with SettingWithCopyWarning).
data = data[data["code"] == "000660"].copy()
print(data)

              date   open   high    low  close    code
833420  2017-01-02  44750  46000  44600  45800  000660
833421  2017-01-03  46200  47300  46200  47250  000660
833422  2017-01-04  47000  47150  46200  46500  000660
833423  2017-01-05  47000  47500  46850  46950  000660
833424  2017-01-06  47700  48450  47600  48000  000660
...            ...    ...    ...    ...    ...     ...
834148  2019-12-23  95800  96200  94100  94600  000660
834149  2019-12-24  95000  95600  93800  93800  000660
834150  2019-12-26  94000  95100  93500  94800  000660
834151  2019-12-27  94800  97000  94200  96000  000660
834152  2019-12-30  95600  96300  94100  94100  000660

[733 rows x 6 columns]


In [24]:
# Build the 5-day lag feature for the single stock: close5 is the closing price
# five rows earlier. Only this lag is used, so the 1-4 day lags are no longer
# created just to be dropped again.
# assign() builds a new frame instead of writing columns into the filtered slice;
# dropna() then removes the leading rows that have no 5-day lag.
data = data.assign(close5=data["close"].shift(5)).dropna()[["close", "close5"]]
print("="*80)
print("{:=^76}" .format(" SK 하이닉스 SHIFT DATA "))
print("="*80)
print("DATA Length : {}obs" .format(len(data)))
print(data[0:10])

DATA Length : 728obs
        close   close5
833425  49550  45800.0
833426  49750  47250.0
833427  51600  46500.0
833428  50600  46950.0
833429  50300  48000.0
833430  49300  49550.0
833431  49300  49750.0
833432  48850  51600.0
833433  49600  50600.0
833434  49150  50300.0


In [25]:
# Min-max scale the single stock's (close, close5) pairs, each column on its own range.
# Assumes `data` holds exactly the columns close, close5 in that order.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=["close", "close5"])
# DataFrame.append was removed in pandas 2.0, and appending to an empty frame
# only ever produced a copy — take the copy directly.
FINAL_DATA = data[["close", "close5"]].copy()
print("="*80)
print("{:=^76}" .format(" SK 하이닉스  MinMaxScaler "))
print("="*80)
print("DATA Length : {}obs" .format(len(FINAL_DATA)))
print(FINAL_DATA[0:10])

DATA Length : 728obs
   close    close5
0  0.071  0.000000
1  0.075  0.029293
2  0.112  0.014141
3  0.092  0.023232
4  0.086  0.044444
5  0.066  0.075758
6  0.066  0.079798
7  0.057  0.117172
8  0.072  0.096970
9  0.063  0.090909


In [26]:
# SK hynix feature (close5) and target (close), each as a 2-D array.
X = FINAL_DATA.loc[:, ["close5"]].to_numpy()
y = FINAL_DATA.loc[:, ["close"]].to_numpy()

In [27]:
# Score (R^2) of the previously fitted regression on the SK hynix rows.
# NOTE(review): if 000660 was among the complete-history codes, most of these
# rows were also in the training split, so this may not be an out-of-sample score.
print("=" * 80, "{:=^76}".format(" SK하이닉스 Score "), "=" * 80, sep="\n")
hynix_r2 = reg.score(X, y)
print("Test Score : {}".format(hynix_r2))

Test Score : 0.9032836294674936


In [28]:
# RMSE of the previously fitted regression on the SK hynix rows.
import numpy as np
from sklearn.metrics import mean_squared_error
hynix_predictions = reg.predict(X)
mse = mean_squared_error(y, hynix_predictions)
rmse = mse ** 0.5
print("Test RMSE : {}".format(rmse))

Test RMSE : 0.07064654077620407
