### 1. Setting

In [1]:
# Seed
from numpy.random import seed
seed(1234)
from tensorflow import set_random_seed
set_random_seed(1234)

In [2]:
# Warning Message
import warnings
warnings.filterwarnings(action='ignore') 

### 2. Data Loading

In [3]:
# Original 데이터
import pandas as pd
data_original_2017 = pd.read_csv('./data/original_data_2017.csv')
data_original_2018 = pd.read_csv('./data/original_data_2018.csv')
data_original_2019 = pd.read_csv('./data/original_data_2019.csv')

In [4]:
# 사내 창출 데이터
import pandas as pd
data_new_2017 = pd.read_csv('./data/new_data_2017.csv')
data_new_2018 = pd.read_csv('./data/new_data_2017.csv')
data_new_2019 = pd.read_csv('./data/new_data_2017.csv')

In [5]:
# Unnamed 칼럼 제거
data_original_2017 = data_original_2017[data_original_2017.columns[1:]]
data_new_2017 = data_new_2017[data_new_2017.columns[1:]]

### 3. Data Head

In [6]:
data_original_2017

Unnamed: 0,date,open,high,low,close,trading_volume,trading_value,code
0,2017-01-02 00:00:00,79905,81585,79170,79800,2730,2.082262e+08,12320
1,2017-01-03 00:00:00,80535,80535,79170,79800,3093,2.353751e+08,12320
2,2017-01-04 00:00:00,80535,80955,76965,77175,2183,1.625183e+08,12320
3,2017-01-05 00:00:00,77070,78855,75600,78225,2560,1.881497e+08,12320
4,2017-01-06 00:00:00,78225,79800,77280,77385,2386,1.782250e+08,12320
...,...,...,...,...,...,...,...,...
587160,2017-12-21 00:00:00,11550,11950,10500,10600,1305109,1.447922e+10,187870
587161,2017-12-22 00:00:00,10300,11250,9650,9960,1040672,1.074168e+10,187870
587162,2017-12-26 00:00:00,10150,10350,9410,9410,481185,4.688869e+09,187870
587163,2017-12-27 00:00:00,9540,11650,9470,9750,5397011,5.784463e+10,187870


In [7]:
data_new_2017

Unnamed: 0,date,kindex,index,score,probability,code
0,2017-01-02 00:00:00,0.000685,0.388889,0.546392,6.863271,12320
1,2017-01-03 00:00:00,0.000685,0.388889,0.546392,6.863271,12320
2,2017-01-04 00:00:00,0.001262,0.555556,0.974026,3.584906,12320
3,2017-01-05 00:00:00,0.001213,0.625000,0.948718,8.089888,12320
4,2017-01-06 00:00:00,0.001226,0.625000,0.948718,8.089888,12320
...,...,...,...,...,...,...
587160,2017-12-21 00:00:00,0.009434,1.000000,1.000000,30.000000,187870
587161,2017-12-22 00:00:00,0.010040,1.000000,1.000000,30.000000,187870
587162,2017-12-26 00:00:00,0.010627,1.000000,1.000000,30.000000,187870
587163,2017-12-27 00:00:00,0.010256,1.000000,1.000000,30.000000,187870


### 4. Code

In [8]:
CODE_LIST = data_new_2019.code.value_counts().index

### 5. Data Merge & Labeling

In [9]:
stock_data = pd.DataFrame()
df_final = pd.DataFrame()
NUM = 0

for code in CODE_LIST :
    
    # 2017
    data_original = data_original_2017[data_original_2017['code'] == code]
    data_new = data_new_2017[data_new_2017['code'] == code]
    data = pd.merge(data_original, data_new, how='left')
    data = data[['date','trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability', 'close']]
    df_final = df_final.append(data)
    
    # 2018
    data_original = data_original_2018[data_original_2018['code'] == code]
    data_new = data_new_2018[data_new_2018['code'] == code]
    data = pd.merge(data_original, data_new, how='left')
    data = data[['date','trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability', 'close']]
    df_final = df_final.append(data)
    
    # 2019
    data_original = data_original_2019[data_original_2019['code'] == code]
    data_new = data_new_2019[data_new_2019['code'] == code]
    data = pd.merge(data_original, data_new, how='left')
    data = data[['date','trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability', 'close']]
    df_final = df_final.append(data)
    
    # Index Reset
    df_final = df_final.reset_index()
    
    # Data Shfit 생성    
    df_final['close5'] = df_final['close'].shift(-5)
    df_final['pred_rate'] = (100 * (df_final['close5'] - df_final['close'])) / df_final['close']
    df_final = df_final[:-5]

    # 예측값 라벨링
    def labeling(x) : 
        if x > 10.00 : 
            result = '10.00 ~ Inf'
        elif x > 5.00 : 
            result = '5.00 ~ 9.99'
        elif x > 1.00 : 
            result = '1.00 ~ 4.99'
        elif x > -1.00 : 
            result = '-1.00 ~ 1.00'
        elif x > -5.00 : 
            result = '-5.00 ~ -1.01'
        elif x > -10.00 : 
            result = '-10.00 ~ -5.01'
        else : 
            result = '-Inf ~ -10.00'
        return result

    # 함수 적용
    df_final['pred_rate_label'] = df_final['pred_rate'].apply(lambda x : labeling(x))
    
    # Feature Selecting
    df_final = df_final[['trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability', 'pred_rate_label']]

    # MinMaxScaling
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(df_final[['trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability']])
    X = scaler.transform(df_final[['trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability']])
    y = df_final[['pred_rate_label']]
    df_final_X = pd.DataFrame(X, columns=['trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability'])
    df_final_y = pd.DataFrame(y, columns=['pred_rate_label'])
    df_final = pd.concat([df_final_X, df_final_y], axis=1)
    
    # 최종 데이터셋에 추가
    stock_data = stock_data.append(df_final)
    
    # df_final Reset
    df_final = pd.DataFrame()
    
    # 경과 check
    if NUM % 100 == 0 : 
        print("{:4}번째 종목 계산 중...." .format(NUM))
    NUM += 1

   0번째 종목 계산 중....
 100번째 종목 계산 중....
 200번째 종목 계산 중....
 300번째 종목 계산 중....
 400번째 종목 계산 중....
 500번째 종목 계산 중....
 600번째 종목 계산 중....
 700번째 종목 계산 중....
 800번째 종목 계산 중....
 900번째 종목 계산 중....
1000번째 종목 계산 중....
1100번째 종목 계산 중....
1200번째 종목 계산 중....
1300번째 종목 계산 중....
1400번째 종목 계산 중....
1500번째 종목 계산 중....
1600번째 종목 계산 중....
1700번째 종목 계산 중....
1800번째 종목 계산 중....
1900번째 종목 계산 중....
2000번째 종목 계산 중....
2100번째 종목 계산 중....
2200번째 종목 계산 중....
2300번째 종목 계산 중....
2400번째 종목 계산 중....
2500번째 종목 계산 중....


In [10]:
stock_data

Unnamed: 0,trading_volume,trading_value,kindex,index,score,probability,pred_rate_label
0,0.000887,0.000284,0.002800,0.020243,0.002881,0.352102,-1.00 ~ 1.00
1,0.000965,0.000308,0.002800,0.020243,0.002881,0.352102,-5.00 ~ -1.01
2,0.001138,0.000362,0.002800,0.020243,0.002881,0.352102,-5.00 ~ -1.01
3,0.001045,0.000331,0.003382,0.027665,0.003482,0.451567,-5.00 ~ -1.01
4,0.001190,0.000376,0.005843,0.035088,0.006026,0.534200,-5.00 ~ -1.01
...,...,...,...,...,...,...,...
483,0.001143,0.000549,,,,,1.00 ~ 4.99
484,0.001683,0.000886,,,,,-1.00 ~ 1.00
485,0.002004,0.001105,,,,,1.00 ~ 4.99
486,0.011238,0.006931,,,,,-5.00 ~ -1.01


In [11]:
stock_data.describe()

Unnamed: 0,trading_volume,trading_value,kindex,index,score,probability
count,1801859.0,1801859.0,587075.0,587075.0,587075.0,587075.0
mean,0.04232418,0.0405476,0.230653,0.195274,0.217463,0.640102
std,0.0891965,0.08852779,0.333281,0.276289,0.329985,0.368791
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.003447224,0.00256976,0.000299,0.017336,0.000238,0.285012
50%,0.01239528,0.01041721,0.001612,0.059075,0.001406,0.875592
75%,0.04154743,0.03860664,0.514467,0.254089,0.467181,0.985024
max,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
import matplotlib.pyplot as plt
label_count = pd.DataFrame([stock_data.pred_rate_label.value_counts()]).T
label_count['label'] = label_count.index
plt.figure(figsize=(18,6))
import seaborn as sns
sns.set(style = 'whitegrid')
ax = sns.barplot(x='label', y='pred_rate_label', data=label_count)

In [13]:
# Label Setting
def label_to_numeric(x) : 
    if x == '-Inf ~ -10.00' : 
        result = 0
    elif x == '-10.00 ~ -5.01' : 
        result = 1
    elif x == '-5.00 ~ -1.01' : 
        result = 2
    elif x == '-1.00 ~ 1.00' : 
        result = 3
    elif x == '1.00 ~ 4.99' : 
        result = 4
    elif x == '5.00 ~ 9.99' : 
        result = 5
    else : 
        result = 6
    return result
        
# 함수 적용
stock_data['label'] = stock_data['pred_rate_label'].apply(lambda x : label_to_numeric(x))
stock_data

Unnamed: 0,trading_volume,trading_value,kindex,index,score,probability,pred_rate_label,label
0,0.000887,0.000284,0.002800,0.020243,0.002881,0.352102,-1.00 ~ 1.00,3
1,0.000965,0.000308,0.002800,0.020243,0.002881,0.352102,-5.00 ~ -1.01,2
2,0.001138,0.000362,0.002800,0.020243,0.002881,0.352102,-5.00 ~ -1.01,2
3,0.001045,0.000331,0.003382,0.027665,0.003482,0.451567,-5.00 ~ -1.01,2
4,0.001190,0.000376,0.005843,0.035088,0.006026,0.534200,-5.00 ~ -1.01,2
...,...,...,...,...,...,...,...,...
483,0.001143,0.000549,,,,,1.00 ~ 4.99,4
484,0.001683,0.000886,,,,,-1.00 ~ 1.00,3
485,0.002004,0.001105,,,,,1.00 ~ 4.99,4
486,0.011238,0.006931,,,,,-5.00 ~ -1.01,2


### LightGBM

In [14]:
# Train Test Split
from sklearn.model_selection import train_test_split
X = stock_data[['trading_volume', 'trading_value', 'kindex', 'index', 'score', 'probability']].values
y = stock_data[['label']].values
y = y.reshape(-1)
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.1, random_state=1234)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train ,test_size=0.2, random_state=1234)

# HyperParameter
params = {
    'task' : 'train',
    'boosting_type' : 'gbdt',
    'objective' : 'multiclass',
    'metric' : {'multi_logloss'},
    'num_leaves' : 63,
    'learning_rate' : 0.1,
    'feature_fraction' : 0.9,
    'bagging_fraction' : 0.9,
    'bagging_freq': 0,
    'verbose' : 1,
    'num_class' : 7,
    'save_binary' : True
}

# Model Fitting
import lightgbm as lgb
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds = 30)

[1]	valid_0's multi_logloss: 1.714
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's multi_logloss: 1.70854
[3]	valid_0's multi_logloss: 1.70381
[4]	valid_0's multi_logloss: 1.69956
[5]	valid_0's multi_logloss: 1.69571
[6]	valid_0's multi_logloss: 1.69257
[7]	valid_0's multi_logloss: 1.68969
[8]	valid_0's multi_logloss: 1.68707
[9]	valid_0's multi_logloss: 1.68479
[10]	valid_0's multi_logloss: 1.68268
[11]	valid_0's multi_logloss: 1.68077
[12]	valid_0's multi_logloss: 1.67893
[13]	valid_0's multi_logloss: 1.67746
[14]	valid_0's multi_logloss: 1.6759
[15]	valid_0's multi_logloss: 1.67448
[16]	valid_0's multi_logloss: 1.67325
[17]	valid_0's multi_logloss: 1.67211
[18]	valid_0's multi_logloss: 1.67103
[19]	valid_0's multi_logloss: 1.67007
[20]	valid_0's multi_logloss: 1.66914
[21]	valid_0's multi_logloss: 1.6683
[22]	valid_0's multi_logloss: 1.66754
[23]	valid_0's multi_logloss: 1.66681
[24]	valid_0's multi_logloss: 1.66612
[25]	valid_0's multi_logloss: 1.66549
[

KeyboardInterrupt: 

In [15]:
predictions = []
import numpy as np
for x in gbm.predict(x_train):
    predictions.append(np.argmax(x))

y_train_pred = np.array(predictions).reshape(-1)
y_train_true = y_train

from sklearn.metrics import confusion_matrix
print("== Train Obs ==")
print('{:,} obs' .format(len(x_train)), end="\n\n")
print("== Train Error Matrix ==")
print(confusion_matrix(y_train_true, y_train_pred), end="\n\n")
print("== Train Accuracy ==")
acc = (8822 + 8666 + 243487 + 175691 + 85904 + 4567 + 6722) / len(y_train)
print("train accuracy : {:.2f}%" .format(acc * 100))

NameError: name 'gbm' is not defined

In [16]:
predictions = []
for x in gbm.predict(x_valid):
    predictions.append(np.argmax(x))

y_valid_pred = np.array(predictions).reshape(-1)
y_valid_true = y_valid

from sklearn.metrics import confusion_matrix
print("== Valid Obs ==")
print('{:,} obs' .format(len(x_valid)), end="\n\n")
print("== Valid Error Matrix ==")
print(confusion_matrix(y_valid_true, y_valid_pred), end="\n\n")
print("== Valid Accuracy ==")
acc = (1605 + 1437 + 58757 + 41977 + 18551 + 634 + 1037) / len(y_valid)
print("valid accuracy : {:.2f}%" .format(acc * 100))

NameError: name 'gbm' is not defined

In [17]:
predictions = []
for x in gbm.predict(x_test):
    predictions.append(np.argmax(x))

y_test_pred = np.array(predictions).reshape(-1)
y_test_true = y_test

from sklearn.metrics import confusion_matrix
print("== Test Obs ==")
print('{:,} obs' .format(len(x_test)), end="\n\n")
print("== Test Error Matrix ==")
print(confusion_matrix(y_test_true, y_test_pred), end="\n\n")
print("== Test Accuracy ==")
acc = (828 + 802 + 32689 + 23706 + 10423 + 318 + 600) / len(y_test)
print("test accuracy : {:.2f}%" .format(acc * 100))

NameError: name 'gbm' is not defined

In [18]:
heatmap =pd.DataFrame(confusion_matrix(y_test_true, y_test_pred),
                      columns=['-Inf ~ -10.00', '-10.00 ~ -5.00', '-4.99 ~ -1.00', '-1.00 ~ 1.00', '1.00 ~ 5.00', '5.00 ~ 10.00', '10.0 ~ Inf'],
                      index = ['-Inf ~ -10.00', '-10.00 ~ -5.00', '-4.99 ~ -1.00', '-1.00 ~ 1.00', '1.00 ~ 5.00', '5.00 ~ 10.00', '10.0 ~ Inf'])

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Draw a heatmap with the numeric values in each cell
f, ax = plt.subplots(figsize=(10,6))
map = sns.heatmap(heatmap, annot=True, fmt="d", linewidths=.5, ax=ax, cmap='coolwarm')
map.set_ylim(7,0)
plt.show()

NameError: name 'confusion_matrix' is not defined

### Test 상승, 하락, 변화없음 3 Class로만 분류한 경우

In [None]:
Class3 = np.array([[828+139+3031+221+802+10309+184+205+32689, 391+1315+9556, 491+2+31+1724+12+50+4887+32+59],
                   [113+96+19175, 23706, 5181+27+44],
                   [133+117+19525+66+74+7364+73+60+4160, 9899+1516+706, 10423+36+54+2812+318+43+1580+21+600]])

print("== Up and Not and Down ==")
print(Class3, end="\n\n")

acc = (48408+23706+15887) / (48408+11262+7288+19384+23706+15252+31572+12121+15887)

print("Test 3 Class Accuracy = {:.2f}%". format(acc*100))

In [None]:
heatmap =pd.DataFrame(Class3,
                      columns=['Down', 'Not', 'Up'],
                      index = ['Down', 'Not', 'Up'])

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Draw a heatmap with the numeric values in each cell
f, ax = plt.subplots(figsize=(10,6))
map = sns.heatmap(heatmap, annot=True, fmt="d", linewidths=.5, ax=ax, cmap='coolwarm')
map.set_ylim(3,0)
plt.show()