### 01. Data Loading

In [1]:
# 데이터 불러오기
import pandas as pd
data = pd.read_csv('./total_price.csv')

# 005930
data = data[data['code'] == 5930]

# 불필요한 열 제거
del data['Unnamed: 0']

# 날짜를 인덱스로 변경
data = data.set_index('date')

# l1 ~ l4 Line Setting
data['l1'] = data['l1'].apply(lambda x : str(x)).apply(lambda x : x[1:])
data['l2'] = data['l2'].apply(lambda x : str(x)).apply(lambda x : x[1:])
data['l3'] = data['l3'].apply(lambda x : str(x)).apply(lambda x : x[1:])
data['l4'] = data['l4'].apply(lambda x : str(x)).apply(lambda x : x[1:])

# Float
data['l1'] = data['l1'].apply(lambda x : float(x))
data['l2'] = data['l2'].apply(lambda x : float(x))
data['l3'] = data['l3'].apply(lambda x : float(x))
data['l4'] = data['l4'].apply(lambda x : float(x))

# Feature Selecting
data = data[data.columns[:-1]]

# target 생성
data['target'] = data['close'].shift(-1) - data['close']
data['target'] = data['target'].apply(lambda x : 1 if x > 0 else 0)

# Head
data

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20160622,28920,29000,28620,28900,8948250,1.369,0.750,16.000,29140.0,28950.0,28760.0,28570.0,570.0,2,0
20160623,28880,28900,28540,28600,11256300,0.934,0.474,1.674,28930.0,28750.0,28570.0,28390.0,540.0,2,0
20160624,28900,28900,27200,28000,20552600,1.238,0.667,10.769,29300.0,28450.0,27600.0,26750.0,2550.0,9,0
20160627,28000,28100,27700,27960,11839000,1.369,0.750,16.000,28230.0,28030.0,27830.0,27630.0,600.0,2,1
20160628,27800,28080,27580,27980,10718950,1.369,0.750,16.000,28280.0,28030.0,27780.0,27530.0,750.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200219,59800,60400,59400,60200,12924455,4.871,1.000,30.000,60800.0,60300.0,59800.0,59300.0,1500.0,2,0
20200220,60700,61300,59600,60000,14503879,3.573,1.500,49.231,61500.0,60650.0,59800.0,58950.0,2550.0,4,0
20200221,58800,59800,58500,59200,13252415,3.522,1.286,42.308,60150.0,59500.0,58850.0,58200.0,1950.0,3,0
20200224,57400,58100,57400,57800,7895962,5.537,1.333,44.000,58300.0,57950.0,57600.0,57250.0,1050.0,2,1


### 02. RSI

 - RSI(Relative Strengh Index)는 주식, 선물, 옵션 등의 기술적 분석에 사용되는 보조지표이다. RSI는 **가격의 상승압력과 하락압력 간의 상대적인 강도**를 나타낸다. 1978년 미국의 월레스 와일더가 개발했다.
 
 
 - 요약해 말하면, 일정 기간 동안 가격 상승폭과 하락폭 중 어느 쪽이 더 높냐를 나타내는 것으로 **0에 가까울수록 하락 강도가 강하다**는 뜻이고 반대의 경우는 상승 강도가 강하다는 뜻이다.
 
 
 - 계산법
  - U = 전일 주가가 비교 대상 주가보다 상승했을 때 상승폭
  - D = 전일 주가가 비교 대상 주가보다 하락했을 때 하락폭
  - AU = 일정 기간동안 U의 평균
  - AD = 일정 기간동안 D의 평균
  - RS = AU / AD
  - RSI = AU / (AU + AD)
  - RSI 시그널 = RSI의 이동평균선
  
  
 - 기본 셋팅값
   - AU, AD 계산에서 사용하는 기간은 RSI 개발자의 말을 따라 **14일**로 설정.
   - RSI 시그널이라함은 RSI의 이동평균선으로 **9일** 이동평균선을 이용.

In [2]:
import numpy as np
import pandas as pd

def calcRSI(df, period) : 
    
    # df.diff 를 통해 (기준일 종가 - 기준일 전일 종가)를 계산하여 0보다 크면 증가분을, 감소했으면 0을 넣어줌.
    U = np.where(df.diff(1)['close'] > 0, df.diff(1)['close'], 0)
    
    # df.diff 를 통해 (기준일 종가 - 기준일 전일 종가)를 계산하여 0보다 작으면 감소분을, 증가했으면 0을 넣어줌.
    D = np.where(df.diff(1)['close'] < 0, df.diff(1)['close']*(-1), 0)
    
    # AU, period=14 일 동안의 U의 평균
    AU = pd.DataFrame(U, index=data.index).rolling(window=period).mean()
    
    # AD, period=14 일 동안의 D의 평균
    AD = pd.DataFrame(D, index=data.index).rolling(window=period).mean()
    
    # RSI
    RSI = AU / (AD + AU)
    
    return RSI

# 데이터에 적용
data.insert(len(data.columns), 'rsi', calcRSI(data, 14))
data.insert(len(data.columns), 'rsi_signal', data['rsi'].rolling(window=9).mean())

### 03. Momentum strategy

 - 모멘텀 전략이란 **최근 수익률이 좋았던 주식을 사는 전략**이다. 지금까지 잘 올랐던 주식이 앞으로도 올라갈 것이라는 기대로 "비싸게 사서 더 비싸게 판다." 라는 철학을 가지고 있다. 회귀 분석 계수 $R^2$값과 90일간의 지수 회귀 기울기를 곱한 값으로 모멘텀을 구한다.

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import linregress

def momentum(closes) : 
    
    # 회귀분석 준비
    returns = np.log(closes)
    x = np.arange(len(returns))
    
    # 회귀분석계수 계산
    slope, _, rvalue, _, _ = linregress(x, returns)
    
    # 모멘텀값 출력
    return ((1 + slope) ** 252) * (rvalue ** 2)
    
# 데이터에 저장
data['momentum'] = data['close'].rolling(90).apply(momentum)

### 04. 지수이동평균(Exponential Moving Average)

 - 평균이란 개념은 초등학교 때부터 들어와서 누구나 쉽게 인지하고 있습니다. 가령, 5명의 평균 나이를 계산하라고 하면 5명의 나이 합을 5로 나누어서 구합니다. 이동평균이라는 것은 5명 대신 5일로 바뀌었을 뿐 산출하는 방법은 동일합니다. 따라서 평균과 이동평균을 그냥 동일 개념으로 생각하고 넘어가기도 합니다. 특히, 지수이동평균은 최근에 높은 가중치를 주지만, 오래된 과거도 비록 낮은 영향력이지만 가중치를 두여하도록 고려한 방법입니다.
 
 
 - $EMV(t) = (1-w) \times EMV(t-1) + w \times EMV(t)$
 
 
 - $EMV(t) = (1-w) \times ((1-w) \times EMV(t-2) + w \times Price(t-1)) + w \times Price(t)$

In [4]:
data['emv3'] = data['close'].ewm(span=3).mean()
data['emv5'] = data['close'].ewm(span=5).mean()
data['emv9'] = data['close'].ewm(span=9).mean()

### 05. KDJ Stochastic 지표

In [5]:
# 일자(n,m,t)에 따른 Stochastic(KDJ)의 값을 구하기 위해 함수형태로 만듬 
def get_stochastic(df, n=15, m=5, t=3):
    
    # 입력받은 값이 dataframe이라는 것을 정의해줌
    df = pd.DataFrame(df)
    
    # n일중 최고가
    ndays_high = df.high.rolling(window=n, min_periods=1).max()
    
    # n일중 최저가
    ndays_low = df.low.rolling(window=n, min_periods=1).min()
 
    # Fast%K 계산
    kdj_k = ((df.close - ndays_low) / (ndays_high - ndays_low))*100
    # Fast%D (=Slow%K) 계산
    kdj_d = kdj_k.ewm(span=m).mean()
    # Slow%D 계산
    kdj_j = kdj_d.ewm(span=t).mean()
 
    # dataframe에 컬럼 추가
    df = df.assign(kdj_k=kdj_k, kdj_d=kdj_d, kdj_j=kdj_j).dropna()
    
    return df

# 데이터에 적용
data = get_stochastic(data)

### 06. 변수 선택 및 결측값 제거

In [6]:
data = data[['open', 'high', 'low', 'close', 'trading_volume', 'score', 'index', 'probability',\
             'l1', 'l2', 'l3', 'l4', 'lgap', 'lrate',\
             'rsi','rsi_signal','momentum','emv3','emv5','emv9','kdj_k','kdj_d','kdj_j','target']]

In [7]:
data = data.dropna()

### 07. Result Dataset

In [8]:
data.head()

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,...,rsi,rsi_signal,momentum,emv3,emv5,emv9,kdj_k,kdj_d,kdj_j,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20161101,32600,33040,32240,33040,10347400,1.889,1.278,42.016,33440.0,33040.0,...,0.714286,0.492799,0.739101,32696.050486,32466.939281,32262.091044,100.0,74.992221,64.449778,0
20161102,32800,33040,32620,32860,10099250,1.889,1.278,42.016,33160.0,32950.0,...,0.665385,0.508235,0.72789,32778.025243,32597.95952,32381.672835,92.105263,80.696568,72.573173,0
20161103,32600,32800,32120,32320,10341850,2.408,0.75,16.0,32900.0,32560.0,...,0.573034,0.517579,0.701201,32549.012622,32505.306347,32369.338268,68.421053,76.60473,74.588951,1
20161104,32100,32680,32100,32540,7103850,3.008,0.533,2.145,32900.0,32610.0,...,0.569811,0.526149,0.673603,32544.506311,32516.870898,32403.470615,78.070175,77.093212,75.841081,1
20161107,32940,33000,32680,32800,7661900,2.604,0.75,16.0,33060.0,32900.0,...,0.592058,0.539315,0.650471,32672.253155,32611.247265,32482.776492,87.5,80.562141,78.201611,1


### 08. Train Test Split

In [9]:
train = data.loc[:'20190331']
test = data.loc['20190401':]

In [10]:
print('Train Dataset = {} obs   Train Shape = {}   Train Rate = {:.2}'\
                                    .format(len(train), train.shape, len(train)/(len(train)+len(test))))
train.head()

Train Dataset = 589 obs   Train Shape = (589, 24)   Train Rate = 0.72


Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,...,rsi,rsi_signal,momentum,emv3,emv5,emv9,kdj_k,kdj_d,kdj_j,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20161101,32600,33040,32240,33040,10347400,1.889,1.278,42.016,33440.0,33040.0,...,0.714286,0.492799,0.739101,32696.050486,32466.939281,32262.091044,100.0,74.992221,64.449778,0
20161102,32800,33040,32620,32860,10099250,1.889,1.278,42.016,33160.0,32950.0,...,0.665385,0.508235,0.72789,32778.025243,32597.95952,32381.672835,92.105263,80.696568,72.573173,0
20161103,32600,32800,32120,32320,10341850,2.408,0.75,16.0,32900.0,32560.0,...,0.573034,0.517579,0.701201,32549.012622,32505.306347,32369.338268,68.421053,76.60473,74.588951,1
20161104,32100,32680,32100,32540,7103850,3.008,0.533,2.145,32900.0,32610.0,...,0.569811,0.526149,0.673603,32544.506311,32516.870898,32403.470615,78.070175,77.093212,75.841081,1
20161107,32940,33000,32680,32800,7661900,2.604,0.75,16.0,33060.0,32900.0,...,0.592058,0.539315,0.650471,32672.253155,32611.247265,32482.776492,87.5,80.562141,78.201611,1


In [11]:
print('Test Dataset = {} obs   Test Shape = {}   Test Rate = {:.2}'\
                                    .format(len(test), test.shape, len(test)/(len(train)+len(test))))
test.head()

Test Dataset = 224 obs   Test Shape = (224, 24)   Test Rate = 0.28


Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,...,rsi,rsi_signal,momentum,emv3,emv5,emv9,kdj_k,kdj_d,kdj_j,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20190401,45200,45450,44850,45050,7362129,0.807,1.0,30.0,45550.0,45250.0,...,0.528571,0.538783,0.546823,44967.664587,45011.847251,44993.978679,50.0,48.978234,50.046258,1
20190402,45550,46100,45350,45750,9480688,0.637,0.857,22.353,46300.0,45925.0,...,0.637681,0.57551,0.600299,45358.832294,45257.898168,45145.182943,67.948718,55.301728,52.673993,1
20190403,46750,46750,45800,46600,12436570,0.0,0.143,22.0,47150.0,46675.0,...,0.677419,0.589396,0.658277,45979.416147,45705.265445,45436.146354,89.74359,66.782349,59.728171,1
20190404,46150,47100,46150,46950,12650168,1.341,1.429,47.114,47500.0,47025.0,...,0.677419,0.596682,0.706439,46464.708073,46120.176963,45738.917084,96.25,76.604899,68.166535,0
20190405,46950,47550,46600,46850,8546339,0.0,0.143,22.0,47675.0,47200.0,...,0.714286,0.611865,0.742409,46657.354037,46363.451309,45961.133667,84.269663,79.15982,73.663178,0


### 09. Input & Target Split

In [12]:
X_train = train[train.columns[:-1]]
y_train = train[train.columns[-1]]

In [13]:
X_test = test[test.columns[:-1]]
y_test = test[test.columns[-1]]

### 10. Data Normalization

In [14]:
# min, std
min_train = X_train.min(axis=0)
std_train = X_train.std(axis=0)

# Train Normalization
X_train -= min_train
X_train /= std_train

# Test Normalization
X_test -= min_train
X_test /= std_train

In [15]:
train.head()

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,...,rsi,rsi_signal,momentum,emv3,emv5,emv9,kdj_k,kdj_d,kdj_j,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20161101,32600,33040,32240,33040,10347400,1.889,1.278,42.016,33440.0,33040.0,...,0.714286,0.492799,0.739101,32696.050486,32466.939281,32262.091044,100.0,74.992221,64.449778,0
20161102,32800,33040,32620,32860,10099250,1.889,1.278,42.016,33160.0,32950.0,...,0.665385,0.508235,0.72789,32778.025243,32597.95952,32381.672835,92.105263,80.696568,72.573173,0
20161103,32600,32800,32120,32320,10341850,2.408,0.75,16.0,32900.0,32560.0,...,0.573034,0.517579,0.701201,32549.012622,32505.306347,32369.338268,68.421053,76.60473,74.588951,1
20161104,32100,32680,32100,32540,7103850,3.008,0.533,2.145,32900.0,32610.0,...,0.569811,0.526149,0.673603,32544.506311,32516.870898,32403.470615,78.070175,77.093212,75.841081,1
20161107,32940,33000,32680,32800,7661900,2.604,0.75,16.0,33060.0,32900.0,...,0.592058,0.539315,0.650471,32672.253155,32611.247265,32482.776492,87.5,80.562141,78.201611,1


In [16]:
test.head()

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,...,rsi,rsi_signal,momentum,emv3,emv5,emv9,kdj_k,kdj_d,kdj_j,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20190401,45200,45450,44850,45050,7362129,0.807,1.0,30.0,45550.0,45250.0,...,0.528571,0.538783,0.546823,44967.664587,45011.847251,44993.978679,50.0,48.978234,50.046258,1
20190402,45550,46100,45350,45750,9480688,0.637,0.857,22.353,46300.0,45925.0,...,0.637681,0.57551,0.600299,45358.832294,45257.898168,45145.182943,67.948718,55.301728,52.673993,1
20190403,46750,46750,45800,46600,12436570,0.0,0.143,22.0,47150.0,46675.0,...,0.677419,0.589396,0.658277,45979.416147,45705.265445,45436.146354,89.74359,66.782349,59.728171,1
20190404,46150,47100,46150,46950,12650168,1.341,1.429,47.114,47500.0,47025.0,...,0.677419,0.596682,0.706439,46464.708073,46120.176963,45738.917084,96.25,76.604899,68.166535,0
20190405,46950,47550,46600,46850,8546339,0.0,0.143,22.0,47675.0,47200.0,...,0.714286,0.611865,0.742409,46657.354037,46363.451309,45961.133667,84.269663,79.15982,73.663178,0


### 10. Support Vector Machine

In [17]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier = RandomForestClassifier(max_depth=30, random_state=1234, min_samples_leaf=3,
                                                min_samples_split=4, n_estimators=500)
RandomForestClassifier.fit(X_train, y_train)

print('Train Accuracy =', round(RandomForestClassifier.score(X_train, y_train), 4))
print('Test Accuracy =', round(RandomForestClassifier.score(X_test, y_test), 4))

Train Accuracy = 0.9864
Test Accuracy = 0.5625
