**1. 데이터 불러오기**

In [1]:
# 데이터 불러오기
import pandas as pd
original_data = pd.read_csv('./total_price.csv')

# 불필요한 열 제거
del original_data['Unnamed: 0']

# 날짜를 인덱스로 변경
original_data = original_data.set_index('date')

# l1 ~ l4 Line Setting
original_data['l1'] = original_data['l1'].apply(lambda x : str(x)).apply(lambda x : x[1:])
original_data['l2'] = original_data['l2'].apply(lambda x : str(x)).apply(lambda x : x[1:])
original_data['l3'] = original_data['l3'].apply(lambda x : str(x)).apply(lambda x : x[1:])
original_data['l4'] = original_data['l4'].apply(lambda x : str(x)).apply(lambda x : x[1:])

# Data Head
original_data

Unnamed: 0_level_0,open,high,low,close,trading_volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
20160622,10150,10150,9780,9830,315346,2.586,0.538,2.477,10175.0,9990.0,9805.0,9620.0,555.0,6,20
20160623,9710,9870,9510,9730,293348,2.778,1.429,47.114,9980.0,9800.0,9620.0,9440.0,540.0,6,20
20160624,9840,9910,8700,9080,621895,4.162,0.960,27.960,10100.0,9495.0,8890.0,8285.0,1815.0,20,20
20160627,8750,9480,8750,9400,334886,3.940,0.880,23.643,9805.0,9440.0,9075.0,8710.0,1095.0,12,20
20160628,9210,9770,9210,9760,282254,3.940,0.880,23.643,10045.0,9765.0,9485.0,9205.0,840.0,9,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200219,14750,14880,14750,14850,1090,2857.000,10.000,79.010,14930.0,14865.0,14800.0,14735.0,195.0,1,241180
20200220,14850,15005,14790,14850,3900,2732.000,20.000,79.751,15035.0,14927.5,14820.0,14712.5,322.5,2,241180
20200221,14695,14845,14695,14845,1706,2722.000,20.000,79.751,14920.0,14845.0,14770.0,14695.0,225.0,2,241180
20200224,14650,14680,14560,14625,489,2687.000,6.000,77.297,14712.5,14652.5,14592.5,14532.5,180.0,1,241180


**2. 데이터 전처리**

In [2]:
# 라이브러리
import numpy as np
import pandas as pd

# 종목 리스트
codes = original_data.code.unique()

# 최종 데이터프레임을 저장할 공간
dataset = pd.DataFrame()

# 중간 결과 집계
COUNT = 0

# 종목별 전처리 진행
for code in codes : 
    
    # 날짜별로 데이터 재정렬
    data = original_data[original_data['code'] == code].reset_index().sort_values(by=['date'])
    
    # open, high, low, close 를 모두 실수화
    data['open'] = data['open'].apply(lambda x : np.float(x))
    data['high'] = data['high'].apply(lambda x : np.float(x))
    data['low'] = data['low'].apply(lambda x : np.float(x))
    data['close'] = data['close'].apply(lambda x : np.float(x))

    # Zone 지정
    data['zone']=100*(data['close']-data['close'].shift(-1))/data['close'].shift(-1)
    
    def zone(x) : 
        if x < -10 : return "-30 ~ -10"
        elif x < -5 : return "-10 ~ -3"
        elif x < -1 : return "-3 ~ -1"
        elif x < +1 : return "-1 ~ +1"
        elif x < +5 : return "+1 ~ +3"
        elif x < +10 : return "+3 ~ +10"
        elif x < +20 : return "+10 ~ +20"
        else : return "+10 ~ +30"
            
    data['zone'] = data['zone'].apply(lambda x : zone(x))
    
    # open과 high와의 차이를 open으로 나누고 반올림
    data['open_high'] = (data['high'] - data['open']) / data['open']

    # open과 low와의 차이를 open으로 나누고 반올림
    data['open_low'] = (data['low'] - data['open']) / data['open']
    
    # open과 close의 차이를 open으로 나누고 반올림
    data['open_close'] = (data['close'] - data['open']) / data['open']
    
    # open 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['open'] = 100 * (data['open']-data['open'].shift(1)) / data['open'].shift(1)
    
    # high 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['high'] = 100*(data['high']-data['high'].shift(1)) / data['high'].shift(1)

    # low 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['low'] = 100*(data['low']-data['low'].shift(1))/data['low'].shift(1)

    # close 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['close'] = 100*(data['close']-data['close'].shift(1))/data['close'].shift(1)

    # trading_volume 실수화한 다음, 반올림 (로그변환 하기 전에 +1 해줌)
    data['volume'] = data['trading_volume'].apply(lambda x : np.float(x))
    data['volume'] = data['volume'].apply(lambda x : np.log(x + 1))    

    # Score는 실수화하고, 로그변환만 진행하고 반올림 (로그변환 하기 전에 +1 해줌)
    data['score'] = data['score'].apply(lambda x : np.float(x))
    data['score'] = data['score'].apply(lambda x : np.log(x +1))

    # index는 실수화하고, 로그변환만 진행하고 반올림 (로그변환 하기 전에 +1 해줌)
    data['index'] = data['index'].apply(lambda x : np.float(x))
    data['index'] = data['index'].apply(lambda x : np.log(x + 1))
    
    # probability는 그대로 이용하고 반올림
    data['probability'] = data['probability'].apply(lambda x : np.float(x))

    # l1 line 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['l1'] = data['l1'].apply(lambda x : np.float(x))
    data['l1'] = 100*(data['l1']-data['l1'].shift(1))/data['l1'].shift(1)
    data['l1'] = data['l1'].apply(lambda x : np.float(x))
    
    # l2 line 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['l2'] = data['l2'].apply(lambda x : np.float(x))
    data['l2'] = 100*(data['l2']-data['l2'].shift(1))/data['l2'].shift(1)
    data['l2'] = data['l2'].apply(lambda x : np.float(x))

    # l3 line 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['l3'] = data['l3'].apply(lambda x : np.float(x))
    data['l3'] = 100*(data['l3']-data['l3'].shift(1))/data['l3'].shift(1)
    data['l3'] = data['l3'].apply(lambda x : np.float(x))

    # l1 line 가격을 전일대비 변동폭으로 계산하고, 실수화한 다음, 반올림
    data['l4'] = data['l4'].apply(lambda x : np.float(x))
    data['l4'] = 100*(data['l4']-data['l4'].shift(1))/data['l4'].shift(1)
    data['l4'] = data['l4'].apply(lambda x : np.float(x))

    # lgap는 로그변환만하고, 반올림 (로그변환 하기 전에 +1 해줌)
    data['lgap'] = data['lgap'].apply(lambda x : np.float(x))
    data['lgap'] = data['lgap'].apply(lambda x : np.log(x + 1))

    # lrate는 그대로 이용
    data['lrate']=data['lrate'].apply(lambda x : np.float(x))

    # 모델에 불필요한 변수 제거
    del data['date']
    del data['code']
    
    # 전처리 과정에서 생긴 결측값 제거
    data = data.dropna()
    
    # 최종 데이터셋에 추가
    dataset = dataset.append(data)
    
    # data 리셋
    data = pd.DataFrame()
    
    # 중간결과 출력
    COUNT += 1
    if COUNT % 100 == 0 : 
        print('{}번째 종목 전처리 진행중.....' .format(COUNT))
        
# 이상값 제거 (전일대비 OHLC가 35% 이상 뛴 종목)
dataset = dataset[dataset['open'] < 35.00]
dataset = dataset[dataset['high'] < 35.00]
dataset = dataset[dataset['low'] < 35.00]
dataset = dataset[dataset['close'] < 35.00]

# # 데이터프레임 순서 변경
dataset = dataset[['open', 'high', 'low', 'close', 'volume', 'score', 'index', 'probability',\
                   'l1', 'l2', 'l3', 'l4', 'lgap', 'lrate', 'open_high', 'open_low', 'open_close', 'zone']]

100번째 종목 전처리 진행중.....
200번째 종목 전처리 진행중.....
300번째 종목 전처리 진행중.....
400번째 종목 전처리 진행중.....
500번째 종목 전처리 진행중.....
600번째 종목 전처리 진행중.....
700번째 종목 전처리 진행중.....
800번째 종목 전처리 진행중.....
900번째 종목 전처리 진행중.....
1000번째 종목 전처리 진행중.....
1100번째 종목 전처리 진행중.....
1200번째 종목 전처리 진행중.....
1300번째 종목 전처리 진행중.....
1400번째 종목 전처리 진행중.....
1500번째 종목 전처리 진행중.....
1600번째 종목 전처리 진행중.....
1700번째 종목 전처리 진행중.....
1800번째 종목 전처리 진행중.....
1900번째 종목 전처리 진행중.....
2000번째 종목 전처리 진행중.....
2100번째 종목 전처리 진행중.....


**3. 데이터 전처리 결과**

In [3]:
dataset

Unnamed: 0,open,high,low,close,volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,open_high,open_low,open_close,zone
1,-4.334975,-2.758621,-2.760736,-1.017294,12.589118,1.329195,0.887480,47.114,-1.916462,-1.901902,-1.886792,-1.871102,6.293419,6.0,0.016478,-0.020597,0.002060,+3 ~ +10
2,1.338826,0.405268,-8.517350,-6.680370,13.340528,1.641324,0.672944,27.960,1.202405,-3.112245,-7.588358,-12.235169,7.504392,20.0,0.007114,-0.115854,-0.077236,-3 ~ -1
3,-11.077236,-4.339051,0.574713,3.524229,12.721548,1.597365,0.631272,23.643,-2.920792,-0.579252,2.080990,5.129753,6.999422,12.0,0.083429,0.000000,0.074286,-3 ~ -1
4,5.257143,3.059072,5.257143,3.829787,12.550566,1.597365,0.631272,23.643,2.447731,3.442797,4.517906,5.683123,6.734592,9.0,0.060803,0.000000,0.059718,-1 ~ +1
5,6.948969,3.377687,5.320304,-0.102459,12.772218,1.597365,0.631272,23.643,0.796416,1.638505,2.530311,3.476372,6.398595,6.0,0.025381,-0.015228,-0.010152,-3 ~ -1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
897,-0.371496,-0.468227,-0.169205,0.337838,6.994850,7.957877,2.397895,79.010,-0.217210,-0.067227,0.084531,0.238095,5.278115,1.0,0.008814,0.000000,0.006780,-1 ~ +1
898,0.677966,0.840054,0.271186,0.000000,8.268988,7.913155,3.044522,79.751,0.703282,0.420451,0.135135,-0.152698,5.779199,2.0,0.010438,-0.004040,0.000000,-1 ~ +1
899,-1.043771,-1.066311,-0.642326,-0.033670,7.442493,7.909489,3.044522,79.751,-0.764882,-0.552671,-0.337382,-0.118946,5.420535,2.0,0.010208,0.000000,0.010208,+1 ~ +3
900,-0.306227,-1.111485,-0.918680,-1.481980,6.194405,7.896553,1.945910,77.297,-1.390751,-1.296733,-1.201760,-1.105818,5.198497,1.0,0.002048,-0.006143,-0.001706,-3 ~ -1


In [4]:
dataset['zone'].value_counts()

-1 ~ +1      917587
+1 ~ +3      464532
-3 ~ -1      400994
+3 ~ +10      49751
-10 ~ -3      45136
-30 ~ -10     12382
+10 ~ +20      7819
+10 ~ +30      3598
Name: zone, dtype: int64

In [5]:
A = 100 *dataset['zone'].value_counts()[0] / 1901799
B = 100 *dataset['zone'].value_counts()[1] / 1901799
C = 100 *dataset['zone'].value_counts()[2] / 1901799
D = 100 *dataset['zone'].value_counts()[3] / 1901799
E = 100 *dataset['zone'].value_counts()[4] / 1901799
F = 100 *dataset['zone'].value_counts()[5] / 1901799
G = 100 *dataset['zone'].value_counts()[6] / 1901799
H = 100 *dataset['zone'].value_counts()[7] / 1901799

print('-1 ~ +1 비율 = {:.2f}%' .format(A))
print('+1 ~ +3 비율 = {:.2f}%' .format(B))
print('-3 ~ -1 비율 = {:.2f}%' .format(C))
print('+3 ~ +10 비율 = {:.2f}%' .format(D))
print('-10 ~ -3 비율 = {:.2f}%' .format(E))
print('-30 ~ -10  비율 = {:.2f}%' .format(F))
print('+10 ~ +20 비율 = {:.2f}%' .format(G))
print('+10 ~ +30 비율 = {:.2f}%' .format(H))

-1 ~ +1 비율 = 48.25%
+1 ~ +3 비율 = 24.43%
-3 ~ -1 비율 = 21.08%
+3 ~ +10 비율 = 2.62%
-10 ~ -3 비율 = 2.37%
-30 ~ -10  비율 = 0.65%
+10 ~ +20 비율 = 0.41%
+10 ~ +30 비율 = 0.19%


In [6]:
round(dataset.describe(), 2)

Unnamed: 0,open,high,low,close,volume,score,index,probability,l1,l2,l3,l4,lgap,lrate,open_high,open_low,open_close
count,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0,1901799.0
mean,-0.0,0.01,-0.01,0.01,10.79,3.29,1.47,52.5,0.03,0.0,-0.01,0.01,5.74,5.55,0.02,-0.02,-0.0
std,3.16,3.34,2.64,2.95,2.79,3.18,1.08,26.68,3.83,2.86,2.45,3.11,1.61,4.88,0.03,0.02,0.03
min,-94.54,-93.68,-95.92,-95.92,0.0,0.0,0.01,0.0,-93.68,-94.8,-95.92,-97.04,0.0,0.0,0.0,-0.46,-0.45
25%,-1.34,-1.16,-0.98,-1.16,9.69,0.85,0.69,30.0,-1.34,-1.1,-0.92,-1.05,4.86,3.0,0.0,-0.03,-0.01
50%,0.0,-0.11,0.0,0.0,11.19,1.8,1.05,57.52,-0.25,-0.16,0.0,0.1,5.71,4.0,0.01,-0.01,0.0
75%,1.17,0.79,0.94,0.96,12.47,6.4,2.2,78.46,0.77,0.71,0.93,1.18,6.72,7.0,0.02,-0.0,0.01
max,34.99,32.48,34.89,32.23,20.55,15.15,4.62,79.99,52.79,31.0,32.4,76.88,12.62,128.0,0.81,0.0,0.67


In [7]:
pd.pivot_table(dataset, index=['zone'], values=['open','high','low','close'], aggfunc='mean')

Unnamed: 0_level_0,close,high,low,open
zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
+1 ~ +3,0.06861,-0.006569,-0.014094,0.015655
+10 ~ +20,1.017318,1.637341,-0.106365,0.77283
+10 ~ +30,-0.387899,-0.096567,-1.704367,-1.419056
+3 ~ +10,-0.013288,0.198751,-0.21654,0.099057
-1 ~ +1,0.004149,-0.018515,0.024138,0.009858
-10 ~ -3,0.220833,0.381341,0.066822,0.094208
-3 ~ -1,-0.146607,-0.044965,-0.076692,-0.095751
-30 ~ -10,1.836947,1.934108,0.513892,0.565828


In [8]:
pd.pivot_table(dataset, index=['zone'], values=['volume','score','index','probability'], aggfunc='mean')

Unnamed: 0_level_0,index,probability,score,volume
zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
+1 ~ +3,1.465461,52.320408,3.21353,11.398407
+10 ~ +20,1.503085,53.038595,3.040934,13.154365
+10 ~ +30,1.470276,51.724254,3.066994,11.883696
+3 ~ +10,1.494101,52.808705,3.073277,12.616814
-1 ~ +1,1.463888,52.290396,3.346557,9.993461
-10 ~ -3,1.493578,52.893235,3.091318,12.331497
-3 ~ -1,1.507029,53.128114,3.298385,11.364539
-30 ~ -10,1.4147,51.168732,2.81227,12.85814


In [9]:
pd.pivot_table(dataset, index=['zone'], values=['l1','l2','l3','l4'], aggfunc='mean')

Unnamed: 0_level_0,l1,l2,l3,l4
zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
+1 ~ +3,0.046545,0.020194,0.018475,0.050232
+10 ~ +20,2.20298,1.291385,0.419987,-0.363776
+10 ~ +30,0.552374,-0.258868,-1.062976,-1.830542
+3 ~ +10,0.326819,0.068654,-0.138929,-0.267842
-1 ~ +1,-0.021049,-0.012865,0.00916,0.049036
-10 ~ -3,0.477039,0.281178,0.125959,0.032678
-3 ~ -1,-0.066854,-0.10554,-0.121236,-0.105685
-30 ~ -10,2.582869,1.849053,1.153338,0.532833


In [10]:
pd.pivot_table(dataset, index=['zone'], values=['lgap','lrate'], aggfunc='mean')

Unnamed: 0_level_0,lgap,lrate
zone,Unnamed: 1_level_1,Unnamed: 2_level_1
+1 ~ +3,5.98863,6.193879
+10 ~ +20,6.428202,14.63512
+10 ~ +30,6.037399,11.509728
+3 ~ +10,6.244209,10.310667
-1 ~ +1,5.460085,4.291023
-10 ~ -3,6.198901,9.24774
-3 ~ -1,5.974883,6.240864
-30 ~ -10,6.096934,12.921176


In [11]:
pd.pivot_table(dataset, index=['zone'], values=['open_high','open_low','open_close'], aggfunc='mean')

Unnamed: 0_level_0,open_close,open_high,open_low
zone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
+1 ~ +3,-0.000217,0.020967,-0.020404
+10 ~ +20,0.005608,0.055533,-0.043888
+10 ~ +30,0.010151,0.044923,-0.031506
+3 ~ +10,-0.001442,0.036422,-0.032706
-1 ~ +1,-0.000551,0.014083,-0.014537
-10 ~ -3,0.00086,0.034273,-0.027572
-3 ~ -1,-0.001902,0.021397,-0.02018
-30 ~ -10,0.014412,0.056076,-0.032531


**4. 전처리된 데이터 저장**

In [12]:
dataset.to_csv('./final_dataset.csv')