In [11]:
import pandas as pd
import numpy as np

### 1. Load the Data Set

In [37]:
# Tokens that are parsed as missing values when reading the KOSPI file.
NA_TOKENS = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null']

kospi_data = pd.read_csv(
    './data/final_kospi.csv',
    na_values=NA_TOKENS,
)
# NOTE(review): the KOSDAQ file is read with pandas' default missing-value
# tokens only -- confirm whether it should share NA_TOKENS.
kosdaq_data = pd.read_csv('./data/final_kosdaq.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [53]:
# Columns excluded from the modelling table: open / high / low / close prices,
# the date, and two code columns.
unnecessary_columns = ['시가', '고가', '저가', '종가', '날짜', 'CODE', 'code_y']

# `drop` and `dropna` each return a new frame, so `kospi_data` stays untouched
# without any explicit deep copies.
processed_kospi_data = (
    kospi_data
    .drop(columns=unnecessary_columns)
    .dropna(axis=0)  # remove every row that contains a null value
)

# Discard rows that carry the literal '-' placeholder in any remaining column.
# One vectorized mask replaces re-filtering the whole frame once per column.
has_no_placeholder = processed_kospi_data.ne('-').all(axis=1)
processed_kospi_data = processed_kospi_data.loc[has_no_placeholder].reset_index(drop=True)

len(processed_kospi_data)

365862

### 2. Split Data Set
Split into 80% train, 10% validation, and 10% test data (an 80/20 split, after which the remaining 20% is divided in half).

In [54]:
import sklearn
from sklearn.model_selection import train_test_split

# Fixed seed so the random partition (and the CSV files written from it) is
# identical on every Restart & Run All.
SPLIT_SEED = 42

# `drop` already returns a new frame, so no explicit copy is needed.
x = processed_kospi_data.drop(columns=['Y'])
y = processed_kospi_data['Y']

# 80% goes to train; the remaining 20% is halved into 10% valid / 10% test.
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, train_size=0.8, random_state=SPLIT_SEED
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_remain, y_remain, train_size=0.5, random_state=SPLIT_SEED
)

# Give every split a fresh 0..n-1 index; the shuffled original row labels are dropped.
x_train, x_valid, x_test, y_train, y_valid, y_test = (
    part.reset_index(drop=True)
    for part in (x_train, x_valid, x_test, y_train, y_valid, y_test)
)

In [55]:
# Show which columns are used as model features.
# NOTE(review): the printed list includes 'Unnamed: 0' and 'code_x'; these look
# like a leftover index and an identifier rather than features -- TODO confirm
# they are meant to be normalized and used for training.
print(x_train.columns)

Index(['Unnamed: 0', 'BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', 'code_x', '거래량',
       '시가총액', '금리', '유동자산', '비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금',
       '자본총계', '매출액', '영업이익', '법인세차감전 순이익', '당기순이익', '자본금'],
      dtype='object')


### 3. Data Normalization and Visualization

In [56]:
## Normalize with the mean and standard deviation computed after excluding outliers via the IQR rule

def calc_statistic(x, whisker=1.5):
    """Return the mean and standard deviation of ``x`` with IQR outliers removed.

    Values outside the Tukey fences ``[Q1 - whisker*IQR, Q3 + whisker*IQR]``
    are ignored; the statistics are computed on what remains.

    Parameters
    ----------
    x : pandas.Series
        Numeric values of one feature.
    whisker : float, default 1.5
        Fence multiplier. 1.5 corresponds to roughly +/-2.7 sigma if ``x``
        were Gaussian.

    Returns
    -------
    tuple
        ``(mean, std)`` of the non-outlier values. ``std`` is replaced by 1.0
        when it is zero or undefined, so that callers can divide by it
        safely. ``mean`` is NaN only when ``x`` holds no valid values.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1

    # The fences are inclusive: with strict comparisons a column whose IQR is 0
    # (more than half of the values identical) would keep no rows at all and
    # produce NaN statistics.
    inside_fences = x.between(q1 - whisker * iqr, q3 + whisker * iqr)
    x_clean = x.loc[inside_fences]

    mean = x_clean.mean()
    std = x_clean.std()
    # A constant (or single-value) selection has no spread; fall back to unit
    # scale instead of returning 0 / NaN, which would break (x - mean) / std.
    if pd.isna(std) or std == 0:
        std = 1.0
    return (mean, std)

    
# Standardize every feature column of the three splits.
# The mean / standard deviation come from the training split only and are then
# applied unchanged to the validation and test splits.
for col_name in x_train.columns:
    # Vectorized cast to float (replaces a per-element Python `float(...)` call).
    for split in (x_train, x_valid, x_test):
        split[col_name] = split[col_name].astype(float)

    # calc_statistic returns (mean, standard deviation) of the non-outlier values.
    train_mean, train_std = calc_statistic(x_train[col_name])

    for split in (x_train, x_valid, x_test):
        split[col_name] = (split[col_name] - train_mean) / train_std


In [57]:
# Preview the first rows of the training features after normalization.
x_train.head()

Unnamed: 0.1,Unnamed: 0,BPS,PER,PBR,EPS,DIV,DPS,code_x,거래량,시가총액,...,유동부채,비유동부채,부채총계,이익잉여금,자본총계,매출액,영업이익,법인세차감전 순이익,당기순이익,자본금
0,0.627878,3.29254,0.662492,0.149602,3.138912,-0.09622,3.478891,-0.76345,-0.621206,0.414084,...,-0.82437,-0.701744,-0.798864,0.205968,0.043004,-0.836436,-0.494169,-0.44865,-0.412,-0.64381
1,1.715353,-0.733654,1.834797,26.092822,0.307427,-0.948765,-0.730663,-0.535446,9.493697,3.896698,...,-0.368302,-0.381422,-0.416038,-1.206913,-0.681806,-0.073949,-0.387263,-11.045479,-13.345662,-0.390852
2,0.620704,-0.568747,0.303763,1.067123,-0.174943,0.006086,-0.244945,7.94235,0.256509,0.148851,...,0.031655,0.590235,0.136121,-0.388547,-0.129846,-0.270381,0.73326,0.667983,0.600173,0.27085
3,-1.585192,9.206245,-0.240338,-0.089751,16.922013,-0.948765,-0.730663,-0.73009,-0.802941,-0.447128,...,-0.80055,-0.474721,-0.712978,-0.055237,-0.43964,-0.625262,-0.378495,1.361023,1.485813,-0.803196
4,1.415411,1.022191,1.443456,0.269279,0.567433,1.083702,4.126515,-0.668058,-0.66661,0.967481,...,2.960084,4.073175,3.041116,-0.290449,0.737919,0.351909,2.818256,2.204696,1.931139,1.476185


In [59]:
import os

# Folder that receives one CSV per split.
FEATURE_DIR = './data/features'

# Create the folder up front so the cell does not fail when it is missing
# (for example on a fresh checkout).
os.makedirs(FEATURE_DIR, exist_ok=True)

splits_to_save = {
    'x_train': x_train,
    'x_valid': x_valid,
    'x_test': x_test,
    'y_train': y_train,
    'y_valid': y_valid,
    'y_test': y_test,
}

# Each split is written to '<FEATURE_DIR>/<name>.csv' without the row index.
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(f'{FEATURE_DIR}/{split_name}.csv', index=False)