In [1]:
import pandas as pd
import numpy as np

### 1. Load the Data Set

In [20]:
# Extra markers to treat as missing values; shared so both files are parsed the same way.
NA_VALUES = ['?', '??', 'N/A', 'NA', 'nan', 'NaN', '-nan', '-NaN', 'null']

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell shows a truncated warning, presumably
# pandas' mixed-dtype DtypeWarning -- confirm it disappears with this option.
kospi_data = pd.read_csv('./data/track1/final_kospi.csv', na_values=NA_VALUES, low_memory=False)
kosdaq_data = pd.read_csv('./data/track1/final_kosdaq.csv', na_values=NA_VALUES, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
# Count the KOSPI rows whose '날짜' value is greater than '2021-01-01'.
# NOTE(review): the comparison is against a string, so this presumably relies on
# ISO-formatted date strings -- confirm the column format.
after_cutoff = kospi_data['날짜'] > '2021-01-01'
len(kospi_data.loc[after_cutoff])

138355

In [22]:
# Keep only rows whose '날짜' value is greater than '2021-01-01' (the boundary value
# itself is excluded) and drop the leftover 'Unnamed: 0' column from the CSV.
processed_kospi_data = kospi_data[kospi_data['날짜'] > '2021-01-01']
processed_kospi_data = processed_kospi_data.drop(columns=['Unnamed: 0'])

# Columns treated as unnecessary here; the date ('날짜') and CODE columns are kept
# for later analysis.
unnecessary_columns = ['시가', '고가', '저가', '종가', 'code_x', 'code_y']
processed_kospi_data = processed_kospi_data.drop(columns=unnecessary_columns)
processed_kospi_data = processed_kospi_data.dropna(axis=0)  # remove rows with null values

# Drop every row that contains the '-' placeholder in any column. A single
# vectorised mask replaces re-filtering the whole frame once per column.
has_placeholder = (processed_kospi_data == '-').any(axis=1)
processed_kospi_data = processed_kospi_data[~has_placeholder].reset_index(drop=True)

# NOTE(review): `strict` is not referenced by any visible cell; it looks like a
# stray leftover. Kept in case a later cell reads it -- likely safe to delete.
strict = False

len(processed_kospi_data)

127494

### 2. Split Data Set
Split the data into 60% training, 20% validation, and 20% test sets.

In [23]:
import sklearn
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the notebook reproduces exactly the same split
# (without it every run shuffles the rows differently).
SPLIT_SEED = 42

# Features are every column except the target 'Y'; drop() already returns a new frame.
x = processed_kospi_data.drop(columns=['Y'])
y = processed_kospi_data['Y']

# First cut: 60% train / 40% remainder. Second cut: halve the remainder,
# giving 20% validation and 20% test of the full data set.
x_train, x_remain, y_train, y_remain = train_test_split(
    x, y, train_size=0.6, random_state=SPLIT_SEED
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_remain, y_remain, train_size=0.5, random_state=SPLIT_SEED
)

# The split keeps the original row labels; renumber each part from 0.
x_train, x_valid, x_test = (
    part.reset_index(drop=True) for part in (x_train, x_valid, x_test)
)
y_train, y_valid, y_test = (
    part.reset_index(drop=True) for part in (y_train, y_valid, y_test)
)

In [24]:
# Show which columns the training features contain after the split.
feature_columns = x_train.columns
print(feature_columns)

Index(['BPS', 'PER', 'PBR', 'EPS', 'DIV', 'DPS', '날짜', '거래량', '시가총액', '금리',
       'CODE', '유동자산', '비유동자산', '자산총계', '유동부채', '비유동부채', '부채총계', '이익잉여금',
       '자본총계', '매출액', '영업이익', '법인세차감전 순이익', '당기순이익', '자본금'],
      dtype='object')


### 3. Data Normalization and Visualization

In [25]:
## Normalize each feature with the mean and standard deviation computed after removing outliers via the IQR rule

def calc_statistic(x, whisker=1.5):
    """Return the outlier-trimmed mean and standard deviation of a Series.

    Values outside ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` are ignored
    when computing the statistics (with the default 1.5 this is roughly
    +/-2.7 sigma if ``x`` is Gaussian).

    Parameters
    ----------
    x : pandas.Series
        Numeric values to summarise.
    whisker : float, default 1.5
        Multiple of the IQR used to place the outlier fences.

    Returns
    -------
    tuple
        ``(mean, std)`` of the values inside the fences. ``std`` is always a
        positive number so callers can divide by it: if the trimmed spread is
        zero or undefined, the spread of the full series is used, and if that
        is also zero or undefined, ``1.0`` is returned.
    """
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1

    # Inclusive fences: with strict inequalities a column whose IQR is 0
    # (more than half of the values identical) would keep no rows at all and
    # produce NaN statistics.
    inside_fences = x.between(q1 - whisker * iqr, q3 + whisker * iqr)
    x_clean = x.loc[inside_fences]

    mean = x_clean.mean()
    std = x_clean.std()

    # `not (std > 0)` is true for both 0 and NaN.
    if not (std > 0):
        std = x.std()
    if not (std > 0):
        std = 1.0
    return (mean, std)

    
# Standardise every feature column using statistics estimated on the training
# split only; the same shift and scale are then applied to valid and test.
for col_name in x_train.columns:
    # 'CODE' and '날짜' are skipped and left unscaled.
    if col_name in ('CODE', '날짜'):
        continue

    # Cast to float in all three splits before doing arithmetic.
    for frame in (x_train, x_valid, x_test):
        frame[col_name] = frame[col_name].astype(float)

    col_mean, col_std = calc_statistic(x_train[col_name])

    # Defensive: a NaN or zero spread would silently turn the whole column
    # into NaN/inf. Fall back to the plain training statistics, and finally
    # to a unit scale for a constant column.
    if not (col_std > 0):
        col_mean = x_train[col_name].mean()
        col_std = x_train[col_name].std()
    if not (col_std > 0):
        col_std = 1.0

    for frame in (x_train, x_valid, x_test):
        frame[col_name] = (frame[col_name] - col_mean) / col_std


In [27]:
# Preview the first five rows of the normalised training features.
x_train.head(n=5)

Unnamed: 0,BPS,PER,PBR,EPS,DIV,DPS,날짜,거래량,시가총액,금리,...,유동부채,비유동부채,부채총계,이익잉여금,자본총계,매출액,영업이익,법인세차감전 순이익,당기순이익,자본금
0,-0.495113,1.256968,-0.766571,-0.548917,0.616262,-0.433568,2022-05-30,-0.693488,-0.725389,1.067388,...,-0.85158,-0.707609,-0.811188,-0.108761,-0.571487,-0.760714,-0.53673,-0.498647,-0.495902,-0.7322
1,-0.633751,0.137159,-0.169323,-0.374962,1.216071,-0.254778,2021-03-26,8.542551,-0.53226,-0.906554,...,-0.519256,-0.476089,-0.540351,-0.036868,-0.54398,1.937225,0.388199,-0.042843,-0.06968,-0.503973
2,7.747009,-0.794005,-1.168565,18.660106,3.426993,7.314013,2022-08-17,0.162462,2.636423,1.856964,...,13.15269,6.916716,9.866762,2.03426,10.296689,5.435611,7.902064,7.849587,6.368202,5.548438
3,0.426044,-0.547647,-0.215265,4.198169,0.581389,1.205344,2022-07-27,-0.790637,0.522686,1.856964,...,-0.193619,-0.538328,-0.374662,0.508705,0.792714,0.023084,0.977278,1.357815,1.156906,-0.970299
4,-0.816213,-0.932689,1.071116,-0.678304,-0.911158,-0.731552,2021-05-11,3.244534,-0.729458,-0.906554,...,-0.490316,-0.568161,-0.55547,-0.865778,-0.929122,-0.697889,-1.150816,-1.088154,-1.173132,-0.134626


In [18]:
from pathlib import Path

# Write every split to ./data/track1/features/<name>.csv without the row index.
FEATURE_DIR = Path('./data/track1/features')
# Create the output directory if needed so the export works on a fresh checkout.
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

splits_to_save = {
    'x_train': x_train,
    'x_valid': x_valid,
    'x_test': x_test,
    'y_train': y_train,
    'y_valid': y_valid,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(FEATURE_DIR / f'{split_name}.csv', index=False)