In [155]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import LinearRegression, LogisticRegression
from tqdm import tqdm

## Get Stock List

In [2]:
# Locations of the competition files, relative to the notebook's working directory.
path = '../data'
list_name = 'stock_list.csv'
sample_name = 'sample_submission.csv'

# Load the table of target stocks.
stock_list_path = os.path.join(path, list_name)
stock_list = pd.read_csv(stock_list_path)

# Render every stock code as text, left-padded with zeros to six characters
# (presumably read_csv parsed the codes as integers and dropped leading zeros).
stock_list['종목코드'] = [str(code).zfill(6) for code in stock_list['종목코드']]
stock_list

Unnamed: 0,종목명,종목코드,상장시장
0,삼성전자,005930,KOSPI
1,SK하이닉스,000660,KOSPI
2,NAVER,035420,KOSPI
3,카카오,035720,KOSPI
4,삼성바이오로직스,207940,KOSPI
...,...,...,...
365,맘스터치,220630,KOSDAQ
366,다날,064260,KOSDAQ
367,제이시스메디칼,287410,KOSDAQ
368,크리스에프앤씨,110790,KOSDAQ


## Get Data & Modeling

In [156]:
# Data window used for both download and training.
# ('20210104' is an alternative start date that was tried previously.)
start_date = '20200106'
end_date = '20211105'

start_weekday = pd.to_datetime(start_date).weekday()    # 0 == Monday
# ISO 8601 week-of-year of end_date, as a string. This is NOT the number of
# weeks between start_date and end_date once the window spans several years.
max_weeknum = pd.to_datetime(end_date).strftime('%V')
# One row per Mon-Fri date in the window; used later as the left side of a
# merge so that weekdays without a price row still appear in the frame.
Business_days = pd.DataFrame(pd.date_range(start_date, end_date, freq='B'), columns=['Date'])

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'ISO WEEK NUMBER of "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
display(Business_days.head())

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 44
HOW MANY "Business_days" : (480, 1)


Unnamed: 0,Date
0,2020-01-06
1,2020-01-07
2,2020-01-08
3,2020-01-09
4,2020-01-10


## Baseline 모델의 구성 소개 ( Sample )

- X : (월 ~ 금) * 43주간 (`start_date = '20210104'` 기준 — 현재 설정된 `'20200106'` 에서는 학습 기간이 그만큼 더 길어짐)
- y : (다음주 월 ~ 금) * 43주간
    - y_0 : 다음주 월요일
    - y_1 : 다음주 화요일
    - y_2 : 다음주 수요일
    - y_3 : 다음주 목요일
    - y_4 : 다음주 금요일


- 이번주 월~금요일의 패턴을 학습해 다음주 월요일 ~ 금요일을 각각 예측하는 모델을 생성
    
- 이 과정을 모든 종목(370개)에 적용

In [157]:
# Preview the raw daily price frame for the first stock in the list.
# `sample_code` is defined here so the cell runs on a fresh kernel.
sample_code = stock_list.loc[0, '종목코드']
fdr.DataReader(sample_code, start=start_date, end=end_date)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-06,54900,55600,54600,55500,10278951,0.000000
2020-01-07,55700,56400,55600,55800,10009778,0.005405
2020-01-08,56200,57400,55900,56800,23501171,0.017921
2020-01-09,58400,58600,57400,58600,24102579,0.031690
2020-01-10,58800,59700,58300,59500,16000170,0.015358
...,...,...,...,...,...,...
2021-11-01,70200,70600,69900,69900,11503729,0.001433
2021-11-02,70800,72200,70700,71500,16812570,0.022890
2021-11-03,71700,71700,70100,70400,12770428,-0.015385
2021-11-04,71200,71600,70500,70600,11818987,0.002841


In [158]:
# Build the modelling frame for one example stock (row 0 of stock_list).
sample_code = stock_list.loc[0, '종목코드']

raw_prices = fdr.DataReader(sample_code, start=start_date, end=end_date).reset_index()
# Outer-join onto the business-day calendar so that weekdays with no price row
# are kept (with NaN prices) instead of being silently absent.
sample = pd.merge(Business_days, raw_prices, how='outer')
sample['weekday'] = sample['Date'].map(lambda ts: ts.weekday())
sample['weeknum'] = sample['Date'].map(lambda ts: ts.strftime('%V'))

feature_columns = 'Open	High	Low	Close	Volume	Change	weekday'.split()
# Fill gaps with the previous row's values first, then back-fill whatever is
# still missing at the very start of the frame.
sample[feature_columns] = sample[feature_columns].ffill().bfill()

sample.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change,weekday,weeknum
0,2020-01-06,54900.0,55600.0,54600.0,55500.0,10278951.0,0.0,0,2
1,2020-01-07,55700.0,56400.0,55600.0,55800.0,10009778.0,0.005405,1,2
2,2020-01-08,56200.0,57400.0,55900.0,56800.0,23501171.0,0.017921,2,2
3,2020-01-09,58400.0,58600.0,57400.0,58600.0,24102579.0,0.03169,3,2
4,2020-01-10,58800.0,59700.0,58300.0,59500.0,16000170.0,0.015358,4,2


In [159]:
# Training inputs: the feature columns of every row except the last six.
X = sample.loc[:, feature_columns].iloc[:-6].to_numpy()
print(X.shape)
X

(474, 7)


array([[ 5.49000000e+04,  5.56000000e+04,  5.46000000e+04, ...,
         1.02789510e+07,  0.00000000e+00,  0.00000000e+00],
       [ 5.57000000e+04,  5.64000000e+04,  5.56000000e+04, ...,
         1.00097780e+07,  5.40540541e-03,  1.00000000e+00],
       [ 5.62000000e+04,  5.74000000e+04,  5.59000000e+04, ...,
         2.35011710e+07,  1.79211470e-02,  2.00000000e+00],
       ...,
       [ 7.06000000e+04,  7.15000000e+04,  7.04000000e+04, ...,
         1.05282520e+07,  1.28205128e-02,  1.00000000e+00],
       [ 7.10000000e+04,  7.10000000e+04,  7.00000000e+04, ...,
         1.02953160e+07, -1.40646976e-02,  2.00000000e+00],
       [ 6.95000000e+04,  7.22000000e+04,  6.95000000e+04, ...,
         2.06449020e+07,  8.55920114e-03,  3.00000000e+00]])

In [160]:
# Training targets: Close taken one row later than the matching row of X
# (rows 1 .. n-6), i.e. y[i] is the close of the row that follows X[i].
y = sample.loc[:, 'Close'].iloc[1:-5].to_numpy()
print(y.shape)
y

(474,)


array([55800., 56800., 58600., 59500., 60000., 60000., 59000., 60700.,
       61300., 62400., 61400., 62300., 60800., 60800., 60800., 58800.,
       59100., 57200., 56400., 57200., 58900., 59500., 61100., 60400.,
       59700., 59900., 60500., 60700., 61800., 61500., 59800., 60200.,
       60000., 59200., 56800., 57900., 56500., 55900., 54200., 55000.,
       55400., 57400., 57800., 56500., 54200., 54600., 52100., 50800.,
       49950., 48900., 47300., 45600., 42950., 45400., 42500., 46950.,
       48650., 47800., 48300., 47850., 47750., 45800., 46800., 47000.,
       48700., 49600., 48600., 49100., 49250., 48300., 49000., 49000.,
       49000., 51400., 50100., 49250., 49850., 49850., 49350., 49850.,
       50100., 50000., 50000., 50000., 48500., 48500., 49200., 48800.,
       48800., 48400., 47900., 48550., 48000., 47850., 48800., 50300.,
       50000., 49950., 48750., 48850., 49250., 49900., 50400., 50700.,
       51200., 51400., 54500., 54600., 55500., 54900., 55500., 55400.,
      

In [161]:
# Prediction inputs: the feature columns of the last five rows of the frame.
# NOTE(review): these rows include their own Close, and the result is later
# compared against the Close of these same rows — looks like target leakage
# for the evaluation week; confirm which rows should feed the forecast.
x_public = sample.loc[:, feature_columns].tail(5).to_numpy()
print(x_public.shape)
x_public

(5, 7)


array([[ 7.02000000e+04,  7.06000000e+04,  6.99000000e+04,
         6.99000000e+04,  1.15037290e+07,  1.43266476e-03,
         0.00000000e+00],
       [ 7.08000000e+04,  7.22000000e+04,  7.07000000e+04,
         7.15000000e+04,  1.68125700e+07,  2.28898426e-02,
         1.00000000e+00],
       [ 7.17000000e+04,  7.17000000e+04,  7.01000000e+04,
         7.04000000e+04,  1.27704280e+07, -1.53846154e-02,
         2.00000000e+00],
       [ 7.12000000e+04,  7.16000000e+04,  7.05000000e+04,
         7.06000000e+04,  1.18189870e+07,  2.84090909e-03,
         3.00000000e+00],
       [ 7.16000000e+04,  7.16000000e+04,  7.02000000e+04,
         7.02000000e+04,  1.26677430e+07, -5.66572238e-03,
         4.00000000e+00]])

- 예측

In [162]:
# Fit an ordinary least-squares model on (X, y), then score the held-out rows.
model = LinearRegression().fit(X, y)

prediction = model.predict(x_public)
prediction

array([69983.71788649, 71606.27129622, 70356.8639272 , 70623.01926007,
       70160.0574515 ])

- 실제 Public 값

In [164]:
# Observed closing prices of the last five rows, for eyeballing against `prediction`.
sample['Close'].tail(5).to_numpy()

array([69900., 71500., 70400., 70600., 70200.])

# 전체 모델링

In [141]:
# Load the submission template: a 'Day' column plus one column per stock code.
sample_name = 'sample_submission.csv'
submission_path = os.path.join(path, sample_name)
sample_submission = pd.read_csv(submission_path)
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021-11-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021-11-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2021-11-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021-11-05,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,2021-11-29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2021-11-30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2021-12-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2021-12-02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2021-12-03,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
# Fit one LinearRegression per stock and write its five forecasts into the
# matching column of `sample_submission`.

# Loop-invariant: the columns fed to the model.
feature_columns = 'Open	High	Low	Close	Volume	Change	weekday'.split()

for code in tqdm(stock_list['종목코드'].values):
    data = fdr.DataReader(code, start=start_date, end=end_date).reset_index()
    # Outer-join onto the business-day calendar so weekdays without a price
    # row are kept as NaN rows, then filled below.
    data = pd.merge(Business_days, data, how='outer')
    data['weekday'] = data['Date'].dt.weekday
    data['weeknum'] = data['Date'].dt.strftime('%V')

    # Forward-fill gaps, then back-fill anything still missing at the start.
    data[feature_columns] = data[feature_columns].ffill().bfill()

    # Features of every row except the last six ...
    X = data[feature_columns].iloc[0:-6].to_numpy()
    # ... paired with the Close of the *following* row (one-row shift, not one week).
    y = data['Close'].iloc[1:-5].to_numpy()
    # NOTE(review): the last five rows carry their own Close as a feature;
    # confirm this is the intended input for forecasting those same days.
    x_public = data[feature_columns].iloc[-5:].to_numpy()

    model = LinearRegression()
    model.fit(X, y)

    prediction = model.predict(x_public)

    # The template holds several blocks of len(prediction) days; the same
    # forecast is repeated for each block (twice for the 10-row template).
    repeats = len(sample_submission) // len(prediction)
    # Replace the whole column instead of `.loc[:, code] = ...`: the template
    # columns are integers, and writing floats into them in place relies on
    # silent upcasting that newer pandas versions deprecate.
    sample_submission[code] = np.tile(prediction, repeats)

# Sanity check: total count of missing values left in the submission.
sample_submission.isna().sum().sum()

100%|██████████| 370/370 [00:41<00:00,  8.99it/s]


0

In [151]:
# Inspect the column labels of the filled submission frame.
sample_submission.columns

Index(['Day', '000060', '000080', '000100', '000120', '000150', '000240',
       '000250', '000270', '000660',
       ...
       '330860', '336260', '336370', '347860', '348150', '348210', '352820',
       '357780', '363280', '950130'],
      dtype='object', length=371)

In [152]:
# Re-render every stock-code column label as a zero-padded six-character
# string, leaving the first column named 'Day'.
code_labels = sample_submission.columns[1:]
columns = ['Day'] + [str(label).zfill(6) for label in code_labels]
sample_submission.columns = columns

In [153]:
# Write the submission to the current working directory, without the row index.
sample_submission.to_csv('BASELINE_Linear.csv',index=False)

In [154]:
# Display the final submission frame as a visual check.
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-01,27862.31345,35290.272181,59938.164996,146774.301623,102998.427852,16679.934449,49497.650184,84436.367378,106398.161733,...,47949.068013,51590.700134,80833.932382,36713.365924,26054.396381,52185.035569,349207.366238,263263.467501,26613.313132,17760.478588
1,2021-11-02,29444.187022,35064.369444,61641.593905,148956.365483,106917.312762,17317.93312,48974.787151,85858.703805,107479.546079,...,50552.186957,54025.724226,81635.893678,35424.167139,26556.164133,53364.540281,348167.997015,259942.343022,27147.291478,18289.047725
2,2021-11-03,30423.161629,34053.470534,61594.399227,145950.256882,110125.305665,17138.058682,50170.50944,85680.456914,106294.097167,...,49243.503168,54570.217949,82732.225967,34535.539443,25944.463356,51352.677166,346433.663974,254141.96377,26771.063682,18403.937407
3,2021-11-04,29480.332166,33772.983151,61207.344682,145637.805779,123829.397004,17709.769765,51019.190335,86970.91737,106402.935662,...,48699.611039,53854.995672,81741.968324,35518.391306,26029.195765,50645.356102,355963.743005,252343.943956,26293.907894,17395.429254
4,2021-11-05,29464.566007,33386.227986,60809.924789,144352.754108,132879.629831,17203.131544,50805.576564,88015.853936,107078.692682,...,50298.333348,53935.334414,85060.873415,35887.359441,25504.289379,50207.822765,383434.324695,249767.025564,25967.848662,17708.380549
5,2021-11-29,27862.31345,35290.272181,59938.164996,146774.301623,102998.427852,16679.934449,49497.650184,84436.367378,106398.161733,...,47949.068013,51590.700134,80833.932382,36713.365924,26054.396381,52185.035569,349207.366238,263263.467501,26613.313132,17760.478588
6,2021-11-30,29444.187022,35064.369444,61641.593905,148956.365483,106917.312762,17317.93312,48974.787151,85858.703805,107479.546079,...,50552.186957,54025.724226,81635.893678,35424.167139,26556.164133,53364.540281,348167.997015,259942.343022,27147.291478,18289.047725
7,2021-12-01,30423.161629,34053.470534,61594.399227,145950.256882,110125.305665,17138.058682,50170.50944,85680.456914,106294.097167,...,49243.503168,54570.217949,82732.225967,34535.539443,25944.463356,51352.677166,346433.663974,254141.96377,26771.063682,18403.937407
8,2021-12-02,29480.332166,33772.983151,61207.344682,145637.805779,123829.397004,17709.769765,51019.190335,86970.91737,106402.935662,...,48699.611039,53854.995672,81741.968324,35518.391306,26029.195765,50645.356102,355963.743005,252343.943956,26293.907894,17395.429254
9,2021-12-03,29464.566007,33386.227986,60809.924789,144352.754108,132879.629831,17203.131544,50805.576564,88015.853936,107078.692682,...,50298.333348,53935.334414,85060.873415,35887.359441,25504.289379,50207.822765,383434.324695,249767.025564,25967.848662,17708.380549
