# Settings

## import libraries

In [1]:
import keyring
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

## connect databases

In [5]:
# Smoke test for the database connection: read the sample ETF table once.
user = 'root'
host = '192.168.219.106'

# The password is stored in the OS keyring; get_password returns None when no
# entry exists, which would otherwise be formatted into the URL as "None".
pw = keyring.get_password('macmini_db', user)
if pw is None:
    raise RuntimeError(f"no keyring entry for service 'macmini_db' and user {user!r}")

# Build the URL from components so that special characters in the password
# (e.g. '@', ':', '/') are escaped instead of corrupting the connection string.
db_url = URL.create(
    'mysql+pymysql',
    username=user,
    password=pw,
    host=host,
    port=3306,
    database='stock',
)
engine = create_engine(db_url)
try:
    price = pd.read_sql('SELECT * FROM sample_etf;', con=engine)
finally:
    # Release pooled connections even if the query fails.
    engine.dispose()

In [6]:
# Show the ETF price table that was read from the `sample_etf` table.
price

Unnamed: 0,Date,SPY,IEV,EWJ,EEM,TLT,IEF,IYR,RWX,GLD,DBC
0,1993-01-29,24.941397,,,,,,,,,
1,1993-02-01,25.118792,,,,,,,,,
2,1993-02-02,25.172007,,,,,,,,,
3,1993-02-03,25.438086,,,,,,,,,
4,1993-02-04,25.544548,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
7760,2023-11-22,455.019989,50.160000,62.490002,39.520000,90.870003,92.750000,82.250000,24.920000,184.559998,24.180000
7761,2023-11-24,455.299988,50.669998,62.610001,39.540001,89.800003,92.269997,82.550003,25.049999,185.520004,24.020000
7762,2023-11-27,454.480011,50.500000,62.560001,39.380001,91.300003,92.970001,82.849998,25.049999,186.770004,23.860001
7763,2023-11-28,454.929993,50.490002,62.549999,39.709999,91.480003,93.379997,83.339996,25.120001,189.259995,24.150000


# Collect Korean stock data

## Get the most recent business day in Korea

In [7]:
# import libraries

import requests as rq
from bs4 import BeautifulSoup

# Fetch the Naver Finance page and pull out the span that carries the date text.
url = 'https://finance.naver.com/sise/sise_deposit.nhn'
data = rq.get(url)

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits GuessedAtParserWarning), so results could
# differ between machines.
data_html = BeautifulSoup(data.content, 'html.parser')
parse_day = data_html.select_one(
    'div.subtop_sise_graph2 > ul.subtop_chart_note > li > span.tah'
).text

print(parse_day)

  |  2024.03.06


In [9]:
# regex
import re

# Keep only the digit runs of the scraped text and join them, turning
# something like '2024.03.06' into '20240306'.
biz_day = ''.join(re.findall('[0-9]+', parse_day))
biz_day

'20240306'

## Get sector category info

### crawling KOSPI from KRX

In [10]:
# get OTP

import requests as rq
from io import BytesIO
import pandas as pd

# Endpoint that issues a one-time password (OTP) for a described download.
gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'

# Form fields describing the file to be generated.
gen_otp_stk = {
    'mktId': 'STK',  # STK is the KOSPI market
    'trdDd': biz_day,
    'money': '1',
    'csvxls_isNo': 'false',
    'name': 'fileDown',
    'url': 'dbms/MDC/STAT/standard/MDCSTAT03901',
}

# Add a Referer header. The OTP comes from the first URL; when it is sent to
# the second URL without a referrer, the site treats the request as a bot.
headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader'}

# POST the form and keep only the text of the response, which is the OTP.
otp_response = rq.post(gen_otp_url, data=gen_otp_stk, headers=headers)
otp_stk = otp_response.text

print(otp_stk)

fjhTvrj2mv1NmfNJZx4OIJUx2lZOZ8cAtFn4xXE5oK8RtSksuLS7Bnxpl86F7dAOkunw9BBwugQaSjGAcH15eWv1yWsV/u36VXXLRya0RRstBgM+EFJCxYg3zco1gIgRZqIo4cIzoURnTI8+MmkJ4m8vFLhSKmM794gFu+ThsO31lY4woqehX8j6OlXFDcfHdV4NbYo4+D2Rwcfj24VnU3Zpq3ik/Dyw3FdyOXhJkBI=


In [11]:
# Exchange the OTP for the actual CSV download.
down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
down_sector_stk = rq.post(down_url, data={'code': otp_stk}, headers=headers)

# The response body is raw bytes: wrap it in a BytesIO stream so read_csv can
# consume it like a file, decoding it as EUC-KR.
csv_stream_stk = BytesIO(down_sector_stk.content)
sector_stk = pd.read_csv(csv_stream_stk, encoding='EUC-KR')

sector_stk.head()

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액
0,95570,AJ네트웍스,KOSPI,서비스업,4560,-15,-0.33,206352581040
1,6840,AK홀딩스,KOSPI,기타금융,15940,-100,-0.62,211166122340
2,27410,BGF,KOSPI,기타금융,3820,-35,-0.91,365638141620
3,282330,BGF리테일,KOSPI,유통업,130400,400,0.31,2253821342400
4,138930,BNK금융지주,KOSPI,기타금융,7690,90,1.18,2476860088220


### crawling KOSDAQ from KRX

In [12]:
# Same two-step OTP/download flow as for KOSPI, this time for KOSDAQ.
gen_otp_ksq = {
    'mktId': 'KSQ',  # KSQ is the KOSDAQ market
    'trdDd': biz_day,
    'money': '1',
    'csvxls_isNo': 'false',
    'name': 'fileDown',
    'url': 'dbms/MDC/STAT/standard/MDCSTAT03901',
}

# Step 1: request the OTP for the KOSDAQ file.
otp_response_ksq = rq.post(gen_otp_url, data=gen_otp_ksq, headers=headers)
otp_ksq = otp_response_ksq.text

# Step 2: trade the OTP for the CSV bytes and parse them as EUC-KR.
down_sector_ksq = rq.post(down_url, data={'code': otp_ksq}, headers=headers)
csv_stream_ksq = BytesIO(down_sector_ksq.content)
sector_ksq = pd.read_csv(csv_stream_ksq, encoding='EUC-KR')

sector_ksq.head()

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액
0,60310,3S,KOSDAQ,기계·장비,2790,0,0.0,135417231180
1,54620,APS,KOSDAQ,금융,6770,-50,-0.73,138068876170
2,265520,AP시스템,KOSDAQ,반도체,21350,650,3.14,326258338350
3,211270,AP위성,KOSDAQ,통신장비,16060,170,1.07,242221802240
4,139050,BF랩스,KOSDAQ,컴퓨터서비스,7490,220,3.03,64354911390


In [13]:
# Stack the KOSPI and KOSDAQ tables under a fresh 0..n-1 index, then
#  - strip surrounding whitespace from the company names, and
#  - add a 'base_date' column holding the business day the data refers to.
krx_sector = pd.concat([sector_stk, sector_ksq], ignore_index=True).assign(
    **{
        '종목명': lambda frame: frame['종목명'].str.strip(),
        'base_date': biz_day,
    }
)

krx_sector.head()

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액,base_date
0,95570,AJ네트웍스,KOSPI,서비스업,4560,-15,-0.33,206352581040,20240306
1,6840,AK홀딩스,KOSPI,기타금융,15940,-100,-0.62,211166122340,20240306
2,27410,BGF,KOSPI,기타금융,3820,-35,-0.91,365638141620,20240306
3,282330,BGF리테일,KOSPI,유통업,130400,400,0.31,2253821342400,20240306
4,138930,BNK금융지주,KOSPI,기타금융,7690,90,1.18,2476860088220,20240306


In [15]:
import requests as rq
from bs4 import BeautifulSoup
import re
from io import BytesIO
import pandas as pd

class GetKoreanStockDataSet:
    """Download KRX stock tables for the most recent business day.

    On construction the most recent business day is scraped from Naver
    Finance. The download methods then use the two-step KRX flow: request a
    one-time password (OTP) for a described file, and exchange that OTP for
    the CSV content.

    Attributes:
        biz_day: Business day as a digit string such as '20240306'.
        gen_otp_url: KRX endpoint that issues the OTP.
        mktID: Maps the market names 'KOSPI' / 'KOSDAQ' to KRX market ids.
        headers: HTTP headers (Referer) sent with every KRX request.
    """

    def __init__(self):
        self.gen_otp_url = 'http://data.krx.co.kr/comm/fileDn/GenerateOTP/generate.cmd'
        self.mktID = {
            'KOSPI': 'STK',
            'KOSDAQ': 'KSQ',
        }

        # Add a Referer header. The OTP comes from the first URL; when it is
        # sent to the second URL without a referrer, the site treats the
        # request as a bot.
        self.headers = {'Referer': 'http://data.krx.co.kr/contents/MDC/MDI/mdiLoader'}

        # Scrape the business day exactly once (this is a network request).
        self.biz_day = self.get_recent_biz_day()

    def get_recent_biz_day(self):
        """Return the date shown on the Naver Finance page as a digit string.

        Returns:
            The digits of the scraped date text joined together,
            e.g. '2024.03.06' becomes '20240306'.
        """
        url = 'https://finance.naver.com/sise/sise_deposit.nhn'
        data = rq.get(url)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries are installed.
        data_html = BeautifulSoup(data.content, 'html.parser')
        parse_day = data_html.select_one(
            'div.subtop_sise_graph2 > ul.subtop_chart_note > li > span.tah'
        ).text

        # Keep only the digit runs and concatenate them.
        return ''.join(re.findall('[0-9]+', parse_day))

    def gen_otp_krx(self, market='KOSPI'):
        """Request a KRX one-time password for the given dataset.

        Args:
            market: 'KOSPI' or 'KOSDAQ' for the per-market sector table, or
                'Ind' for the all-market per-stock table.

        Returns:
            The OTP text returned by KRX.

        Raises:
            KeyError: If ``market`` is neither 'Ind' nor a key of ``self.mktID``.
        """
        if market == 'Ind':
            gen_otp = {
                'searchType': '1',
                'mktId': 'ALL',
                'trdDd': self.biz_day,
                'csvxls_isNo': 'false',
                'name': 'fileDown',
                'url': 'dbms/MDC/STAT/standard/MDCSTAT03501'
            }
        else:
            gen_otp = {
                'mktId': self.mktID[market],  # e.g. STK is KOSPI
                'trdDd': self.biz_day,
                'money': '1',
                'csvxls_isNo': 'false',
                'name': 'fileDown',
                'url': 'dbms/MDC/STAT/standard/MDCSTAT03901'
            }

        # POST the form and keep only the response text, which is the OTP.
        otp = rq.post(self.gen_otp_url, data=gen_otp, headers=self.headers).text

        print(f"generating OTP mktID: {gen_otp['mktId']}")

        return otp

    def download_data_krx(self, market='KOSPI'):
        """Download one KRX table as a DataFrame.

        Args:
            market: Passed through to :meth:`gen_otp_krx`.

        Returns:
            The downloaded CSV parsed as a DataFrame (decoded as EUC-KR).
        """
        otp = self.gen_otp_krx(market)
        down_url = 'http://data.krx.co.kr/comm/fileDn/download_csv/download.cmd'
        down_sector = rq.post(down_url, data={'code': otp}, headers=self.headers)

        # Read ticker codes as strings: parsed as integers they lose their
        # leading zeros (e.g. '060310' would become 60310).
        sector = pd.read_csv(
            BytesIO(down_sector.content),
            encoding='EUC-KR',
            dtype={'종목코드': str},
        )

        return sector

    def get_ind_data_krx(self):
        """Download the per-stock table (market='Ind').

        Returns:
            DataFrame with company names stripped of surrounding whitespace
            and a 'date' column set to ``self.biz_day``.
        """
        krx_ind = self.download_data_krx(market='Ind')
        krx_ind['종목명'] = krx_ind['종목명'].str.strip()
        krx_ind['date'] = self.biz_day

        return krx_ind

    def get_combined_data_krx(self):
        """Merge the KOSPI/KOSDAQ sector tables with the per-stock table.

        Returns:
            Outer merge of the combined sector table and the per-stock table
            on every column the two tables share (including 'date').
        """
        sector_stk = self.download_data_krx(market='KOSPI')
        sector_ksq = self.download_data_krx(market='KOSDAQ')
        krx_sector = pd.concat([sector_stk, sector_ksq]).reset_index(drop=True)

        # Remove stray whitespace around company names.
        krx_sector['종목명'] = krx_sector['종목명'].str.strip()

        # Tag every row with the business day; the per-stock table gets the
        # same 'date' column so the merge yields a single date column.
        krx_sector['date'] = self.biz_day

        krx_ind = self.get_ind_data_krx()

        # Companies present in only one of the two tables are kept by using
        # an outer join on all shared columns.
        shared_columns = krx_sector.columns.intersection(krx_ind.columns).to_list()
        kor_ticker = pd.merge(krx_sector, krx_ind, on=shared_columns, how='outer')

        return kor_ticker
        


In [16]:
# Build the per-stock table (market='Ind') through download_data_krx, so this
# cell depends only on the class's download method.
a = GetKoreanStockDataSet()
data = a.download_data_krx(market='Ind')

# Strip whitespace around company names and tag rows with the business day.
data['종목명'] = data['종목명'].str.strip()
data['date'] = a.biz_day
data

generating OTP mktID: ALL


Unnamed: 0,종목코드,종목명,종가,대비,등락률,EPS,PER,선행 EPS,선행 PER,BPS,PBR,주당배당금,배당수익률,date
0,060310,3S,2790,0,0.00,30.0,93.00,,,947.0,2.95,0,0.00,20240306
1,095570,AJ네트웍스,4560,-15,-0.33,201.0,22.69,685.0,6.65,8076.0,0.56,270,5.92,20240306
2,006840,AK홀딩스,15940,-100,-0.62,,,,,41948.0,0.38,200,1.25,20240306
3,054620,APS,6770,-50,-0.73,505.0,13.41,,,10864.0,0.62,0,0.00,20240306
4,265520,AP시스템,21350,650,3.14,5463.0,3.91,,,17980.0,1.19,270,1.26,20240306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2615,000540,흥국화재,4755,-110,-2.26,2142.0,2.22,,,11208.0,0.42,0,0.00,20240306
2616,000545,흥국화재우,6240,-60,-0.95,,,,,,,0,0.00,20240306
2617,003280,흥아해운,2690,-40,-1.47,94.0,28.62,,,544.0,4.94,0,0.00,20240306
2618,037440,희림,6780,-50,-0.73,567.0,11.96,,,5186.0,1.31,150,2.21,20240306


In [50]:
# data.rename(columns={
#     '종목코드': 'company_code',
#     '종목명': 'company',
#     '시장구분': 'market',
#     '종가': 'close',
#     '시가총액': 'market_cap',
     
# })

Unnamed: 0,종목코드,종목명,시장구분,업종명,종가,대비,등락률,시가총액
0,60310,3S,KOSDAQ,기계·장비,2790,0,0.0,135417231180
1,54620,APS,KOSDAQ,금융,6770,-50,-0.73,138068876170
2,265520,AP시스템,KOSDAQ,반도체,21350,650,3.14,326258338350
3,211270,AP위성,KOSDAQ,통신장비,16060,170,1.07,242221802240
4,139050,BF랩스,KOSDAQ,컴퓨터서비스,7490,220,3.03,64354911390
