In [1]:
from pathlib import Path
from tqdm import tqdm
import re
import pandas as pd
from datetime import datetime, timedelta
import time
import random
import sqlite3
import QueryStockDataDB
import stockdata
import OpenDartReader
from pykrx import stock

import re

import requests
from bs4 import BeautifulSoup

In [2]:
api_key = '92c176817e681dcc4ad263eb3fa5182792b0b7a3'
dart = OpenDartReader(api_key)

In [3]:
folderPath = Path.home().joinpath('Desktop', "dataBackUp(211021)")

In [4]:
def getTickersList(folderPath):
    stockN = pd.read_parquet(folderPath/'stockNumberOfSharesDB.parquet')
    con = stockN.loc[:,'kindOfStock'] == 'commonStock'
    tickers = stockN.loc[con, 'ticker'].unique().tolist()
    return tickers

In [5]:
def getNameList(tickers, folderPath) :
    stockList = pd.read_parquet(folderPath/'stockListDB.parquet')
    names = []
    for ticker in tqdm(tickers) :
        con = stockList.loc[:, 'ticker'] == ticker
        name = stockList.loc[con, 'name'].iat[-1]
        names.append(name)
    return names

In [6]:
def getCorpCodeList(tickers) :
    corp_codes = []
    fail_corp_codes = [] 
    for ticker in tqdm(tickers) :
        try:
            corp_code = dart.find_corp_code(ticker)
            corp_codes.append(corp_code)
        except :
            fail_corp_codes.append(ticker)
    return corp_codes, fail_corp_codes

In [7]:
def createCorpCodesDf(folderPath) :
    tickers = getTickersList(folderPath)
    nameList = getNameList(tickers, folderPath)
    corpCodeList, failCorpCodeList = getCorpCodeList(tickers)
    infoDf = pd.DataFrame({'tickers' : tickers, 
                        'names' :nameList,
                        'corp_codes' :corpCodeList})
    return infoDf, failCorpCodeList

In [8]:
def getTickerNameCorpcode(k, infoDf):
    ticker = infoDf.loc[k, 'tickers']
    name = infoDf.loc[k, 'names']
    corp_code = infoDf.loc[k, 'corp_codes']

    return ticker, name, corp_code

In [9]:
def getParamsDic(apiKey, corp_code, pblntf_ty, start, end) :
    paramsDic = {
        'crtfc_key' : apiKey,
        'corp_code' : corp_code,
        'bgn_de' : start,
        'end_de' : end,
        'pblntf_ty': pblntf_ty,
        'last_reprt_at' : 'N',
        'page_count' : 100
    }
    return paramsDic

In [10]:
def rceptNoDf(apiKey, infoDf, start, end) :

    url = 'https://opendart.fss.or.kr/api/list.json'

    cols = ['corp_code', 'corp_name', 'stock_code', 'corp_cls', 'report_nm', 'rcept_no', 'flr_nm', 'rcept_dt', 'rm']
    rceptNoDf = pd.DataFrame(columns=cols)
    failsDic = {'ticker':[], 'name':[], 'corp_code':[]}
    status_codes = []

    for k in tqdm(range(len(infoDf))) :
        ticker, name, corp_code = getTickerNameCorpcode(k, infoDf)
        A_paramsDic = getParamsDic(apiKey, corp_code, 'A', start, end)
        r = requests.get(url, A_paramsDic)
        status = r.json()['status']
        status_codes.append(status)

        if status == '000' :
            rceptNoDf = rceptNoDf.append(pd.DataFrame(r.json()['list']))
            
        elif status == '013' :
            F_paramsDic = getParamsDic(apiKey, corp_code, 'F', start, end)
            r = requests.get(url, F_paramsDic)
            status = r.json()['status']

            if status == '000' : 
                rceptNoDf = rceptNoDf.append(pd.DataFrame(r.json()['list']))

            else :
                failsDic['ticker'].append(ticker)
                failsDic['name'].append(name)
                failsDic['corp_code'].append(corp_code)
                
        else :
            failsDic['ticker'].append(ticker)
            failsDic['name'].append(name)
            failsDic['corp_code'].append(corp_code)

    return rceptNoDf, status_codes, failsDic

In [11]:
apiKey = '92c176817e681dcc4ad263eb3fa5182792b0b7a3'
folderPath = Path.home().joinpath('Desktop','dataBackUp(211027)')
infoDf, failCorpCodeList = createCorpCodesDf(folderPath)
rceptNoDf, status_codes, failsDic = rceptNoDf(apiKey, infoDf, '20100101', '20211106')

100%|██████████| 1027/1027 [00:00<00:00, 3433.19it/s]
100%|██████████| 1027/1027 [00:04<00:00, 232.31it/s]
100%|██████████| 1027/1027 [01:41<00:00, 10.10it/s]


In [18]:
# display(pd.DataFrame(fails))
print(len(pd.DataFrame(failsDic)))
print(len(infoDf))
print(len(rceptNoDf.corp_code.unique().tolist()))
con = rceptNoDf.report_nm.str.contains('감사')
rceptNoDf.loc[con]
pd.DataFrame(failsDic)
con = infoDf.corp_codes.isnull()
display(infoDf.loc[con])
failCorpCodeList


35
1027
992


Unnamed: 0,tickers,names,corp_codes
30,071955,코아스웰1우B,
106,003945,삼양제넥우,
133,004149,동방전환상환3우B,
134,004147,동방전환상환2우B,
157,00341D,쌍용양회5우B,
158,00341A,쌍용양회4우B,
159,003419,쌍용양회3우B,
160,003417,쌍용양회2우B,
162,003415,쌍용양회우,
243,000997,동부하이텍2우B,


[]

In [19]:
addDf = rceptNoDf.report_nm.str.extract(r'\[?(\w*)\]?(사업보고서|반기보고서|분기보고서|감사보고서|연결감사보고서).*\((\d{4}\.\d{2})\)', expand=True).rename(columns={0:'add_info', 1:'kind', 2:'date'})
addedReportInfoDf = pd.concat([rceptNoDf, addDf], axis=1)
addedReportInfoDf.kind.unique().tolist()
dropaddedReportInfoDf = addedReportInfoDf.dropna()  # 사업보고서, 반기보고서, 분기보고서, 감사보고서, 연결감사보거서 이외의 보고서는 삭제(필요없음)
con = dropaddedReportInfoDf.loc[:, 'kind'].isnull()
dropaddedReportInfoDf.loc[con]
dropaddedReportInfoDf

Unnamed: 0,corp_code,corp_name,stock_code,corp_cls,report_nm,rcept_no,flr_nm,rcept_dt,rm,add_info,kind,date
0,00119195,동화약품,000020,Y,반기보고서 (2021.06),20210817000691,동화약품,20210817,,,반기보고서,2021.06
1,00119195,동화약품,000020,Y,분기보고서 (2021.03),20210514000833,동화약품,20210514,,,분기보고서,2021.03
2,00119195,동화약품,000020,Y,사업보고서 (2020.12),20210317000578,동화약품,20210317,연,,사업보고서,2020.12
3,00119195,동화약품,000020,Y,분기보고서 (2020.09),20201113000729,동화약품,20201113,,,분기보고서,2020.09
4,00119195,동화약품,000020,Y,반기보고서 (2020.06),20200814000382,동화약품,20200814,,,반기보고서,2020.06
...,...,...,...,...,...,...,...,...,...,...,...,...
1,01390344,현대중공업,329180,Y,분기보고서 (2021.03),20210517001476,현대중공업,20210517,,,분기보고서,2021.03
2,01390344,현대중공업,329180,Y,사업보고서 (2020.12),20210401001108,현대중공업,20210401,연,,사업보고서,2020.12
0,01323032,케이카,381970,Y,감사보고서 (2020.12),20210401000915,삼정회계법인,20210401,,,감사보고서,2020.12
1,01323032,케이카,381970,Y,감사보고서 (2019.12),20200407001065,삼정회계법인,20200407,,,감사보고서,2019.12


In [20]:
def getString(reprt_code) : 
    
    url = f'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={reprt_code}'
    r = requests.get(url)
    if r.status_code == 200 :
        string = r.text 
    return string

In [21]:
def get_regex_search_node1(key, scriipt_tag) :
    return re.findall(r'node1\[\'' + key + '\'\]\s*=\s*(".*?")', scriipt_tag)

In [22]:
def get_regex_search_node2(key, scriipt_tag) :
    return re.findall(r'node2\[\'' + key + '\'\]\s*=\s*(".*?")', scriipt_tag)

In [23]:

def getMainReportDf(report_code) :
        string = getString(report_code)

        dic = {'text' : get_regex_search_node2('text', string),
                'id' : get_regex_search_node2('id', string),
                'rcpNo' : get_regex_search_node2('rcpNo', string), 
                'dcmNo' : get_regex_search_node2('dcmNo', string), 
                'eleId' : get_regex_search_node2('eleId', string), 
                'offset' : get_regex_search_node2('offset', string),
                'length' : get_regex_search_node2('length', string),
                'dtd' : get_regex_search_node2('dtd', string),
                'tocNo' : get_regex_search_node2('tocNo', string),
        }

        reportDf = pd.DataFrame(dic)
        reportDf = reportDf.applymap(lambda x: x.strip('"'))         
        return reportDf

In [24]:
def getConsolidatedDic(df) :
    con = df.text.str.contains('.*연결재무제표$')
    dic = df.loc[con].to_dict('records')[0]
    return dic

In [25]:
infoLst = dropaddedReportInfoDf.sample(1).to_dict('records')
for dic in infoLst :
    ticker = dic['stock_code']
    reprt_code = dic['rcept_no']
    date = dic['date']
    print(ticker, date, reprt_code, sep=', ')

005980, 2012.12, 20130430000560


In [120]:


dic = dropaddedReportInfoDf.sample(1).to_dict('records')[0]
ticker = dic['stock_code']
reprt_code = dic['rcept_no']
date = dic['date']
print(ticker, date, reprt_code, sep=', ')

ticker = '016360'  # 000020
con = dropaddedReportInfoDf.stock_code == ticker
# rcept_noLst = dropaddedReportInfoDf.loc[con].rcept_no.unique().tolist()
dropaddedReportInfoDf.loc[con]
reprt_code = '20201113000758' # 20150331002176  20141114001023      

url = f'http://dart.fss.or.kr/dsaf001/main.do?rcpNo={reprt_code}'
r = requests.get(url)
string = r.text 
dic = {'text' : get_regex_search_node2('text', string),
                'id' : get_regex_search_node2('id', string),
                'rcpNo' : get_regex_search_node2('rcpNo', string), 
                'dcmNo' : get_regex_search_node2('dcmNo', string), 
                'eleId' : get_regex_search_node2('eleId', string), 
                'offset' : get_regex_search_node2('offset', string),
                'length' : get_regex_search_node2('length', string),
                'dtd' : get_regex_search_node2('dtd', string),
                'tocNo' : get_regex_search_node2('tocNo', string),
        }
reportDf = pd.DataFrame(dic)
reportDf = reportDf.applymap(lambda x: x.strip('"'))
display(reportDf)
con = reportDf.text.str.contains('.*연결재무제표$')
dic = reportDf.loc[con].to_dict('records')
if dic :
        dic = dic[0]
        print(dic)
else:
        dic = {'text' : get_regex_search_node1('text', string),
                'id' : get_regex_search_node1('id', string),
                'rcpNo' : get_regex_search_node1('rcpNo', string), 
                'dcmNo' : get_regex_search_node1('dcmNo', string), 
                'eleId' : get_regex_search_node1('eleId', string), 
                'offset' : get_regex_search_node1('offset', string),
                'length' : get_regex_search_node1('length', string),
                'dtd' : get_regex_search_node1('dtd', string),
                'tocNo' : get_regex_search_node1('tocNo', string),
        }
        reportDf = pd.DataFrame(dic)
        reportDf = reportDf.applymap(lambda x: x.strip('"'))
        display(reportDf)
        con = reportDf.text.str.contains('.*재무제표.*$')
        dic = reportDf.loc[con].to_dict('records')
        dic = dic[0]
        print(dic)

url = f'http://dart.fss.or.kr/report/viewer.do?'
r = requests.get(url, params=dic)
html = r.text
soup = BeautifulSoup(html, 'html.parser')
table = soup.select('table')
if table :
        table = pd.read_html(html)
        length = len(table)
        for k in range(0, length):
                title_candi = table[k]
                print(title_candi)
                if isinstance(title_candi, str) :
                        lst = re.findall('.*재무상태표$|.*손익계산서$|.*현금흐름표$', title_candi)
                        print(lst)
                if lst :
                        print(k, title_candi, sep=':')
                        display(table[k+1])
else : 
        con = reportDf.text.str.contains('\d{1}\.\s?재무제표$')
        dic = reportDf.loc[con].to_dict('records')[0]
        print(dic)

        url = f'http://dart.fss.or.kr/report/viewer.do?'
        r = requests.get(url, params=dic)
        html = r.text
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.select('table')
        if table :
                table = pd.read_html(html)
                length = len(table)
                for k in range(0, length):
                        title_candi = table[k].iat[0,0]
                        if isinstance(title_candi, str) :
                                lst = re.findall('.*재무상태표$|.*손익계산서$|.*현금흐름표$', title_candi)
                        if lst :
                                print(k, title_candi, sep=':')
                                display(table[k+1])

007630, 2016.12, 20170814001657


Unnamed: 0,text,id,rcpNo,dcmNo,eleId,offset,length,dtd,tocNo
0,1. 회사의 개요,4,20201113000758,7544903,4,14414,114934,dart3.xsd,4
1,2. 회사의 연혁,5,20201113000758,7544903,5,129352,21281,dart3.xsd,5
2,3. 자본금 변동사항,6,20201113000758,7544903,6,150637,4112,dart3.xsd,6
3,4. 주식의 총수 등,7,20201113000758,7544903,7,154753,7626,dart3.xsd,7
4,5. 의결권 현황,8,20201113000758,7544903,8,162383,5969,dart3.xsd,8
5,6. 배당에 관한 사항 등,9,20201113000758,7544903,9,168356,8995,dart3.xsd,9
6,1. 요약재무정보,12,20201113000758,7544903,12,751748,40844,dart3.xsd,12
7,2. 연결재무제표,13,20201113000758,7544903,13,792799,96035,dart3.xsd,13
8,3. 연결재무제표 주석,14,20201113000758,7544903,14,888838,718811,dart3.xsd,14
9,4. 재무제표,15,20201113000758,7544903,15,1607653,81212,dart3.xsd,15


{'text': '2. 연결재무제표', 'id': '13', 'rcpNo': '20201113000758', 'dcmNo': '7544903', 'eleId': '13', 'offset': '792799', 'length': '96035', 'dtd': 'dart3.xsd', 'tocNo': '13'}
                          0
0  제39기 3분기 2020년 9월 30일 현재
1     제38기 2019년 12월 31일 현재
2     제37기 2018년 12월 31일 현재
                   0         1
0  삼성증권 주식회사와 그 종속기업  (단위 : 원)
                                                  과 목          주석  \
0                                                 자 산         NaN   
1                                         I. 현금및현금성자산       536.0   
2                                 II. 당기손익-공정가치측정금융자산   6333536.0   
3                              III. 기타포괄손익-공정가치측정금융자산   7333536.0   
4                                     IV. 상각후원가측정금융자산   8333536.0   
5                                         V. 관계기업투자지분         9.0   
6                                            VI. 유형자산        10.0   
7                                           VII. 무형자산        11.0   
8                                  

In [33]:
stockList = pd.read_parquet(folderPath/'stockListDB.parquet')
con = stockList.loc[:, 'name'] == '동화약품'
stockList.loc[con]

Unnamed: 0,ticker,name,firstDay,endDay
328,20,동화약품,2010-01-01,2021-10-26


In [29]:
tickerLst = dropaddedReportInfoDf.stock_code.unique().tolist()
tickerLst = tickerLst[:5]

dics = {}
for ticker in tqdm(tickerLst) :
    time.sleep(1)
    con = dropaddedReportInfoDf.stock_code == ticker
    rcept_noLst = dropaddedReportInfoDf.loc[con].rcept_no.unique().tolist()
    for rcept_no in tqdm(rcept_noLst) :
        mainReportDf = getMainReportDf(rcept_no)
        dics[ticker] = getConsolidatedDic(mainReportDf)

 60%|██████    | 30/50 [00:03<00:02,  9.81it/s]
  0%|          | 0/5 [00:04<?, ?it/s]


IndexError: list index out of range

In [90]:
dics
consolidatedDf = pd.DataFrame(data = dics.values(), index=dics.keys())

In [105]:
consolidatedDf.head(5)

Unnamed: 0,text,id,rcpNo,dcmNo,eleId,offset,length,dtd,tocNo
20,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14
78420,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14
2840,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14
78930,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14
2820,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14


In [104]:
tickerLst = list(dics.keys())
dic = dics[tickerLst[0]]
params={
    'rcpNo': dic['rcpNo'],
    'dcmNo': dic['dcmNo'],
    'eleId': dic['eleId'],
    'offset': dic['offset'],
    'length': dic['length'],
    'dtd': dic['dtd']
    }

url = f'https://dart.fss.or.kr/report/viewer.do?'
r = requests.get(url, params=params)
html = r.text

table = pd.read_html(html)
length = len(table)
for k in range(0, length):
    title_candi = table[k].iat[0,0]
    lst = re.findall('.*재무상태표|손익계산서|현금흐름표', title_candi)
    if lst :
        print(k, title_candi, sep=':')
        display(table[k+1])

0:연결 재무상태표


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,자산,,,
1,유동자산,198215579,181385260,174697424
2,현금및현금성자산,29382578,26885999,30340505
3,단기금융상품,92441703,76252052,65893797
4,단기상각후원가금융자산,2757111,3914216,2703693
5,단기당기손익-공정가치금융자산,71451,1727436,2001948
6,매출채권,30965058,35131343,33867733
7,미수금,3604539,4179120,3080733
8,선급비용,2266100,2406220,4136167
9,재고자산,32043145,26766464,28984704


3:연결 손익계산서


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,수익(매출액),236806988.0,230400881.0,243771415.0
1,매출원가,144488296.0,147239549.0,132394411.0
2,매출총이익,92318692.0,83161332.0,111377004.0
3,판매비와관리비,56324816.0,55392823.0,52490335.0
4,영업이익,35993876.0,27768509.0,58886669.0
5,기타수익,1384068.0,1778666.0,1485037.0
6,기타비용,2488902.0,1414707.0,1142018.0
7,지분법이익,506530.0,412960.0,539845.0
8,금융수익,12267600.0,10161632.0,9999321.0
9,금융비용,11318055.0,8274871.0,8608896.0


6:연결 포괄손익계산서


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,당기순이익(손실),26407832,21738865,44344857
1,기타포괄손익,"(3,673,905)",3016194,"(12,242)"
2,후속적으로 당기손익으로 재분류되지 않는 포괄손익,1788764,"(50,765)","(656,647)"
3,기타포괄손익-공정가치금융자산평가손익,2502733,1146599,"(235,865)"
4,관계기업 및 공동기업의 기타포괄손익에 대한 지분,5591,"(16,896)","(10,631)"
5,순확정급여부채(자산) 재측정요소,"(719,560)","(1,180,468)","(410,151)"
6,후속적으로 당기손익으로 재분류되는 포괄손익,"(5,462,669)",3066959,644405
7,관계기업 및 공동기업의 기타포괄손익에 대한 지분,"(48,888)",48649,6688
8,해외사업장환산외환차이,"(5,380,375)",3016499,590638
9,현금흐름위험회피파생상품평가손익,"(33,406)",1811,47079


12:연결 현금흐름표


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,영업활동 현금흐름,65287009,45382915,67031863
1,영업에서 창출된 현금흐름,68148810,56635791,78025064
2,당기순이익,26407832,21738865,44344857
3,조정,41618554,37442682,43604573
4,영업활동으로 인한 자산부채의 변동,122424,"(2,545,756)","(9,924,366)"
5,이자의 수취,2220209,2306401,1788520
6,이자의 지급,"(555,321)","(579,979)","(548,272)"
7,배당금 수입,243666,241801,215992
8,법인세 납부액,"(4,770,355)","(13,221,099)","(12,449,441)"
9,투자활동 현금흐름,"(53,628,591)","(39,948,171)","(52,240,453)"


In [68]:
url = '	https://opendart.fss.or.kr/api/corpCode.xml'
params = {'crtfc_key' : '92c176817e681dcc4ad263eb3fa5182792b0b7a3'}

r = requests.get(url, params)

path = Path.home() / 'Desktop'

with open(path / 'dartDownload.zip', 'wb') as f :
    f.write(r.content)

In [54]:
def checkOrder(current_m, before_m):
    dic = {'03':'12', '06':'03', '09':'06', '12':'09'}

    if dic[current_m] == before_m:
        return 'correct'
    else :
        return 'wrong'

In [56]:
dfs = []
fails = []
for k in range(len(infoDf)) :
    ticker = infoDf.loc[k].tickers
    print(ticker)
    name = infoDf.loc[k].names
    corp_code = infoDf.loc[k].corp_codes
    try:
        reportInfoDf = dart.list(ticker, start='2010-01-01', end='2021-11-01', kind='A', final=False)
        addDf = reportInfoDf.report_nm.str.extract(r'\[?(\w*)\]?(사업보고서|반기보고서|분기보고서).*\((\d{4}\.\d{2})\)', expand=True).rename(columns={0:'add_info', 1:'kind', 2:'date'})
        addedReportInfoDf = pd.concat([reportInfoDf, addDf], axis=1)
        addedReportInfoDf = addedReportInfoDf[['corp_code', 'corp_name', 'stock_code','rcept_dt', 'add_info', 'kind', 'date', 'rcept_no']]
        con = addedReportInfoDf.loc[:, 'add_info'] == '기재정정'
        checkDf = addedReportInfoDf.loc[~con].sort_values(by='rcept_dt').reset_index(drop=True)
        checkDf = checkDf.assign(current_m = checkDf.date.str.extract(r'\d{4}.(\d{2})'))
        checkDf = checkDf.assign(before_m = checkDf.current_m.shift(1))
        checkDf['test_result'] = checkDf.apply(lambda row : 'correct' if row.name==0 else checkOrder(row['current_m'], row['before_m']), axis=1, result_type='expand')
        dfs.append(checkDf)
    except :
        fails.append(k)
    # try:
    #     reportInfoDf = dart.list(ticker, start='2010-01-01', end='2021-11-01', kind='A', final=False)
    #     addDf = reportInfoDf.report_nm.str.extract(r'\[?(\w*)\]?(사업보고서|반기보고서|분기보고서).*\((\d{4}\.\d{2})\)', expand=True).rename(columns={0:'add_info', 1:'kind', 2:'date'})
    #     addedReportInfoDf = pd.concat([reportInfoDf, addDf], axis=1)
    #     addedReportInfoDf = addedReportInfoDf[['corp_code', 'corp_name', 'stock_code','rcept_dt', 'add_info', 'kind', 'date', 'rcept_no']]
    #     con = addedReportInfoDf.loc[:, 'add_info'] == '기재정정'
    #     checkDf = addedReportInfoDf.loc[~con].sort_values(by='rcept_dt').reset_index(drop=True)
    #     checkDf = checkDf.assign(current_m = checkDf.date.str.extract(r'\d{4}.(\d{2})'))
    #     checkDf = checkDf.assign(before_m = checkDf.current_m.shift(1))
    #     checkDf['test_result'] = checkDf.apply(lambda row : 'correct' if row.name==0 else checkOrder(row['current_m'], row['before_m']), axis=1, result_type='expand')
    #     dfs.append(checkDf)
    # except :
    #     try :
    #         reportInfoDf = dart.list(name, start='2010-01-01', end='2021-11-01', kind='A', final=False)
    #         addDf = reportInfoDf.report_nm.str.extract(r'\[?(\w*)\]?(사업보고서|반기보고서|분기보고서).*\((\d{4}\.\d{2})\)', expand=True).rename(columns={0:'add_info', 1:'kind', 2:'date'})
    #         addedReportInfoDf = pd.concat([reportInfoDf, addDf], axis=1)
    #         addedReportInfoDf = addedReportInfoDf[['corp_code', 'corp_name', 'stock_code','rcept_dt', 'add_info', 'kind', 'date', 'rcept_no']]
    #         con = addedReportInfoDf.loc[:, 'add_info'] == '기재정정'
    #         checkDf = addedReportInfoDf.loc[~con].sort_values(by='rcept_dt').reset_index(drop=True)
    #         checkDf = checkDf.assign(current_m = checkDf.date.str.extract(r'\d{4}.(\d{2})'))
    #         checkDf = checkDf.assign(before_m = checkDf.current_m.shift(1))
    #         checkDf['test_result'] = checkDf.apply(lambda row : 'correct' if row.name==0 else checkOrder(row['current_m'], row['before_m']), axis=1, result_type='expand')
    #         dfs.append(checkDf)
    #     except :
    #         fails.append(k)
total_result = pd.concat(dfs)

000020
078420
002840
078930
002820
002810
079160
079430
002790
002870
079660
080030
002780
080180
080410
002760
080960
080970
002720
080980
002710
078000
002880
077970
003090
071090
003080
003070
071950
003060
071955
003030
071970
003010
072130
003000
072710
073240
002990
074610
002960
075180
002920
077500
002900
081000
002700
081190
081200
083380
002320
083390
002310
083420
002300
083570
002270
002250
083580
002240
083590
002220
083600
002210
083610
002200
083620
002170
084010
084160
{"status":"013","message":"조회된 데이타가 없습니다."}
002140
084240
084670
002100
002350
003120
083370
083360
002630
081210
002620
081930
002600
002550
002540
002530
081940
002460
082110
082240
002450
082250
002420
082260
002410
082640
002390
002380
083120
{"status":"013","message":"조회된 데이타가 없습니다."}
083350
002360
071050
003160
069960
047050
003960
003945
003940
047400
049770
003920
003850
049800
003830
051310
003780
051600
003720
051630
003690
051900
003680
003650
003640
051910
003620
003610
004000
004020
004200
00

In [58]:
len(fails)
infoDf.loc[fails]

Unnamed: 0,tickers,names,corp_codes
30,071955,코아스웰1우B,
55,083570,아시아10호,00533580
62,083600,아시아13호,00533757
66,083620,아시아15호,00533678
69,084160,골든경매일호,00540784
...,...,...,...
1021,139990,아주스틸,00486705
1022,377190,디앤디플랫폼리츠,01475609
1023,271940,일진하이솔루스,00972503
1024,395400,SK리츠,01535150


In [74]:
dart.list('케이카', start='2010-01-01', end='2021-11-01', kind='F', final=False)
# dart.find_corp_code('아주스틸')

Unnamed: 0,corp_code,corp_name,stock_code,corp_cls,report_nm,rcept_no,flr_nm,rcept_dt,rm
0,1323032,케이카,381970,Y,감사보고서 (2020.12),20210401000915,삼정회계법인,20210401,
1,1323032,케이카,381970,Y,감사보고서 (2019.12),20200407001065,삼정회계법인,20200407,
2,1323032,케이카,381970,Y,감사보고서 (2018.12),20190401001715,삼정회계법인,20190401,


In [78]:
# 삼성전자 1999년~2019년 모든 정기보고서(최종보고서) (82건)
dart.list('083570', start='2010-01-01', end='2021-11-01', kind='A', final=False)

Unnamed: 0,corp_code,corp_name,stock_code,corp_cls,report_nm,rcept_no,flr_nm,rcept_dt,rm
0,533580,아시아퍼시픽10호선박투자회사,83570,E,분기보고서 (2017.02),20170414001888,아시아퍼시픽10호선박투자회사,20170414,
1,533580,아시아퍼시픽10호선박투자회사,83570,E,반기보고서 (2016.11),20170113000530,아시아퍼시픽10호선박투자회사,20170113,
2,533580,아시아퍼시픽10호선박투자회사,83570,E,분기보고서 (2016.08),20161017000200,아시아퍼시픽10호선박투자회사,20161017,
3,533580,아시아퍼시픽10호선박투자회사,83570,E,[기재정정]사업보고서 (2016.05),20160831000917,아시아퍼시픽10호선박투자회사,20160831,
4,533580,아시아퍼시픽10호선박투자회사,83570,E,사업보고서 (2016.05),20160829000305,아시아퍼시픽10호선박투자회사,20160829,정
5,533580,아시아퍼시픽10호선박투자회사,83570,E,분기보고서 (2016.02),20160412001787,아시아퍼시픽10호선박투자회사,20160412,
6,533580,아시아퍼시픽10호선박투자회사,83570,E,반기보고서 (2015.11),20160114000260,아시아퍼시픽10호선박투자회사,20160114,
7,533580,아시아퍼시픽10호선박투자회사,83570,E,분기보고서 (2015.08),20151015000225,아시아퍼시픽10호선박투자회사,20151015,
8,533580,아시아퍼시픽10호선박투자회사,83570,E,사업보고서 (2015.05),20150831000651,아시아퍼시픽10호선박투자회사,20150831,
9,533580,아시아퍼시픽10호선박투자회사,83570,E,분기보고서 (2015.02),20150414001845,아시아퍼시픽10호선박투자회사,20150414,


In [88]:
addDf = reportInfoDf.report_nm.str.extract(r'\[?(\w*)\]?(사업보고서|반기보고서|분기보고서).*\((\d{4}\.\d{2})\)', expand=True).rename(columns={0:'add_info', 1:'kind', 2:'date'})
addedReportInfoDf = pd.concat([reportInfoDf, addDf], axis=1)
addedReportInfoDf = addedReportInfoDf[['corp_code', 'corp_name', 'stock_code','rcept_dt', 'add_info', 'kind', 'date', 'rcept_no']]
addedReportInfoDf

Unnamed: 0,corp_code,corp_name,stock_code,rcept_dt,add_info,kind,date,rcept_no
0,126380,삼성전자,5930,20210817,,반기보고서,2021.06,20210817001416
1,126380,삼성전자,5930,20210517,,분기보고서,2021.03,20210517001185
2,126380,삼성전자,5930,20210309,,사업보고서,2020.12,20210309000744
3,126380,삼성전자,5930,20201116,,분기보고서,2020.09,20201116001248
4,126380,삼성전자,5930,20200814,,반기보고서,2020.06,20200814001766
5,126380,삼성전자,5930,20200515,,분기보고서,2020.03,20200515001451
6,126380,삼성전자,5930,20200330,,사업보고서,2019.12,20200330003851
7,126380,삼성전자,5930,20191114,,분기보고서,2019.09,20191114001273
8,126380,삼성전자,5930,20190814,,반기보고서,2019.06,20190814002218
9,126380,삼성전자,5930,20190515,,분기보고서,2019.03,20190515001605


In [101]:
def checkOrder(current_m, before_m):
    dic = {'03':'12', '06':'03', '09':'06', '12':'09'}

    if dic[current_m] == before_m:
        return 'correct'
    else :
        return 'wrong'

In [113]:
con = addedReportInfoDf.loc[:, 'add_info'] == '기재정정'
checkDf = addedReportInfoDf.loc[~con].sort_values(by='rcept_dt').reset_index(drop=True)
checkDf = checkDf.assign(current_m = checkDf.date.str.extract(r'\d{4}.(\d{2})'))
checkDf = checkDf.assign(before_m = checkDf.current_m.shift(1))
checkDf['test_result'] = checkDf.apply(lambda row : 'correct' if row.name==0 else checkOrder(row['current_m'], row['before_m']), axis=1, result_type='expand')
checkDf

Unnamed: 0,corp_code,corp_name,stock_code,rcept_dt,add_info,kind,date,rcept_no,current_m,before_m,test_result
0,126380,삼성전자,5930,20100331,첨부추가,사업보고서,2009.12,20100331001680,12,,correct
1,126380,삼성전자,5930,20100531,,분기보고서,2010.03,20100531001169,3,12.0,correct
2,126380,삼성전자,5930,20100830,,반기보고서,2010.06,20100830000360,6,3.0,correct
3,126380,삼성전자,5930,20101129,,분기보고서,2010.09,20101129000419,9,6.0,correct
4,126380,삼성전자,5930,20110331,첨부추가,사업보고서,2010.12,20110331002193,12,9.0,correct
5,126380,삼성전자,5930,20110530,,분기보고서,2011.03,20110530000628,3,12.0,correct
6,126380,삼성전자,5930,20110829,,반기보고서,2011.06,20110829000539,6,3.0,correct
7,126380,삼성전자,5930,20111129,,분기보고서,2011.09,20111129000501,9,6.0,correct
8,126380,삼성전자,5930,20120330,첨부추가,사업보고서,2011.12,20120330002110,12,9.0,correct
9,126380,삼성전자,5930,20120515,,분기보고서,2012.03,20120515001281,3,12.0,correct


In [116]:
reprt_code = '20210309000744'
url = f'https://dart.fss.or.kr/dsaf001/main.do?rcpNo={reprt_code}'
r = requests.get(url)
if r.status_code == 200 :
    string = r.text
else :
    print('''it's fail''')

In [117]:
def get_regex_search_node2(key, scriipt_tag) :
    return re.findall(r'node2\[\'' + key + '\'\]\s*=\s*(".*?")', scriipt_tag)

In [118]:
text_list = get_regex_search_node2('text', string)
id_list = get_regex_search_node2('id', string)
rcpNo_list = get_regex_search_node2('rcpNo', string)
dcmNo_list = get_regex_search_node2('dcmNo', string)
eleId_list = get_regex_search_node2('eleId', string)
offset_list = get_regex_search_node2('offset', string)
length_list = get_regex_search_node2('length', string)
dtd_list = get_regex_search_node2('dtd', string)
tocNo_list = get_regex_search_node2('tocNo', string)


print(len(text_list))
print(len(id_list))
print(len(rcpNo_list))
print(len(dcmNo_list))
print(len(eleId_list))
print(len(offset_list))
print(len(length_list))
print(len(dtd_list))
print(len(tocNo_list))


20
20
20
20
20
20
20
20
20


In [119]:
dic = {'text' : text_list,
        'id' : id_list,
        'rcpNo' : rcpNo_list, 
        'dcmNo' : dcmNo_list, 
        'eleId' : eleId_list, 
        'offset' : offset_list,
        'length' : length_list,
        'dtd' : dtd_list,
        'tocNo' : tocNo_list,
}

df = pd.DataFrame(dic)

In [120]:
df = df.applymap(lambda x: x.strip('"'))
df

Unnamed: 0,text,id,rcpNo,dcmNo,eleId,offset,length,dtd,tocNo
0,1. 회사의 개요,4,20210309000744,7861003,4,13800,241107,dart3.xsd,4
1,2. 회사의 연혁,5,20210309000744,7861003,5,254911,18093,dart3.xsd,5
2,3. 자본금 변동사항,6,20210309000744,7861003,6,273008,3576,dart3.xsd,6
3,4. 주식의 총수 등,7,20210309000744,7861003,7,276588,62302,dart3.xsd,7
4,5. 의결권 현황,8,20210309000744,7861003,8,338894,7901,dart3.xsd,8
5,6. 배당에 관한 사항 등,9,20210309000744,7861003,9,346799,18765,dart3.xsd,9
6,7. 정관에 관한 사항,10,20210309000744,7861003,10,365568,2291,dart3.xsd,10
7,1. 요약재무정보,13,20210309000744,7861003,13,664835,22260,dart3.xsd,13
8,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14
9,3. 연결재무제표 주석,15,20210309000744,7861003,15,803094,555908,dart3.xsd,15


In [121]:
con = df.text.str.contains('.*연결재무제표$')
display(df.loc[con])
dic = df.loc[con].to_dict('records')[0]
dic

Unnamed: 0,text,id,rcpNo,dcmNo,eleId,offset,length,dtd,tocNo
8,2. 연결재무제표,14,20210309000744,7861003,14,687340,115750,dart3.xsd,14


{'text': '2. 연결재무제표',
 'id': '14',
 'rcpNo': '20210309000744',
 'dcmNo': '7861003',
 'eleId': '14',
 'offset': '687340',
 'length': '115750',
 'dtd': 'dart3.xsd',
 'tocNo': '14'}

In [149]:
params={
    'rcpNo': dic['rcpNo'],
    'dcmNo': dic['dcmNo'],
    'eleId': dic['eleId'],
    'offset': dic['offset'],
    'length': dic['length'],
    'dtd': dic['dtd']
    }

url = f'https://dart.fss.or.kr/report/viewer.do?'
r = requests.get(url, params=params)
html = r.text

table = pd.read_html(html)
length = len(table)
for k in range(0, length):
    title_candi = table[k].iat[0,0]
    lst = re.findall('.*재무상태표|손익계산서|현금흐름표', title_candi)
    if lst :
        print(k, title_candi, sep=':')
        display(table[k+1])

0:연결 재무상태표


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,자산,,,
1,유동자산,198215579,181385260,174697424
2,현금및현금성자산,29382578,26885999,30340505
3,단기금융상품,92441703,76252052,65893797
4,단기상각후원가금융자산,2757111,3914216,2703693
5,단기당기손익-공정가치금융자산,71451,1727436,2001948
6,매출채권,30965058,35131343,33867733
7,미수금,3604539,4179120,3080733
8,선급비용,2266100,2406220,4136167
9,재고자산,32043145,26766464,28984704


3:연결 손익계산서


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,수익(매출액),236806988.0,230400881.0,243771415.0
1,매출원가,144488296.0,147239549.0,132394411.0
2,매출총이익,92318692.0,83161332.0,111377004.0
3,판매비와관리비,56324816.0,55392823.0,52490335.0
4,영업이익,35993876.0,27768509.0,58886669.0
5,기타수익,1384068.0,1778666.0,1485037.0
6,기타비용,2488902.0,1414707.0,1142018.0
7,지분법이익,506530.0,412960.0,539845.0
8,금융수익,12267600.0,10161632.0,9999321.0
9,금융비용,11318055.0,8274871.0,8608896.0


6:연결 포괄손익계산서


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,당기순이익(손실),26407832,21738865,44344857
1,기타포괄손익,"(3,673,905)",3016194,"(12,242)"
2,후속적으로 당기손익으로 재분류되지 않는 포괄손익,1788764,"(50,765)","(656,647)"
3,기타포괄손익-공정가치금융자산평가손익,2502733,1146599,"(235,865)"
4,관계기업 및 공동기업의 기타포괄손익에 대한 지분,5591,"(16,896)","(10,631)"
5,순확정급여부채(자산) 재측정요소,"(719,560)","(1,180,468)","(410,151)"
6,후속적으로 당기손익으로 재분류되는 포괄손익,"(5,462,669)",3066959,644405
7,관계기업 및 공동기업의 기타포괄손익에 대한 지분,"(48,888)",48649,6688
8,해외사업장환산외환차이,"(5,380,375)",3016499,590638
9,현금흐름위험회피파생상품평가손익,"(33,406)",1811,47079


12:연결 현금흐름표


Unnamed: 0.1,Unnamed: 0,제 52 기,제 51 기,제 50 기
0,영업활동 현금흐름,65287009,45382915,67031863
1,영업에서 창출된 현금흐름,68148810,56635791,78025064
2,당기순이익,26407832,21738865,44344857
3,조정,41618554,37442682,43604573
4,영업활동으로 인한 자산부채의 변동,122424,"(2,545,756)","(9,924,366)"
5,이자의 수취,2220209,2306401,1788520
6,이자의 지급,"(555,321)","(579,979)","(548,272)"
7,배당금 수입,243666,241801,215992
8,법인세 납부액,"(4,770,355)","(13,221,099)","(12,449,441)"
9,투자활동 현금흐름,"(53,628,591)","(39,948,171)","(52,240,453)"


In [59]:
qeury = QueryStockDataDB.QueryStockData(folderPath)
nDf = qeury.queryStockNumberOfSharesDB(by='parquet')

In [62]:
con = nDf.name.str.contains(r'\d+호')
lst = nDf.loc[con].name.unique().tolist()
lst

['동북아1호',
 '동북아2호',
 '아시아1호',
 '동북아6호',
 '동북아3호',
 '동북아4호',
 '동북아5호',
 '아시아2호',
 '아시아3호',
 '동북아13호',
 '동북아14호',
 '아시아10호',
 '아시아11호',
 '아시아12호',
 '아시아13호',
 '아시아14호',
 '아시아15호',
 '동북아15호',
 '동북아12호',
 '동북아11호',
 '아시아4호',
 '아시아8호',
 '아시아9호',
 '동북아8호',
 '아시아5호',
 '아시아6호',
 '아시아7호',
 '동북아9호',
 '동북아10호',
 '거북선4호',
 '거북선5호',
 '거북선6호',
 '케이알제2호',
 '거북선3호',
 '거북선2호',
 '코크렙8호',
 '코리아01호',
 '코리아02호',
 '코리아03호',
 '코리아04호',
 '코크렙7호',
 '동북아21호',
 '코리아07호',
 '하나니켈1호',
 '하나니켈2호',
 '바다로3호',
 '코리아05호',
 '코리아06호',
 '코크렙15호',
 '우리스팩1호',
 '거북선7호',
 '하이골드2호',
 '트러스제7호',
 '하이골드3호',
 '바다로19호',
 '하이골드8호',
 '하이골드12호',
 '엔에이치스팩19호']