In [2]:
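# modules and functions
# 1. Load information for all companies -> pilot data: keep only the companies that have a stock code (parse2)
# 2. Load each company's recent disclosures -> get the latest document number (parse1)
# 3. Load the latest document -> parse it (parse2)
# Modules

# request module. argument(url, params). Note: JSON and XML responses are handled differently: JSON is decoded as JSON, while XML must first be converted to an io byte stream.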


import io
import os
import pickle
import time
import xml.etree.ElementTree as ET
import zipfile

import lxml
import requests
import tqdm.notebook as tq
from bs4 import BeautifulSoup
from pathos.pools import ProcessPool

# DART OpenAPI key sent as the 'crtfc_key' query parameter.
# Set the DART_API_KEY environment variable to override the built-in fallback
# (original note on the fallback: "obtained from somewhere").
# NOTE(review): a credential committed to source should be treated as leaked;
# rotate it and drop the fallback once every environment sets the variable.
api_key = os.environ.get('DART_API_KEY') or 'd81e78aa719d1c1e4ec7867ef22a737ab6cbb4c7'

def init_bs4(xml_str):
    """Return a BeautifulSoup tree for ``xml_str`` built with the "lxml" feature."""
    parser_name = "lxml"
    return BeautifulSoup(xml_str, parser_name)


def xml_parser(response):
    """Unzip a response body and parse its first archive member.

    The first file inside the zip archive held in ``response.content`` is
    decoded as EUC-KR; if that fails, as UTF-8; if both fail, the raw bytes
    are handed to the parser unchanged.

    Args:
        response: Object whose ``content`` attribute holds the bytes of a
            zip archive (e.g. a ``requests.Response``).

    Returns:
        Whatever ``init_bs4`` returns for the extracted document.

    Raises:
        zipfile.BadZipFile: If ``response.content`` is not a zip archive.
        IndexError: If the archive has no members.
    """
    # The context manager closes the archive as soon as the member is read.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        xml_data = zf.read(zf.infolist()[0].filename)

    # Try each encoding in turn. Previously the UTF-8 attempt ran inside the
    # EUC-KR handler, so its failure escaped the function and the raw-bytes
    # fallback (a duplicate ``except`` clause) could never execute.
    for encoding in ('euc-kr', 'utf-8'):
        try:
            xml_text = xml_data.decode(encoding)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Neither encoding worked: let the parser deal with the raw bytes.
        xml_text = xml_data
    return init_bs4(xml_text)

def request_dart(url, params, timeout=None):
    """Send a GET request to ``url`` with ``params`` as the query string.

    Args:
        url: Endpoint to request.
        params: Mapping of query parameters, passed to ``requests.get``.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit.

    Returns:
        The response object returned by ``requests.get``.
    """
    return requests.get(url, params=params, timeout=timeout)

def create_params(keys=None, values=None):
    """Build the query-parameter dict for a request.

    The dict always carries the module-level ``api_key`` under
    ``'crtfc_key'``. When ``keys`` is non-empty, each ``keys[i]`` is mapped
    to ``values[i]``.

    Raises:
        AssertionError: If ``keys`` and ``values`` differ in length.
    """
    query = {'crtfc_key': api_key}

    # Nothing extra requested: only the API key is sent.
    if not keys:
        return query

    assert len(keys) == len(values), "key and value lengths must be same."
    query.update(zip(keys, values))
    return query

def request_dart_corps_info():
    """Request the ``corpCode.xml`` endpoint and return the raw response."""
    return request_dart(
        'https://opendart.fss.or.kr/api/corpCode.xml',
        create_params(),
    )
    

def get_corp(only_stock=False):
    """Download the company list and return it as a list of dicts.

    Every ``<list>`` element of the parsed response becomes a dict with the
    keys "주식 코드" (stock code), "기업 코드" (company code), "기업 이름"
    (company name) and "수정 일자" (modification date), all stored as ``str``.
    A stock code that is exactly one space counts as missing and is stored as
    the string ``"None"``.

    Args:
        only_stock: When equal to ``True``, entries without a stock code are
            skipped; when equal to ``False``, every entry is kept.

    Returns:
        List of company dicts in document order.
    """
    soup = xml_parser(request_dart_corps_info())

    corps = []
    for entry in soup.find_all("list"):
        raw_code = entry.stock_code.string
        stock = None if raw_code == " " else raw_code

        # Equality (not truthiness) is kept on purpose so that values other
        # than True/False behave exactly as before: nothing is selected.
        if only_stock == True:
            wanted = bool(stock)
        else:
            wanted = only_stock == False
        if not wanted:
            continue

        corps.append({
            "주식 코드": str(stock),
            "기업 코드": str(entry.corp_code.string),
            "기업 이름": str(entry.corp_name.string),
            "수정 일자": str(entry.modify_date.string),
        })
    return corps


def get_corp_code(only_stock=False):
    """Return the "기업 코드" (company code) of every dict from ``get_corp``.

    ``only_stock`` is forwarded to ``get_corp`` unchanged.
    """
    return [corp['기업 코드'] for corp in get_corp(only_stock)]


def get_report_index(corps, bgn_de='20210101', pblntf_ty='A'):
    """Look up the first listed disclosure of each company.

    For every company dict in ``corps`` the ``list.json`` endpoint is queried.
    A company whose response status is ``"000"`` is copied into a new dict
    that additionally carries ``'report_idx'``: the ``rcept_no`` of the first
    item in the response's ``list``. Any other status is printed and the
    company is skipped. After every 800 requests the loop sleeps 60 seconds.

    Args:
        corps: Sequence of company dicts, each with a ``'기업 코드'`` key.
        bgn_de: Value sent as the ``bgn_de`` query parameter.
        pblntf_ty: Value sent as the ``pblntf_ty`` query parameter.

    Returns:
        List of new dicts (the inputs are not modified), one per company
        whose request succeeded.
    """
    # Loop invariants, built once instead of on every iteration.
    LIMIT = 800
    url = 'https://opendart.fss.or.kr/api/list.json'
    params = create_params(['bgn_de', 'pblntf_ty'], [bgn_de, pblntf_ty])

    active_corps = []

    print(len(corps))
    # Wrapping the sequence (not the enumerate object) lets tqdm see its length.
    for i, corp in enumerate(tq.tqdm(corps)):
        params['corp_code'] = corp['기업 코드']

        res = request_dart(url, params).json()
        if res['status'] == "000":
            # Copy the company fields, then attach the report id exactly once
            # (it used to be re-assigned inside the per-key copy loop).
            act_corp = dict(corp)
            act_corp['report_idx'] = res['list'][0]['rcept_no']
            active_corps.append(act_corp)
        else:
            print(f"{res['status']}: {res['message']}: {corp} ")

        # Pause periodically between batches of requests.
        if i > 0 and i % LIMIT == 0:
            print(f"so far {len(active_corps)} has been collected...")
            print(f"{LIMIT} requests has done. sleep 60 secs...")

            time.sleep(60)

    return active_corps

def parse_xml(soup):
    """Collect section text from a parsed document as a two-level dict.

    Each ``section-1`` element's title maps to a dict that in turn maps the
    title of every ``section-2`` element inside it to that element's
    ``get_text()`` output. Titles and texts are converted with ``str``.
    """
    return {
        str(chapter.title.string): {
            str(sub.title.string): str(sub.get_text())
            for sub in chapter.find_all("section-2")
        }
        for chapter in soup.find_all("section-1")
    }

def remove_table(soup):
    """Delete every ``table`` element from ``soup`` in place.

    Each element returned by ``soup.find_all('table')`` is removed with
    ``decompose()``. A second search then asserts that none is left.
    """
    for table in soup.find_all('table'):
        table.decompose()

    leftovers = soup.find_all('table')
    assert not leftovers, "table still exists!"

def get_report(corp):
    """Request ``document.xml`` for ``corp['report_idx']`` and return the response.

    The report id is sent as the ``rcept_no`` query parameter alongside the
    parameters that ``create_params`` always adds.
    """
    query = create_params(['rcept_no'], [corp['report_idx']])
    endpoint = 'https://opendart.fss.or.kr/api/document.xml'
    return requests.get(endpoint, params=query)

def get_doc(active_corps):
    """Download and parse the report of every company, updating each dict in place.

    For each dict the report is fetched with ``get_report`` and parsed with
    ``xml_parser``. On success the dict gains ``'original_xml'`` (``str`` of
    the parsed document before tables are removed) and ``'contents'`` (the
    ``parse_xml`` result after ``remove_table``). If parsing raises, the
    error and the dict are printed and both keys are set to ``None``, the
    same failure shape that ``get_active_corp`` produces. After every 800
    requests the loop sleeps 60 seconds.

    Args:
        active_corps: Sequence of dicts, each with a ``'report_idx'`` key.

    Returns:
        The same ``active_corps`` object that was passed in.
    """
    LIMIT = 800  # hoisted: does not change between iterations

    # Wrapping the sequence (not the enumerate object) lets tqdm see its length.
    for i, c in enumerate(tq.tqdm(active_corps)):
        r = get_report(c)
        if i > 0 and i % LIMIT == 0:
            print("cur i = ", i)
            print(f"{LIMIT} requests has done. sleep 60 secs...")

            time.sleep(60)
        try:
            soup = xml_parser(r)
            c['original_xml'] = str(soup)
            remove_table(soup)
            c['contents'] = parse_xml(soup)
        except Exception as e:
            # Best effort: report the failure and keep going with the rest.
            print(e)
            print(c)
            c['original_xml'] = None
            # Previously left unset (or stale) on failure; now always present.
            c['contents'] = None
    return active_corps

def get_active_corp(c):
    """Fetch and parse the report referenced by ``c['report_idx']``.

    On success ``c`` gains ``'original_xml'`` (``str`` of the parsed document
    before tables are removed) and ``'contents'`` (the ``parse_xml`` result
    after ``remove_table``). If parsing raises, the error and ``c`` are
    printed and both keys are set to ``None``.

    Returns:
        The same dict ``c``, modified in place.
    """
    query = create_params(['rcept_no'], [c['report_idx']])
    endpoint = 'https://opendart.fss.or.kr/api/document.xml'
    response = requests.get(endpoint, params=query)

    try:
        soup = xml_parser(response)
        c['original_xml'] = str(soup)
        remove_table(soup)
        c['contents'] = parse_xml(soup)
    except Exception as err:
        # Best effort: report the failure and mark both fields as missing.
        print(err)
        print(c)
        c['original_xml'] = None
        c['contents'] = None
    return c
        
def parallel_get_doc(active_corps):
    """Process one company dict via ``get_active_corp`` and return the result.

    Despite the plural parameter name, ``active_corps`` must be a single
    ``dict``; anything else trips the assertion.
    """
    assert isinstance(active_corps, dict)
    return get_active_corp(active_corps)

In [None]:
# Company information: fetch the companies that have a stock code and
# cache the resulting list on disk.
corps = get_corp(only_stock=True)
with open("all_corp_list.pickle", "wb") as fp:
    pickle.dump(corps, fp)

In [None]:
# From the company information, look up each company's latest report id
# and cache the result on disk.
active_corps = get_report_index(corps)
with open("active_corps.pickle", "wb") as fp:
    pickle.dump(active_corps, fp)

In [None]:
# From the latest report id, download each XML document and parse it.
# Without multiprocessing this takes about 1 hour...
# NOTE(review): the pool is not closed anywhere in this cell.
LIMIT = 500
pool = ProcessPool()
content_orig_total = []
for i in tq.tqdm(range(0, len(active_corps), LIMIT)):
    print(f"from {i} to {i+LIMIT}")
    # Hand one slice of at most LIMIT companies to the pool at a time.
    content_orig = pool.map(parallel_get_doc, active_corps[i:i+LIMIT])
    print(len(content_orig))
    content_orig_total += content_orig

# Persist every processed company dict in one pickle file.
with open("content_orig_total.pickle", "wb") as fp:
    pickle.dump(content_orig_total, fp)