# urllib 패키지를 이용하여 웹크롤링하기

## 삼성전자 재무제표 데이터 수집

In [4]:
from urllib.request import urlopen

# FnGuide financial-statement page for Samsung Electronics (gicode A005930).
URL = "http://comp.fnguide.com/SVO2/ASP/SVD_Finance.asp?pGB=1&gicode=A005930"

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read, instead of being left open.
with urlopen(URL) as req:
    html = req.read()

In [5]:
from bs4 import BeautifulSoup
    
# Parse the downloaded bytes into a searchable tree using the "html.parser" backend.
soup = BeautifulSoup(html, 'html.parser')

In [6]:
# find() returns only the first matching <table> element in the document.
soup.find("table")

<table class="us_table_ty1 h_fix zigbg_no">
<caption class="cphidden">포괄손익계산서</caption>
<colgroup>
<col style="width: 35%;"/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th class="clf tbold" scope="col">IFRS(연결)</th>
<th scope="col">2016/12</th>
<th scope="col">2017/12</th>
<th scope="col">2018/12</th>
<th scope="col">2019/12</th>
<th scope="col">전년동기</th>
<th class="cle" scope="col">전년동기(%)</th>
</tr>
</thead>
<tbody>
<tr class="rwf rowBold">
<th class="l clf" scope="row">
<div class="th_b">매출액</div>
</th>
<td class="r">2,018,667</td>
<td class="r">2,395,754</td>
<td class="r">2,437,714</td>
<td class="r">2,304,009</td>
<td class="r">2,437,714</td>
<td class="r cle"><span class="tcr">-5.5</span></td>
</tr>
<tr class="rwf ">
<th class="l clf" scope="row">
<div class="">매출원가</div>
</th>
<td class="r">1,202,777</td>
<td class="r">1,292,907</td>
<td class="r">1,323,944</td>
<td class="r">1,472,395</td>
<td class="r">1,323,944</td>
<td class="r cle">11.2</td>
</tr>


In [7]:
# find_all() collects every <table> element on the page, in document order.
soup_table_all = soup.find_all("table")

In [8]:
# How many <table> elements the page contains.
len(soup_table_all)

6

In [9]:
# Inspect the first collected table.
soup_table_all[0]

<table class="us_table_ty1 h_fix zigbg_no">
<caption class="cphidden">포괄손익계산서</caption>
<colgroup>
<col style="width: 35%;"/>
<col/>
<col/>
<col/>
<col/>
<col/>
<col/>
</colgroup>
<thead>
<tr>
<th class="clf tbold" scope="col">IFRS(연결)</th>
<th scope="col">2016/12</th>
<th scope="col">2017/12</th>
<th scope="col">2018/12</th>
<th scope="col">2019/12</th>
<th scope="col">전년동기</th>
<th class="cle" scope="col">전년동기(%)</th>
</tr>
</thead>
<tbody>
<tr class="rwf rowBold">
<th class="l clf" scope="row">
<div class="th_b">매출액</div>
</th>
<td class="r">2,018,667</td>
<td class="r">2,395,754</td>
<td class="r">2,437,714</td>
<td class="r">2,304,009</td>
<td class="r">2,437,714</td>
<td class="r cle"><span class="tcr">-5.5</span></td>
</tr>
<tr class="rwf ">
<th class="l clf" scope="row">
<div class="">매출원가</div>
</th>
<td class="r">1,202,777</td>
<td class="r">1,292,907</td>
<td class="r">1,323,944</td>
<td class="r">1,472,395</td>
<td class="r">1,323,944</td>
<td class="r cle">11.2</td>
</tr>


In [10]:
# Select the first <table> whose class attribute is exactly this string.
# (The earlier bare soup.find(attrs=...) call was dropped: its result was
# discarded and, not being the cell's last expression, it was never displayed.)
soup_table = soup.find("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})

In [11]:
from html_table_parser import parser_functions as parser
import pandas as pd

# Flatten the <table> element into a list of rows (each row a list of cell texts).
table = parser.make2d(soup_table)

# The first row holds the column labels; every remaining row is data.
header_row, body_rows = table[0], table[1:]
df = pd.DataFrame(body_rows, columns=header_row)
df.head()

Unnamed: 0,IFRS(연결),2016/12,2017/12,2018/12,2019/12,전년동기,전년동기(%)
0,매출액,2018667,2395754,2437714,2304009,2437714,-5.5
1,매출원가,1202777,1292907,1323944,1472395,1323944,11.2
2,매출총이익,815890,1102847,1113770,831613,1113770,-25.3
3,판매비와관리비계산에 참여한 계정 펼치기,523484,566397,524903,553928,524903,5.5
4,인건비,59763,67972,64514,64226,64514,-0.4


## 함수로 만들고, 재시도 횟수 제한 추가

In [12]:
from urllib.error import HTTPError
import logging
import time

def collect_sheet(code, try_cnt=1):
    """Fetch the FnGuide financial-statement table for ``code`` as a DataFrame.

    Downloads the page, picks the first ``<table>`` whose class attribute is
    ``"us_table_ty1 h_fix zigbg_no"``, and converts it with
    ``parser.make2d``; the first row becomes the column labels.

    Args:
        code: Value for the ``gicode`` query parameter, e.g. ``"A005930"``.
        try_cnt: Number of the attempt this call starts at. When an
            ``HTTPError`` occurs on attempt 3 or later the function gives up,
            so the default of 1 allows up to three attempts.

    Returns:
        A ``pandas.DataFrame`` built from the table, or ``None`` when the
        request keeps failing with ``HTTPError`` or the expected table is
        not present in the page.
    """
    url = "http://comp.fnguide.com/SVO2/ASP/SVD_Finance.asp?pGB=1&gicode={}".format(code)

    # Retry loop. The previous version recursed into an undefined function
    # with an undefined argument, reset the counter to 1 (``try_cnt=+1``)
    # and dropped the retry's result; counting attempts here fixes all three.
    attempt = try_cnt
    while True:
        try:
            # Context manager closes the HTTP response once the body is read.
            with urlopen(url) as resp:
                html = resp.read()
            break
        except HTTPError as e:
            if attempt >= 3:
                logging.warning(e)
                return None
            # Back off briefly before asking the server again.
            time.sleep(3)
            attempt += 1

    soup = BeautifulSoup(html, 'html.parser')
    soup_table = soup.find("table", attrs={"class": "us_table_ty1 h_fix zigbg_no"})
    if soup_table is None:
        # No matching table in the page; report it the same way as a failed
        # download instead of failing inside the table parser.
        logging.warning("financial statement table not found for %s", code)
        return None

    table = parser.make2d(soup_table)
    # First row is the header; the remaining rows are the statement lines.
    return pd.DataFrame(table[1:], columns=table[0])

In [13]:
# Fetch the statement table for gicode A005930, starting at attempt 1.
df = collect_sheet("A005930", try_cnt=1)

In [14]:
# Preview the first rows of the collected table.
df.head()

Unnamed: 0,IFRS(연결),2016/12,2017/12,2018/12,2019/12,전년동기,전년동기(%)
0,매출액,2018667,2395754,2437714,2304009,2437714,-5.5
1,매출원가,1202777,1292907,1323944,1472395,1323944,11.2
2,매출총이익,815890,1102847,1113770,831613,1113770,-25.3
3,판매비와관리비계산에 참여한 계정 펼치기,523484,566397,524903,553928,524903,5.5
4,인건비,59763,67972,64514,64226,64514,-0.4
