# ipython 과 함께 춤을 

K-stat, 지자체 수출입, 통계 페이지 크롤링

In [2]:
import csv
import re
import requests
from codecs import BOM_UTF8
from bs4 import BeautifulSoup
import lxml.etree as ET


class KitaCrawler(object):
    """Crawl the K-stat (stat.kita.net) per-item import/export statistics sheet.

    Constructing an instance downloads the list page once and extracts, from
    that HTML, the sheet's column headers and the default values of the search
    form. ``get_page`` then posts the form to the worker URL and returns one
    page of result rows.

    After construction the instance exposes ``header_cols`` (list of column
    header strings) and ``initial_form_data`` (dict of the form fields that are
    sent with every request).
    """

    LIST_URL = 'http://stat.kita.net/stat/kts/prod/ProdItemImpExpList.screen'
    WORKER_URL = 'http://stat.kita.net/stat/kts/prod/ProdItemImpExpListWorker.screen'

    def __init__(self):
        self.init()

    def init(self):
        """Fetch the list page and (re)build ``header_cols`` and ``initial_form_data``.

        Raises:
            ValueError: if the page lacks the ``thisYY``/``lastYY`` variables
                or no column header can be extracted from it.
            requests.HTTPError: if the server answers with an error status.
            KeyError: if the form has no ``s_prod_code``/``s_prod_name`` field.
        """
        response = requests.get(self.LIST_URL)
        # Fail on the HTTP status itself rather than on a confusing
        # "not found thisYY/lastYY" caused by parsing an error page.
        response.raise_for_status()
        html = response.text

        self.header_cols = self._parse_header_cols(html)
        self.initial_form_data = self._parse_form_defaults(html)

        self.initial_form_data.update({
            'event_udap': 'Search',
            'searchType': 'SHEET',
            'sheet_col_length': len(self.header_cols),
            'search_gbn': 'Prod',       # Prod_Ctr (search by region/country), Prod (search by region)
            's_measure': '1000',        # amount unit: 1 (US$), 1000 (thousand US$), 1000000 (million US$)

            's_year': '2015',           # year
            's_month': '02',            # month
            's_cond_gb': 'HS',          # HS (HSK), MTI (MTI), SITC (SITC)
            's_cond_unit': '6',         # item level: 0 (all), 2 (2-digit), 4 (4-digit), 6 (6-digit), 10 (10-digit)
            's_field': 'AMT',           # displayed value: AMT (amount), WGT (weight), QTY (quantity)
            's_monthsum_gb': '2',       # period: 1 (current month), 2 (cumulative)
            's_sort': 'THIS_EXP_AMT',   # sort key
            's_language': 'kor_name',   # kor_name (Korean), eng_name (English)
            'pie_legend': 'Exp',        # Exp (export), Imp (import)
        })
        # The p_* fields mirror the s_* values chosen above / read from the form.
        self.initial_form_data.update({
            'p_cond_gb': self.initial_form_data['s_cond_gb'],
            'p_prod_cd': self.initial_form_data['s_prod_code'],
            'p_prod_nm': self.initial_form_data['s_prod_name'],
        })

    def get_page(self, page=1, max=10, **kwargs):
        """Request one page of results and return its rows.

        Args:
            page: page number, sent as the ``pageNum`` form field.
            max: rows per page, sent as the ``listCount`` form field.
            **kwargs: extra form fields; they override the defaults held in
                ``initial_form_data`` as well as ``pageNum``/``listCount``.

        Returns:
            A list of rows, each row being the list of ``TD`` texts of one
            ``TR`` element (``None`` for an empty cell).

        Raises:
            ValueError: if the response XML has no ``DATA`` element.
            requests.HTTPError: if the server answers with an error status.
        """
        form_data = self.initial_form_data.copy()
        form_data.update({
            'pageNum': page,
            'listCount': max,
        })
        form_data.update(kwargs)
        response = requests.post(self.WORKER_URL, data=form_data)
        response.raise_for_status()
        # Hand the raw bytes to the XML parser: decoding to text and encoding
        # back is lossy on undecodable bytes and fails outright when the
        # response carries no encoding (``response.encoding`` is None).
        return self._parse_rows(response.content)

    @staticmethod
    def _parse_header_cols(html):
        """Extract the column header strings from the page's ``initdata.Cols`` script."""
        try:
            this_yy = re.search(r'var\s+thisYY\s+=\s+"(.+?)"', html).group(1)
            last_yy = re.search(r'var\s+lastYY\s+=\s+"(.+?)"', html).group(1)
        except AttributeError:
            # re.search() returned None: the variable is not in the page.
            raise ValueError('not found thisYY/lastYY')

        header_cols = []
        matched = re.search(r'initdata.Cols = \[(.*?)\]', html, re.DOTALL | re.MULTILINE)
        if matched:
            headers_js = matched.group(1)
            # Headers written as the JS concatenation lastYY+"..." / thisYY+"..."
            # are folded into one literal by moving the year inside the quotes.
            headers_js = headers_js.replace('lastYY+"', u'"{}'.format(last_yy))
            headers_js = headers_js.replace('thisYY+"', u'"{}'.format(this_yy))
            header_cols = re.findall(r'Header\s*:\s*"(.+?)"', headers_js)

        if not header_cols:
            raise ValueError('not found header_cols')
        return header_cols

    @staticmethod
    def _parse_form_defaults(html):
        """Collect name -> value for every named input/select inside ``form1``."""
        # Name the parser explicitly so the parse tree does not depend on which
        # parsers happen to be installed; lxml is already imported by this file.
        soup = BeautifulSoup(html, 'lxml')
        form_data = {}
        tags = soup.select('form[name=form1] input') + soup.select('form[name=form1] select')
        for tag in tags:
            name = tag.get('name')
            if not name:
                # A control without a name cannot be posted as a form field.
                continue
            form_data[name] = tag.get('value', '')
        return form_data

    @staticmethod
    def _parse_rows(xml_bytes):
        """Parse the worker's XML response into a list of rows of cell texts."""
        doc = ET.fromstring(xml_bytes.strip())
        data = doc.find('DATA')
        if data is None:
            raise ValueError('not found DATA')
        return [[td.text for td in tr.findall('TD')] for tr in data.findall('TR')]

IPython notebook 상에서 HTML table로 보여주기 위해 ListTable 클래스를 구현

In [4]:
class ListTable(list):
    """List of rows that renders itself as an HTML table in IPython Notebook.

    Takes a 2-dimensional list of the form [[1,2,3],[4,5,6]]; every inner
    list becomes one ``<tr>`` and every item one ``<td>``.
    """

    def _repr_html_(self):
        """Return the table markup; cell text is HTML-escaped."""
        # Imported locally under another name so it cannot be confused with
        # the HTML string being built here.
        from html import escape

        parts = ["<table>"]
        for row in self:
            parts.append("<tr>")
            for col in row:
                # Escape &, < and > so cell text cannot break the table or
                # inject markup into the notebook output.
                cell = escape("{0}".format(col), quote=False)
                parts.append("<td>{0}</td>".format(cell))
            parts.append("</tr>")
        parts.append("</table>")
        return ''.join(parts)

1페이지에 10개씩, 3페이지의 내용을 크롤링

In [5]:
# Fetch a single result page and show it, header row first, as an HTML table.
crawer = KitaCrawler()

page = 3
per_page = 10  # rows per page; not called ``max`` so the builtin stays usable in later cells

table = ListTable()
table.append(crawer.header_cols)
rows = crawer.get_page(page, per_page)
table.extend(rows)

# Last expression of the cell: the notebook displays it through _repr_html_.
table

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
순번|순번,총계구분|총계구분,코드|코드,품목명|품목명,수량\n단위|수량\n단위,2014년|수출액,2014년|수출증감률,2014년|수입액,2014년|수입증감률,2014년|수지,2015년|수출액,2015년|수출증감률,2015년|수입액,2015년|수입증감률,2015년|수지,품목 구분
21,2,290243,파라 -크실렌,,4776720,5.8,155684,-83.1,4621035.938,719662,8.7,11520,-70.3,708142,HS
22,2,854370,그 밖의 기기,,3583414,-13.9,1813055,2.6,1770359.763,653611,25.6,309092,10.9,344519,HS
23,2,853890,기타,,2761676,-23.7,1044008,-27.8,1717667.939,610471,33.1,162378,1.5,448093,HS
24,2,854140,감광성 반도체 디바이스(광전지는 모듈에 조립되었거나 패널로 구성되었는지 여부와 관계없이 포함한다)와 발광다이오드,,3419654,-9.8,2892161,-12.4,527492.924,584124,9.4,483785,2.6,100339,HS
25,2,847170,기억장치,,2895261,21.4,1409863,7.4,1485398.472,576962,48.5,228331,12.2,348631,HS
26,2,870840,기어박스,,3395036,12.7,912199,-11.2,2482837.053,575604,2.3,130334,-9.1,445270,HS
27,2,870829,자동차용 차체의 기타 부분품과 부속품,,3156620,17.4,344497,21.1,2812123.714,505214,8.4,56997,10.3,448217,HS
28,2,854239,기타,,3873914,-7.6,7436939,5.1,-3563024.69,499484,-27,909488,-32,-410004,HS
29,2,842952,360도 회전의 상부구조를 가진 기계,,3359708,0,83060,41.2,3276647.903,476751,-10,8902,-38,467849,HS
