# **Web Crawling**
## **01 웹페이지 정보 확인**
```bash
$ pip install python-whois
```

In [1]:
import whois
# Query the WHOIS record for the site; as the last expression of the cell,
# the returned record is what the notebook displays below.
whois.whois('http://used.aladin.co.kr')

{'domain_name': 'aladin.co.kr',
 'registrant_name': 'Aladin Communications Inc.',
 'registrant_address': None,
 'registrant_zip': None,
 'admin_name': 'Aladdin communications Inc',
 'admin_email': 'webmaster@aladin.co.kr',
 'admin_phone': None,
 'creation_date': datetime.datetime(1999, 1, 15, 0, 0),
 'updated_date': datetime.datetime(2019, 1, 10, 0, 0),
 'expiration_date': datetime.datetime(2023, 10, 15, 0, 0),
 'registrar': 'Inames Co., Ltd.(http://www.inames.co.kr)',
 'name_servers': ['dns.aladdin.co.kr',
  'ns.intellicenter.co.kr',
  'ns3.intellicenter.co.kr']}

## **02 다운로드 재시도**
- 5xx 서버 오류가 발생하면 다운로드를 재시도한다
- `hasattr()` : 객체가 지정한 속성을 갖고 있는지 여부를 `bool` 값으로 반환하는 함수

In [2]:
from urllib.request import urlopen
from urllib.error   import HTTPError

def download(url, number_retries=2):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Args:
        url: Address to fetch; anything ``urlopen`` accepts.
        number_retries: How many further attempts to make when the server
            answers with a 5xx status code.

    Returns:
        The decoded page text, or ``None`` when the download failed.
    """
    # Local import: the surrounding file only imports HTTPError.
    from urllib.error import URLError

    print('Downloading: ', url)
    try:
        # The context manager closes the connection even if read/decode fails.
        with urlopen(url) as response:
            return response.read().decode('utf-8')
    except URLError as e:
        # HTTPError is a subclass of URLError, so this also covers failures
        # that carry no HTTP status (DNS errors, refused connections, ...).
        print('Download error: ', e.reason)
        status = getattr(e, 'code', None)
        # Only server-side (5xx) errors are worth retrying.
        if number_retries > 0 and status is not None and 500 <= status < 600:
            return download(url, number_retries - 1)
    return None

# Exercise the retry path: in the recorded output below the URL fails with
# "Internal Server Error" on all three attempts, so download() returns None.
response = download('http://httpstat.us/500')
type(response)

Downloading:  http://httpstat.us/500
Download error:  Internal Server Error
Downloading:  http://httpstat.us/500
Download error:  Internal Server Error
Downloading:  http://httpstat.us/500
Download error:  Internal Server Error


NoneType

In [3]:
# A site that blocks Python crawlers altogether (the recorded output below
# shows the request being rejected with "Forbidden").
download('http://www.samsung.com')

Downloading:  http://www.samsung.com
Download error:  Forbidden


## **03 웹링크자료 다운로드**

In [4]:
import re
from urllib.request import Request

def crawl_sitemap(url):
    """Download a sitemap and then every page it lists.

    Args:
        url: Address of the sitemap file.

    Returns:
        Always ``None``; the downloaded pages are not kept.
    """
    sitemap_text = download(url)
    if sitemap_text is None:
        # The sitemap itself could not be fetched, so there is nothing to crawl.
        return None
    # Each <loc> element of the sitemap holds one page URL.
    for page_url in re.findall('<loc>(.*?)</loc>', sitemap_text):
        download(page_url)

# Crawl the pages listed in the example site's sitemap (in the recorded
# output below the sitemap request itself fails with "Not Found").
crawl_sitemap('http://example.webscraping.com/sitemap.xml')

Downloading:  http://example.webscraping.com/sitemap.xml
Download error:  Not Found


## **04 프록시 지원 접근**

In [5]:
from urllib.request import Request
import urllib.robotparser

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download ``url`` with a custom user agent and an optional proxy.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        proxy: Proxy address used for the URL's scheme, or ``None`` for a
            direct connection.
        num_retries: How many further attempts to make when the server
            answers with a 5xx status code.

    Returns:
        The page body decoded as UTF-8, or ``None`` when the download failed.
    """
    # Local import: the surrounding file only imports HTTPError.
    from urllib.error import URLError

    print('Downloding:', url)
    request = Request(url, headers={'User-agent': user_agent})
    handlers = []
    if proxy:
        # Route only this URL's scheme through the given proxy.
        scheme = urllib.parse.urlparse(url).scheme
        handlers.append(urllib.request.ProxyHandler({scheme: proxy}))
    opener = urllib.request.build_opener(*handlers)
    try:
        # Go through the opener (not urlopen) so the proxy handler is used.
        with opener.open(request) as response:
            return response.read().decode('utf-8')
    except URLError as e:
        # HTTPError is a subclass of URLError, so this also covers failures
        # that carry no HTTP status (DNS errors, refused connections, ...).
        print('Download error:', e.reason)
        status = getattr(e, 'code', None)
        # Retry on 5xx HTTP errors, forwarding every argument so the retry
        # budget really shrinks and the agent/proxy settings are kept.
        if num_retries > 0 and status is not None and 500 <= status < 600:
            return download(url, user_agent, proxy, num_retries - 1)
    return None

In [6]:
def link_crawler(seed_url, link_regex):
    """Crawl outward from ``seed_url``, following links that match ``link_regex``.

    Pages disallowed by the seed site's robots.txt are reported and skipped,
    and pages that fail to download are skipped without stopping the crawl.

    Args:
        seed_url: Address where the crawl starts; robots.txt is read from the
            root of the same site.
        link_regex: Pattern a link (as written in the page) must match at its
            start to be followed.
    """
    crawl_queue = [seed_url]      # URLs still waiting to be downloaded
    seen = set(crawl_queue)       # every URL ever queued, so none is fetched twice
    rp = urllib.robotparser.RobotFileParser()
    # Read the robots.txt of the site actually being crawled.
    rp.set_url(urllib.parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    user_agent = 'GoodCrawler'     # switch between this and the agent below to compare results
    # user_agent = 'BadCrawler'
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        # Fetch with the same agent that was checked against robots.txt.
        html = download(url, user_agent=user_agent)
        if html is None:
            # Download failed; there is no page to extract links from.
            continue
        for link in get_links(html):
            if re.match(link_regex, link):            # does the link match the pattern?
                # Build an absolute URL relative to the page the link was found on.
                link = urllib.parse.urljoin(url, link)
                if link not in seen:                  # skip links that were already queued
                    seen.add(link)
                    crawl_queue.append(link)

In [7]:
def get_links(html):
    """Return the ``href`` value of every ``<a>`` tag found in ``html``.

    Args:
        html: Page source as text; ``None`` or an empty string (for example
            after a failed download) yields no links.

    Returns:
        A list of the link strings in document order.
    """
    if not html:
        return []
    # Matches the quoted href attribute of any anchor tag, ignoring case.
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

# Start at the example site's home page and follow only links that begin
# with /index/ or /view/.
link_crawler('http://example.webscraping.com', '/(index|view)/')

Downloding: http://example.webscraping.com


## **05 다운로드 조절하기**

In [8]:
# link_crawler_delay.py
import datetime
import time

class Throttle:
    """Rate-limit downloads by pausing between requests to the same domain."""

    def __init__(self, delay):
        """Create a throttle that keeps ``delay`` seconds between hits on a domain."""
        self.delay = delay  # minimum seconds between two downloads from one domain
        self.domains = {}   # domain -> datetime of the most recent access

    def wait(self, url):
        """Sleep if the domain of ``url`` was accessed less than ``delay`` seconds ago.

        Args:
            url: Address about to be downloaded; only its network location
                is used to identify the domain.
        """
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            # total_seconds() covers the whole interval; the ``seconds``
            # attribute would drop whole days and fractions of a second.
            elapsed = (datetime.datetime.now() - last_accessed).total_seconds()
            sleep_secs = self.delay - elapsed
            if sleep_secs > 0:
                # The domain was accessed recently, so wait out the remainder.
                time.sleep(sleep_secs)
        # Record this access as the most recent one for the domain.
        self.domains[domain] = datetime.datetime.now()

In [9]:
def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Download ``url`` with a custom user agent and an optional proxy.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        proxy: Proxy address used for the URL's scheme, or ``None`` for a
            direct connection.
        num_retries: How many further attempts to make when the server
            answers with a 5xx status code.

    Returns:
        The page body decoded as UTF-8, or ``None`` when the download failed.
    """
    # Local import: the surrounding file only imports HTTPError.
    from urllib.error import URLError

    print('Downloding:', url)
    request = Request(url, headers={'User-agent': user_agent})
    handlers = []
    if proxy:
        # Route only this URL's scheme through the given proxy.
        scheme = urllib.parse.urlparse(url).scheme
        handlers.append(urllib.request.ProxyHandler({scheme: proxy}))
    opener = urllib.request.build_opener(*handlers)
    try:
        # Go through the opener (not urlopen) so the proxy handler is used.
        with opener.open(request) as response:
            return response.read().decode('utf-8')
    except URLError as e:
        # HTTPError is a subclass of URLError, so this also covers failures
        # that carry no HTTP status (DNS errors, refused connections, ...).
        print('Download error:', e.reason)
        status = getattr(e, 'code', None)
        # Retry on 5xx HTTP errors, forwarding every argument so the retry
        # budget really shrinks and the agent/proxy settings are kept.
        if num_retries > 0 and status is not None and 500 <= status < 600:
            return download(url, user_agent, proxy, num_retries - 1)
    return None

In [10]:
def link_crawler(seed_url, link_regex):
    """Crawl outward from ``seed_url`` with a pause between hits on one domain.

    Pages disallowed by the seed site's robots.txt are reported and skipped,
    and pages that fail to download are skipped without stopping the crawl.

    Args:
        seed_url: Address where the crawl starts; robots.txt is read from the
            root of the same site.
        link_regex: Pattern a link (as written in the page) must match at its
            start to be followed.
    """
    crawl_queue = [seed_url]    # URLs still waiting to be downloaded
    seen = set(crawl_queue)     # every URL ever queued, so none is fetched twice
    rp = urllib.robotparser.RobotFileParser()
    # Read the robots.txt of the site actually being crawled.
    rp.set_url(urllib.parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    user_agent = 'GoodCrawler'     # switch between this and the agent below to compare results
    # user_agent = 'BadCrawler'
    delay = 2                      # seconds to leave between downloads from one domain
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        throttle.wait(url)
        # Fetch with the same agent that was checked against robots.txt.
        html = download(url, user_agent=user_agent)
        if html is None:
            # Download failed; there is no page to extract links from.
            continue
        for link in get_links(html):
            if re.match(link_regex, link):            # does the link match the pattern?
                # Build an absolute URL relative to the page the link was found on.
                link = urllib.parse.urljoin(url, link)
                if link not in seen:                  # skip links that were already queued
                    seen.add(link)
                    crawl_queue.append(link)

In [12]:
def get_links(html):
    """Return the ``href`` value of every ``<a>`` tag found in ``html``.

    Args:
        html: Page source as text; ``None`` or an empty string (for example
            after a failed download) yields no links.

    Returns:
        A list of the link strings in document order.
    """
    if not html:
        return []
    # Matches the quoted href attribute of any anchor tag, ignoring case.
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)

# Run the throttled crawler from the example site's home page, following
# only links that begin with /index/ or /view/.
link_crawler('http://example.webscraping.com', '/(index|view)/')

Downloding: http://example.webscraping.com


## **06 Download**

In [None]:
# hotel_name = tree.xpath(xpath)
# print(len(tree.xpath(xpath)))
# temp = re.sub("\n","",hotel_name[0])
# temp = re.sub("  ","",temp); temp

In [None]:
from lxml import html
import requests, re
# Fetch the page and parse it into an element tree.
url = 'https://developer.mozilla.org/en-US/docs/Tools/Settings'
page = requests.get(url)
tree = html.fromstring(page.content)

# Two XPath queries over the same list item: node() versus text().
node = '//*[@id="wikiArticle"]/ul/li[1]/node()'
text = '//*[@id="wikiArticle"]/ul/li[1]/text()'

for txt in tree.xpath(node):
    print('Node :', txt)

print('----------------------------------------------------')

for txt in tree.xpath(text):
    print('Text :', txt)

# MultiThread Download
- https://bitbucket.org/wswp/code/src/9e6b82b47087c2ada0e9fdf4f5e037e151975f0f/chapter04/threaded_test.py?at=default&fileviewer=file-view-default
- https://stackoverflow.com/questions/32285453/why-does-multithreading-do-not-speed-up-parsing-html-with-lxmlm

In [None]:
from lxml.html import fromstring, HTMLParser
import time
from threading import Thread
from urllib.request import urlopen
def func(number):
    """Parse the module-level ``DATA`` document ``number`` times.

    A single parser instance is created up front and reused for every parse.
    """
    shared_parser = HTMLParser()
    for _ in range(number):
        fromstring(DATA, parser=shared_parser)

In [None]:
# Download the document once; func() parses this module-level DATA repeatedly.
DATA = urlopen('http://lxml.de/FAQ.html').read()

# Baseline: all 100 parses on a single worker thread.
print('Testing one thread (100 job per thread)')
start = time.time()
t1 = Thread(target=func, args=[100])
t1.start()
t1.join()
elapsed = time.time() - start
print('Time: %.5f' % elapsed)

# Same total work split across two threads started back to back.
print('Testing two threads (50 jobs per thread)')
start = time.time()
t1, t2 = (Thread(target=func, args=[50]) for _ in range(2))
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()
elapsed = time.time() - start
print('Time: %.5f' % elapsed)

## Web Crawling

In [None]:
# Bs4 01
#url = 'http://example.webscraping.com/view/United Kingdom-239'
#html = download(url)
# soup.find( text = re.compile("sisters") )              # text 내용이 sisters 를 포함시
# soup.find( attrs = {'id' : 'place_area_row'} )
# soup.find( attrs = {'class' : 'w2p_fw'} )

In [None]:
# Bs4 02 
# soup.find_all( "p")           # <tag>
# soup.find_all( "p", "title" )  # CSS클래스 title 인 <p> tag의 값을 추출
# soup.find_all( id = "link2" )    # id
# soup.find_all( id = True)                              # id 속성을 포함시
# soup.find_all( href = re.compile("elsie") )            # elsie URL 링크 포함시
# soup.find_all( href = re.compile("elsie"), id='link1')

In [None]:
# Regular Expression 01
# print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[0])
# print(re.findall('<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>', html))
# print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">\
#                   Area: </label></td><td class="w2p_fw">(.*?)</td>', html))

In [None]:
# Regular Expression 02
# url = 'http://example.webscraping.com/view/United Kingdom-239'
# html = download(url)
# re.findall("<td class='w2p_fw'>(.*?)</td>", html)

In [None]:
# Regular Expression (제일 빠름)
# results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]
# lxml  (중간)
# tree = lxml.html.fromstring(html)
# results[field] = tree.cssselect('table > tr#places_%s__row> td.w2p_fw' % field)[0].text_content()
# BS4   (제일 느리다)
# soup = BeautifulSoup(html, 'html.parser')
# results[field] = soup.find('table').find('tr',id='places_%s__row' % field).find('td', class_='w2p_fw').text

## LXML
- C로 만든 XML 분석용 라이브러리
- 설치는 어렵지만 가장 강력한 모듈

In [None]:
#bs4로 활용찾고, Re로 완성하기
# Regular Expression (용이성은 떨어지지만, 제일 빠르고 파이썬 내부 모듈을 활용) - bs4로 활용찾고, Re로 완성하기
# results[field] = re.search('<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field, html).groups()[0]

In [None]:
# find_population_lxml.py
import re
from lxml import html
# Markup with an unquoted attribute value and unclosed <li> tags.
broken_html = '<ul class=country><li>Area<li>Population</ul>'

# Parse the HTML, then serialise the resulting tree back to markup.
tree = html.fromstring(broken_html)
fixed_html = html.tostring(tree, pretty_print=True)

# tostring() returned bytes; show them as text in the cell output.
fixed_html.decode('utf-8')

In [None]:
from lxml import html
import requests
# Fetch the ranking page and parse the raw response bytes into an element
# tree; the XPath cells below query this `tree`.
url = 'http://media.daum.net/ranking/empathy'
page = requests.get(url)
tree = html.fromstring(page.content)
# Preview the first 300 characters of the decoded response.
page.text[:300]

In [None]:
# Text of the <span> inside the first list item.
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/div/span/text()'
hotel_name = tree.xpath(xpath)

# Strip newlines first, then every run of two spaces, from the first match.
temp = re.sub("  ", "", re.sub("\n", "", hotel_name[0]))
temp

In [None]:
# .list_news2 > li:nth-child(1) > div:nth-child(3) > strong:nth-child(1) > a:nth-child(1)
# //*[@id="mArticle"]/div[2]/ul[2]/li[1]/div[2]/strong/a
# //*[@id="mArticle"]/div[2]/ul[2]/li[2]/div[2]/strong/a

In [None]:
# Index of the list item (the `li[n]` step) selected by the XPath queries
# in the following cells.
n=10

In [None]:
# Link text inside the n-th list item.
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[{}]/div[2]/strong/a/text()'.format(n)
hotel_name = tree.xpath(xpath)

# Strip newlines first, then every run of two spaces, from the first match.
temp = re.sub("  ", "", re.sub("\n", "", hotel_name[0]))
temp

In [None]:
# Text of the <span> inside the n-th list item.
xpath = '//*[@id="mArticle"]/div[2]/ul[2]/li[{}]/div[2]/div/span/text()'.format(n)
hotel_name = tree.xpath(xpath)

# Strip newlines first, then every run of two spaces, from the first match.
temp = re.sub("  ", "", re.sub("\n", "", hotel_name[0]))
temp

## Web Crawling to CSV DB

In [None]:
from lxml import etree
from io import StringIO
from csv import DictWriter
# In-memory sample document containing two <a> elements, both carrying
# data-spm-anchor-id="0.0.0.0"; only the second has a title and inner text.
f= StringIO('''
    <html><body>
    <a class="ui-magnifier-glass" 
       href="here goes the link that i want to extract" data-spm-anchor-id="0.0.0.0" 
       style="width: 258px; height: 258px; position: absolute; left: -1px; top: -1px; display: none;"></a>
    <a href="link to extract" title="title to extract" rel="category tag" data-spm-anchor-id="0.0.0.0">
    or maybe this word instead of title</a>
    </body></html>''')

In [None]:
# Parse the sample document and select the anchors tagged with the marker id.
doc = etree.parse(f)
r = doc.xpath('//a[@data-spm-anchor-id="0.0.0.0"]')

# One row per anchor: its href, its title attribute and its leading text.
data = [
    {'link': elem.get('href'), 'title': elem.get('title'), 'text': elem.text}
    for elem in r
]

In [None]:
# Write the collected rows to file.csv with a header line.
# newline='' lets the csv module control line endings itself (otherwise
# blank rows can appear between records on Windows), and an explicit
# encoding keeps the file readable regardless of the platform default.
with open('file.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['link', 'title', 'text']
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

In [None]:
import pandas as pd
# Load file.csv back into a DataFrame; as the last expression of the cell,
# the table is displayed so the written rows can be checked.
pd.read_csv('./file.csv')