## 제주도 관광 이슈 크롤링
네이버 뉴스에서 해당하는 달의 관광 이슈 크롤링

In [1]:
from ssl import SSLError
from urllib import parse
from urllib.error import URLError
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import socket
import random
import argparse
import datetime
import pandas as pd

In [96]:
def crawl(query, save_as, begin, end, sort=0, field=1, delay=0.5, timeout=30, page_limit=50):
    '''
    :param query: 네이버 '뉴스'란에서 검색할 검색어
    :param save_as: 검색 결과 저장 경로
    :param begin: '기간' -> 검색 기간 시작
    :param end: '기간' -> 검색 기간 끝
    :param sort: '유형' -> 0(관련도순) 1(최신순) 2(오래된순)
    :param field: '영역' -> 0(전체) 1(제목)
    :param delay: (옵션) 검색 리퀘스트 간격 (초)
    :param timeout: (옵션) 타임아웃 시 기다릴 시간 (초)
    :param page_limit: (옵션) 검색 결과에서 몇 페이지까지 갈 것인지 결정
    :return:
    '''

    # prerequisite
    df = pd.DataFrame(columns=['link', 'title', 'date', 'article'])

    # index settings
    # a single pages includes 10 news, starting from page 1 (index 1~10)
    current_index = 1
    max_index = 2

    while (current_index <= max_index) and (1 + current_index // 10 <= page_limit):
        print('\n' + 'crawling... %s (current_page / max_page %i/%i)' % (query, 1 + current_index // 10, 1 + max_index // 10))
        url = make_url(query, sort, field, begin, end, current_index)
        print('making url', url)

        print('making beautifulsoup object from html')
        bsobj = make_bsobj(url, delay, timeout, trial=10)

        if bsobj is None:
            continue
        print('extracting naver news urls from bsobj')
        naver_news_urls = make_naver_news_urls(bsobj)
        naver_news_title = get_naver_news_title(bsobj)
        print(naver_news_urls)
        print(naver_news_title)
        naver_news_articles = []
        for i in range(len(naver_news_urls)):
            print('\topening:', naver_news_urls[i])
            news_bsobj = BeautifulSoup(naver_news_urls[i], 'html.parser')
            
            naver_news_article = get_naver_news_article(news_bsobj)
            naver_news_articles.append(naver_news_article)

#             date, article, title, newspaper = attributes
        df = pd.DataFrame([ x for x in zip(naver_news_urls, naver_news_title)])
        print(df)
        print('saving updated df')
#         df = df.sort_values(by=['date'])
#         df.to_excel(save_as, engine='xlsxwriter')
        current_index += 10
        max_index = get_max_index(bsobj)
        if max_index is None:
            break
    return naver_news_articles

In [97]:
df = crawl('제주도관광', 'test.xlsx', '2015.01.01', '2015.01.05')


crawling... 제주도관광 (current_page / max_page 1/1)
making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.01.01&de=2015.01.05&nso=so:r,p:from20150101to20150105&start=1&refresh_start=0
making beautifulsoup object from html
extracting naver news urls from bsobj
['http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200', 'http://www.asiatoday.co.kr/view.php?key=20150104010001014']
['부영그룹, 관광레저산업 전략사업 설정...제주도 시내면세점 추진', '제주도, 올해 1300만 관광객 유치목표..."질적성장 도민소득 ↑"']
	opening: http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200
	opening: http://www.asiatoday.co.kr/view.php?key=20150104010001014
                                                   0  \
0  http://www.wowtv.co.kr/newscenter/news/view.as...   
1  http://www.asiatoday.co.kr/view.php?key=201501...   

                                        1  
0     부영그룹, 관광레저산업 전략사업 설정...제주도 시내면세점 추진  
1  제주도, 올해 



In [98]:
df

['http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200',
 'http://www.asiatoday.co.kr/view.php?key=20150104010001014']

In [85]:
def make_url(query, sort, field, begin, end, page):
    url = "https://search.naver.com/search.naver?&where=news&query=" + parse.quote(query)
    url += "&sort=%i" % sort
    url += "&field=%i" % field
    url += "&ds=" + begin + "&de=" + end
    url += "&nso=so:r,p:"
    url += "from" + begin.replace(".", "") + "to" + end.replace(".", "")
    url += "&start=" + str(page)
    url += "&refresh_start=0"
    return url

In [86]:
def make_bsobj(url, delay=0.5, timeout=30, trial=10):
    ua = UserAgent(verify_ssl=False)
    count = 0

    while count < trial:
        try:
            time.sleep(delay + random.random())
            html = urlopen(Request(url=url, headers={'User-Agent': ua.random}), timeout=timeout)
            bsobj = BeautifulSoup(html, 'html.parser')
            return bsobj
        except (URLError, SSLError, socket.timeout) as e:
            print('(Error)', e)
            print('reloading...')
            count += 1
            time.sleep(timeout)
    return None

In [87]:
def make_naver_news_urls(bsobj):
    return [link['href'] for link in bsobj.find_all('a', 'news_tit')]

def get_naver_news_title(bsobj):
    return [link.text for link in bsobj.find_all('a', 'news_tit')]

def get_naver_news_article(news_bsobj):
    return news_bsobj.text

In [88]:
def get_attributes(bsobj):
    def _get_title(bsobj):
        ls = []
        title = bsobj.find('a', 'news_tit').text
        ls.append(title)
        print(ls)
        return title

    def _get_article(bsobj):
        article = bsobj.select('#articleBodyContents')[0].text
        article = article.encode('utf-8', 'replace').decode()
        return article

    def _get_date(news_bsobj):
        splits = bsobj.select('.t11')[0].text.split(' ')
        date = splits[0] + ' ' + splits[2]
        date = datetime.datetime.strptime(date, '%Y.%m.%d. %H:%M')
        date += datetime.timedelta(hours=12 * int(splits[1] == '오후'))
        print(date)
        return date

    def _get_newspaper(bsobj):
        newspaper = bsobj.find("div", class_="press_logo").find('img', alt=True).get('alt')
        return newspaper

    try:
        return _get_date(bsobj), _get_article(bsobj), _get_title(bsobj), _get_newspaper(bsobj)
    except IndexError:
        print('(Error) crawling failed (maybe url is redirected to somewhere else)')
        return None

In [89]:
def get_max_index(bsobj):
    paging = bsobj.find("div", {"class": "sc_page_inner"})
    if not paging:
        print('(WARNING!) no results found')
        return None

    atags = paging.find_all('a')
    if not atags:
        print('(WARNING!) there is only one page')
        return None

    return max([int(atag["href"].split('start=')[1]) for atag in atags])

In [90]:
def get_arguments():
    # Argument configuration
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True, help='query to search on NAVER')
    parser.add_argument('--begin', type=str, required=True, help='crawling begin point (%%Y.%%m.%%d format)')
    parser.add_argument('--end', type=str, required=True, help='crawling end point (%%Y.%%m.%%d format)')
    parser.add_argument('--save_as', type=str, default='test2.xlsx', help='excel save path')
    parser.add_argument('--sort', type=int, default=0, help='search result sorting: 0(relevant), 1(newest), 2(oldest)')
    parser.add_argument('--field', type=int, default=1, help='search field: 0(all), 1(title)')
    return parser.parse_args()

In [94]:
df = crawl('제주도관광', 'test.xlsx', '2015.01.01', '2015.01.05')


crawling... 제주도관광 (current_page / max_page 1/1)
making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.01.01&de=2015.01.05&nso=so:r,p:from20150101to20150105&start=1&refresh_start=0
making beautifulsoup object from html
extracting naver news urls from bsobj
['http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200', 'http://www.asiatoday.co.kr/view.php?key=20150104010001014']
['부영그룹, 관광레저산업 전략사업 설정...제주도 시내면세점 추진', '제주도, 올해 1300만 관광객 유치목표..."질적성장 도민소득 ↑"']
	opening: http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200
	opening: http://www.asiatoday.co.kr/view.php?key=20150104010001014
                                                   0  \
0  http://www.wowtv.co.kr/newscenter/news/view.as...   
1  http://www.asiatoday.co.kr/view.php?key=201501...   

                                        1  
0     부영그룹, 관광레저산업 전략사업 설정...제주도 시내면세점 추진  
1  제주도, 올해 



In [95]:
df

['http://www.wowtv.co.kr/newscenter/news/view.asp?bcode=T30001000&artid=A201501040200',
 'http://www.asiatoday.co.kr/view.php?key=20150104010001014']