## 제주도 관광 이슈 크롤링
네이버 뉴스에서 해당하는 달의 관광 이슈 크롤링

In [11]:
from ssl import SSLError
from urllib import parse
from urllib.error import URLError
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import socket
import random
import argparse
import datetime
import pandas as pd
import requests
import nltk
import warnings
import random
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

In [22]:
def crawl(query, save_as, begin, end, sort=0, field=1, delay=0.5, timeout=30, page_limit=50):
    '''
    :param query: 네이버 '뉴스'란에서 검색할 검색어
    :param save_as: 검색 결과 저장 경로
    :param begin: '기간' -> 검색 기간 시작
    :param end: '기간' -> 검색 기간 끝
    :param sort: '유형' -> 0(관련도순) 1(최신순) 2(오래된순)
    :param field: '영역' -> 0(전체) 1(제목)
    :param delay: (옵션) 검색 리퀘스트 간격 (초)
    :param timeout: (옵션) 타임아웃 시 기다릴 시간 (초)
    :param page_limit: (옵션) 검색 결과에서 몇 페이지까지 갈 것인지 결정
    :return:
    '''

    # prerequisite
    df = pd.DataFrame(columns=['link', 'title', 'date', 'article'])

    # index settings
    # a single pages includes 10 news, starting from page 1 (index 1~10)
    current_index = 1
    max_index = 2

    while (current_index <= max_index) and (1 + current_index // 10 <= page_limit):
        url = make_url(query, sort, field, begin, end, current_index)
        bsobj = make_bsobj(url, delay, timeout, trial=10)
        print('making url', url)

        if bsobj is None:
            continue
        naver_news_urls = make_naver_news_urls(bsobj)
        naver_news_title = get_naver_news_title(bsobj)
        naver_news_articles = []
        for i in range(len(naver_news_urls)):
            url = naver_news_urls[i]
            news_bsobj = BeautifulSoup(url, 'lxml')
            
            naver_news_article = get_naver_news_article(url, news_bsobj)
            naver_news_articles.append(naver_news_article)

#             date, article, title, newspaper = attributes
        df = pd.DataFrame([ x for x in zip(naver_news_title, naver_news_articles)])
        df.columns = ['naver_news_title', 'naver_news_articles']
#         df = df.sort_values(by=['date'])
        df.to_excel(save_as, engine='xlsxwriter')
        current_index += 10
    
        max_index = get_max_index(bsobj)
        if max_index is None:
            break
        if len(df) > 50:
            break
        
    print(f'{begin}_complete!')
    return df

In [13]:
def make_naver_news_urls(bsobj):
    return [link['href'] for link in bsobj.find_all('a', 'news_tit')]

def get_naver_news_title(bsobj):
    return [link.text for link in bsobj.find_all('a', 'news_tit')]

def get_naver_news_article(url, news_bsobj):
    try:
        html = urlopen(url).read()
        soup = BeautifulSoup(html, "lxml")
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        return text
    except:
        return None

In [14]:
def make_url(query, sort, field, begin, end, page):
    url = "https://search.naver.com/search.naver?&where=news&query=" + parse.quote(query)
    url += "&sort=%i" % sort
    url += "&field=%i" % field
    url += "&ds=" + begin + "&de=" + end
    url += "&nso=so:r,p:"
    url += "from" + begin.replace(".", "") + "to" + end.replace(".", "")
    url += "&start=" + str(page)
    url += "&refresh_start=0"
    return url

In [15]:
def make_bsobj(url, delay=0.5, timeout=30, trial=10):
    ua = UserAgent(verify_ssl=False)
    count = 0

    while count < trial:
        try:
            time.sleep(delay + random.random())
            html = urlopen(Request(url=url, headers={'User-Agent': ua.random}), timeout=timeout)
            bsobj = BeautifulSoup(html, 'lxml')
            return bsobj
        except (URLError, SSLError, socket.timeout) as e:
            print('(Error)', e)
            print('reloading...')
            count += 1
            time.sleep(timeout)
    return None

In [52]:
def get_max_index(bsobj):
    paging = bsobj.find_all('a', 'btn_next')
    print(paging)
    if not paging:
        print('(WARNING!) no results found')
        return None
    print(1)
    return True

In [56]:
url = 'https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.01.01&de=2015.01.30&nso=so:r,p:from20150101to20150130&start=1&refresh_start=21'
delay=0.5
timeout=30
bsobj = make_bsobj(url, delay, timeout, trial=10)
max_index = get_max_index(bsobj)
if max_index is None:
    print(1)

[<a aria-disabled="false" class="btn_next" href="?where=news&amp;sm=tab_pge&amp;query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&amp;sort=0&amp;photo=0&amp;field=1&amp;pd=3&amp;ds=2015.01.01&amp;de=2015.01.30&amp;cluster_rank=13&amp;mynews=0&amp;office_type=0&amp;office_section_code=0&amp;news_office_checked=&amp;nso=so:r,p:from20150101to20150130,a:t&amp;start=11" onclick="return goOtherCR(this, 'a=nws.paging&amp;r=2&amp;u='+urlencode(urlexpand(this.href)));" role="button"><i class="spnew ico_page_arr">다음</i></a>]
1


In [27]:
def get_arguments():
    # Argument configuration
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True, help='query to search on NAVER')
    parser.add_argument('--begin', type=str, required=True, help='crawling begin point (%%Y.%%m.%%d format)')
    parser.add_argument('--end', type=str, required=True, help='crawling end point (%%Y.%%m.%%d format)')
    parser.add_argument('--sort', type=int, default=0, help='search result sorting: 0(relevant), 1(newest), 2(oldest)')
    parser.add_argument('--field', type=int, default=1, help='search field: 0(all), 1(title)')
    return parser.parse_args()

In [28]:
def ran_num(n):
    ls = []
    num = random.randint(1, n)
    while n in ls :
        num = random.randint(1, n)
        ls.append(num)
    return num

In [29]:
def saving_data(start_year, end_year):
    for i in range(start_year, end_year + 1):
        for j in range(1, 13):
            if j < 10:
                j = '0' + str(j)
            df = crawl('제주도관광', f'./news_data/{i}_{j}.xlsx', f'{i}.{j}.01', f'{i}.{j}.30')

In [30]:
saving_data(2015, 2021)

making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.01.01&de=2015.01.30&nso=so:r,p:from20150101to20150130&start=1&refresh_start=0
2015.01.01_complete!
making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.02.01&de=2015.02.30&nso=so:r,p:from20150201to20150230&start=1&refresh_start=0
2015.02.01_complete!
making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.03.01&de=2015.03.30&nso=so:r,p:from20150301to20150330&start=1&refresh_start=0
2015.03.01_complete!
making url https://search.naver.com/search.naver?&where=news&query=%EC%A0%9C%EC%A3%BC%EB%8F%84%EA%B4%80%EA%B4%91&sort=0&field=1&ds=2015.04.01&de=2015.04.30&nso=so:r,p:from20150401to20150430&start=1&refresh_start=0
2015.04.01_complete!
making url https://search.naver.com/search.naver?&where=news

KeyboardInterrupt: 