In [None]:
# !pip install bs4
# !pip install selenium

In [None]:
import os
import re
import time
import urllib
import urllib.parse
from datetime import datetime, timedelta
from urllib.request import Request
from urllib.request import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm

In [None]:
def get_url(query, start_date, end_date, start_page):
    """Build the Naver news-search request for a keyword and a date range.

    Args:
        query: Search keyword; it is percent-encoded before being put in the URL.
        start_date: Value of the ``ds`` query parameter (start of the date range).
        end_date: Value of the ``de`` query parameter (end of the date range).
        start_page: Value of the ``start`` query parameter. Strings and ints are
            both accepted because the value is interpolated, not concatenated.

    Returns:
        urllib.request.Request: Request whose ``full_url`` is the search URL.
    """
    encoded_query = urllib.parse.quote(query)
    # Each part is interpolated directly, so no later str.format pass runs over
    # the caller-supplied values (braces in them can no longer break the URL).
    url = (
        "https://search.naver.com/search.naver?where=news"
        f"&query={encoded_query}"
        "&sm=tab_opt&sort=0&photo=0&field=0&pd=3"
        f"&ds={start_date}&de={end_date}&start={start_page}"
    )
    return Request(url)

In [None]:
def set_selenium_opt():
    """Start the Chrome WebDriver used for crawling.

    Returns:
        The ``webdriver.Chrome`` instance created with the flags listed below.
    """
    # Command-line flags, applied in this order.
    flags = (
        '--headless',  # do not show a browser window
        "--disable-extensions",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--dns-prefetch-disable",
        "--disable-logging",
        "--disable-background-networking",
        "--disable-default-apps",
    )

    opts = Options()
    for flag in flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

In [None]:
def get_page_html(driver, url):
    """Load ``url`` in ``driver`` and return the page's full HTML source.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the page to load.

    Returns:
        The value of ``driver.page_source`` once a ``<title>`` element is present.
    """
    driver.get(url)

    # Wait up to 10 seconds for a <title> element before reading the source.
    title_locator = (By.TAG_NAME, 'title')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(title_locator))

    return driver.page_source


def filter_naver_links(links):
    """Keep only the links that point at Naver News.

    Args:
        links: Iterable of URL strings.

    Returns:
        list: The links that contain ``'news.naver.com'``, in their original order.
    """
    naver_links = []
    for candidate in links:
        if 'news.naver.com' in candidate:
            naver_links.append(candidate)
    return naver_links

In [None]:
def do_link_crawling(driver, url):
    """Collect every hyperlink from one Naver news search-result page.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Object whose ``full_url`` attribute is the page to load (``get_url``
            returns such an object).

    Returns:
        list: ``href`` values of the anchors found inside the ``li.bx`` items of
        the ``ul.list_news`` element, or an empty list when that element is
        missing from the page.
    """
    # Reload the page until the driver hands back a non-empty HTML source.
    while True:
        html_content = get_page_html(driver, url.full_url)

        if html_content:
            break
        print("Warning!!!")
        time.sleep(1)

    # Parse the page with BeautifulSoup.
    soup = BeautifulSoup(html_content, "html.parser")
    group = soup.find("ul", "list_news")

    if not group:
        return []

    # href=True skips anchors that have no href attribute; indexing such a tag
    # with ["href"] raises KeyError and would abort the whole crawl.
    return [
        a_tag["href"]
        for li_tag in group.find_all("li", "bx")
        for a_tag in li_tag.find_all("a", href=True)
    ]

In [None]:
def do_article_crawling(driver, url):
    """Visit each article link and extract its timestamp, title and body text.

    Args:
        driver: Selenium WebDriver used to load the articles.
        url: Iterable of article URLs (despite the singular name).

    Returns:
        pandas.DataFrame: One row per input URL with the columns ``date``,
        ``title`` and ``text``. A row is all-NaN when the page did not show an
        ``<h2>`` within 10 seconds or when any of the three elements is missing.
    """
    date = []
    title = []
    text = []

    for row in url:
        try:
            driver.get(row)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "h2")))

            # Read all three fields before appending anything, so a failure on a
            # later field cannot leave the three lists with different lengths.
            article_date = driver.find_element(
                By.CLASS_NAME, "media_end_head_info_datestamp_time._ARTICLE_DATE_TIME"
            ).get_attribute("data-date-time")
            article_title = driver.find_element(By.ID, "title_area").get_attribute("innerText")
            article_text = driver.find_element(By.ID, "dic_area").get_attribute("innerText")
        except (NoSuchElementException, TimeoutException):
            # The page layout differs from the expected one, or the wait timed
            # out: record the article as missing instead of stopping the crawl.
            # np.nan is used because the np.NAN alias was removed in NumPy 2.0.
            article_date = article_title = article_text = np.nan

        date.append(article_date)
        title.append(article_title)
        text.append(article_text)

    return pd.DataFrame({"date" : date, "title" : title, "text" : text})

In [None]:
def get_refined_article(article):
    """Deduplicate crawled articles and strip unwanted characters.

    Steps:
        1. Drop rows that repeat an earlier (``date``, ``text``) pair.
        2. Drop rows whose ``text`` is missing or is the literal string ``"nan"``.
        3. In ``title``, replace every character other than ASCII letters, digits
           and Hangul syllables with a space; in ``text`` periods are kept too.
           Runs of spaces are then collapsed to a single space.

    Args:
        article: DataFrame with ``date``, ``title`` and ``text`` columns. It is
            not modified.

    Returns:
        pandas.DataFrame: A new frame holding the cleaned rows; the original
        index labels are kept.
    """
    def _clean(value, pattern):
        # Non-string cells (e.g. a NaN title) are passed through untouched;
        # re.sub would raise TypeError on them.
        if not isinstance(value, str):
            return value
        return re.sub(" +", " ", re.sub(pattern, " ", value))

    # Remove duplicated articles.
    deduped = article.drop_duplicates(["date", "text"], keep="first")

    # Remove articles without body text. Selecting the rows to keep and copying
    # them avoids assigning into / in-place editing a slice of another frame.
    has_text = deduped["text"].notna() & (deduped["text"] != "nan")
    refined = deduped.loc[has_text].copy()

    # Title: keep letters, digits and spaces only.
    refined["title"] = refined["title"].apply(lambda x: _clean(x, "[^A-Za-z0-9가-힣]"))

    # Body: keep letters, digits, spaces and periods.
    refined["text"] = refined["text"].apply(lambda x: _clean(x, "[^A-Za-z0-9가-힣.]"))

    return refined

In [None]:
def main(year, month, term, topic):
    """Crawl one calendar month of Naver news articles for a topic.

    The search is repeated for a growing date range: every range starts on the
    first day of the month and ends ``term`` days later than the previous one.
    A final range ending on the last day of the month is added when the step
    does not land on it, so the end of the month is always queried. For each
    range, result offsets 1, 11, ..., 91 are requested.

    Args:
        year: Year to crawl.
        month: Month to crawl (integer, 1-12).
        term: Step, in days, between the end dates of consecutive searches.
        topic: Search keyword.

    Returns:
        pandas.DataFrame: Cleaned articles as produced by ``get_refined_article``.
        Nothing is written to disk; the caller may persist the frame.

    Raises:
        ValueError: If ``term`` is not positive, or ``year``/``month`` do not
            form a valid date.
    """
    if term <= 0:
        # A non-positive step would never advance the end date (endless loop).
        raise ValueError(f"term must be a positive number of days, got {term}")

    start_date = datetime.strptime(f"{year}.{month:02d}.01", "%Y.%m.%d")
    # Last day of the month: day 28 plus 4 days always falls in the next month;
    # snapping to its first day and stepping back one day is correct for every
    # month length, including February in leap years.
    end_date = (start_date.replace(day=28) + timedelta(days=4)).replace(day=1) - timedelta(days=1)

    print(f"Crawling {start_date} to {end_date}")

    # End dates of the successive searches.
    range_ends = []
    current_date = start_date
    while current_date <= end_date:
        range_ends.append(current_date)
        current_date += timedelta(days=term)
    if range_ends[-1] != end_date:
        range_ends.append(end_date)

    search_from = start_date.strftime("%Y.%m.%d")
    link_list = []

    driver = set_selenium_opt()
    try:
        for range_end in range_ends:
            search_to = range_end.strftime("%Y.%m.%d")

            for start_page in range(1, 101, 10):
                url = get_url(f"{topic}", search_from, search_to, str(start_page))
                link_list.extend(do_link_crawling(driver, url))

        # Keep Naver News links only.
        naver_link_list = filter_naver_links(link_list)

        # Crawl the article details behind the collected links.
        news_info = do_article_crawling(driver, naver_link_list)
    finally:
        # Always shut the browser down, even when crawling fails midway.
        driver.quit()

    # Remove special or otherwise unwanted characters.
    refined_news_info = get_refined_article(news_info)

    # refined_news_info.to_csv(f"../news_crawling/data/{topic.replace(' ', '')}/{topic.replace(' ', '')}_{year}_{month:02d}.csv")
    return refined_news_info


In [None]:
if __name__ == "__main__":
    # Crawl settings: year to crawl, step in days between searches, keyword.
    year = 2023
    term = 10
    topic = '전기차'

    # Output directory for this topic (spaces removed from the keyword).
    directory_path = f"../news_crawling/data/{topic.replace(' ', '')}"
    try:
        # makedirs also creates missing parent directories; os.mkdir raised
        # FileNotFoundError when "../news_crawling/data" did not exist yet.
        os.makedirs(directory_path)
    except FileExistsError:
        print(f"The directory '{directory_path}' already exists.")

    # NOTE(review): range(1, 12) covers months 1-11 only, so December is never
    # crawled. Confirm whether that is intentional; use range(1, 13) otherwise.
    for month in range(1, 12):
        main(year, month, term, topic)