In [1]:
from __future__ import annotations

import bs4
# Tested in Jupyter.

!pip install selenium
!pip install nest_asyncio
!pip install tqdm
!pip install aiohttp
!pip install bs4



In [2]:
import re
import asyncio
from bs4 import BeautifulSoup
import aiohttp
import csv
import selenium as se
import csv
import os
try:
    from tqdm import tqdm
except ImportError:  # ModuleNotFoundError is a subclass of ImportError
    def tqdm(iterable, *args, **kwargs):
        """
        Fallback used when tqdm is not installed: return the iterable unchanged.

        Extra positional/keyword arguments (e.g. ``desc=``, ``total=``) are accepted
        and ignored so call sites written for the real tqdm keep working.
        :param iterable: the iterable that would have been wrapped in a progress bar
        :return: ``iterable`` itself
        """
        return iterable
# using selenium to get the page source
from selenium import webdriver
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement
import nest_asyncio

In [3]:
# Search page of the Kookbang news site; KookbangPageManager.reset() loads this URL.
SEARCH_BASE_LINK = r'https://kookbang.dema.mil.kr/newsWeb/search.do'
# Categories to allow: KookbangPageManager.find_category_elems() only yields category
# links whose visible text is one of these
# (roughly: 'national defense', 'special series', 'weapons encyclopedia').
CATEGORIES = {
    '국방', '기획연재', '무기백과'
}
# Chosun search URL template; ChosunMillitaryPageManager.search() fills in ``{kwd}``.
CHOSUN_SEARCH_BASE_LINK = 'https://www.chosun.com/nsearch/?query={kwd}&opt_chk=true&sort=1&siteid=bemil%2Cbemil_news'

In [4]:
# Start the Chrome session (`page`) that the page managers below drive.
# The two helper imports are kept local so this cell is self-contained.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

try:
    # Let Selenium locate a matching chromedriver on its own.
    page = webdriver.Chrome()
except WebDriverException:
    # Fall back to a chromedriver.exe located in the same directory as the notebook.
    # Selenium 4 takes the driver path through a Service object, not as a positional
    # argument. Only driver errors are caught, so Ctrl-C and unrelated bugs still surface.
    page = webdriver.Chrome(service=Service(executable_path='chromedriver.exe'))

In [5]:
# Standard way to find an element's XPath for the page managers below:
# open the browser dev tools (F12), select the element, then copy its XPath.
#
# nest_asyncio is applied before any run_until_complete() call further down
# (AsyncJupyterChecker notes that run_until_complete is otherwise not allowed in a notebook).
nest_asyncio.apply()

In [6]:
from selenium.common.exceptions import NoSuchElementException
class PageManager:
    """
    Common base for the site-specific page managers.

    Subclasses add ``search(kwd)`` and a ``get_result_genexpr(kwd)``-like generator whose
    purpose is to produce links that can later be parsed with BeautifulSoup. This base
    class only wraps the element-lookup primitives of the driver.
    """

    def __init__(self, driver: WebDriver):
        """
        :type driver: WebDriver
        :param driver: Chrome driver, i.e. the browser 'process' this manager controls
        """
        self.page = driver

    def reset(self):
        """
        Navigate to ``SEARCH_BASE_LINK`` when this object defines one; otherwise do nothing.
        :return: None
        """
        if not hasattr(self, 'SEARCH_BASE_LINK'):
            return
        self.page.get(self.SEARCH_BASE_LINK)

    def get_element_by_xpath(self, xpath):
        """
        Look an element up by XPath. A lookup error raised by the driver is not caught here.
        :param xpath: str(raw)
        :return: element
        """
        located = self.page.find_element('xpath', xpath)
        return located

    def get_element_and_click(self, xpath):
        """
        Look an element up by XPath and click it.
        :param xpath: str(raw)
        :return: element
        """
        target = self.get_element_by_xpath(xpath)
        target.click()
        return target

    def get_element_and_send_keys(self, xpath, keys):
        """
        Look an element up by XPath and type ``keys`` into it (mainly the search box).
        :param xpath: str(raw)
        :param keys: str(raw)
        :return: element
        """
        field = self.get_element_by_xpath(xpath)
        field.send_keys(keys)
        return field

    def get_element_by_selector(self, selector):
        """
        Look an element up by CSS selector.
        :param selector: str(raw)
        :return: element, or None when the driver reports NoSuchElementException
        """
        try:
            found = self.page.find_element('css selector', selector)
        except NoSuchElementException:
            found = None
        return found

class KookbangPageManager(PageManager):
    """
    Page manager for the Kookbang news search page (``SEARCH_BASE_LINK``).

    Fills in the search form, walks the category links whose text is in ``CATEGORIES``
    and yields the article links found on each result page.
    """
    WAIT_TIME = 1
    # Highest ``li:nth-child`` position probed when looking for category links.
    MAX_CATEGORY_POSITION = 8
    # A single result page may list up to 15 news items.
    NEWS_PER_PAGE = 15
    # Number of result pages visited per category when ``maxidx`` is -1.
    DEFAULT_MAX_PAGES = 100

    def __init__(self, driver: WebDriver):
        """
        :param driver: Chrome driver; the search page is loaded immediately
        """
        super().__init__(driver)
        self.reset()

    def reset(self):
        """Load the Kookbang search page."""
        self.page.get(SEARCH_BASE_LINK)

    def search(self, keyword: str):
        """
        Type ``keyword`` into the search box (#kwd) and press the search button.
        :param keyword: search term
        """
        self.get_element_and_send_keys('//*[@id="kwd"]', keyword)
        self.get_element_and_click('//*[@id="container"]/div[2]/div/form/div/div[2]/button')
        # NOTE: implicitly_wait() configures the driver's element-lookup timeout;
        # it does not pause by itself.
        self.page.implicitly_wait(self.WAIT_TIME)

    def find_category_elems(self) -> [WebElement]:
        """
        Yield the category links whose visible text is in ``CATEGORIES``.

        This is a generator; each link is looked up lazily, right before it is yielded.
        CSS ``:nth-child`` positions start at 1, so position 0 is not probed (it can never
        match and would only cost one lookup timeout).
        :return: generator of WebElement
        """
        for position in range(1, self.MAX_CATEGORY_POSITION + 1):
            selector = f'#container > div.full_search_box > div > ul > li:nth-child({position}) > a'
            elem = self.get_element_by_selector(selector)
            if elem is not None and elem.text in CATEGORIES:
                yield elem

    def get_list_of_news_category(self, maxidx:int = -1):
        """
        Yield the news link elements of the currently selected category, page by page.

        :param maxidx: maximum number of result pages to visit; -1 means up to
                       ``DEFAULT_MAX_PAGES`` pages
        :return: generator of WebElement (the ``<a>`` of each news item)
        """
        max_pages = maxidx if maxidx != -1 else self.DEFAULT_MAX_PAGES
        for page_no in range(max_pages):
            if page_no != 0:
                # Pagination anchors: nth-child(3) is page 1, nth-child(4) is page 2, ...
                page_selector = f"#container > div.full_search_box > div > div.pagination > a:nth-child({page_no + 3})"
                page_elem = self.get_element_by_selector(page_selector)
                # Stop when there is no such anchor or it is the "next page" button.
                if page_elem is None or page_elem.get_attribute('title') == '다음페이지':
                    break
                page_elem.click()
                self.page.implicitly_wait(self.WAIT_TIME)
            # nth-child positions are 1-based: probe 1..NEWS_PER_PAGE.
            for position in range(1, self.NEWS_PER_PAGE + 1):
                selector = f'#container > div.full_search_box > div > div.box > ul > li:nth-child({position}) > a'
                elem = self.get_element_by_selector(selector)
                if elem is None:
                    continue
                yield elem

    def get_news_genexpr(self, keyword : str, maxidx:int = -1):
        """
        Get news generator expression.
        :param keyword: Keyword to search
        :param maxidx: Maximum number of result pages to visit per category. -1 for the
                       default cap (``DEFAULT_MAX_PAGES``).
        :return: Generator object that returns news link (the ``href`` of each item).
        """
        self.reset()
        self.search(keyword)
        for category_button in self.find_category_elems():
            category_button.click()
            self.page.implicitly_wait(self.WAIT_TIME)
            for news in self.get_list_of_news_category(maxidx):
                yield news.get_attribute('href')

class ChosunMillitaryPageManager(PageManager):
    """
    Page manager for the Chosun search (``CHOSUN_SEARCH_BASE_LINK``).

    Opens the search result pages for a keyword and yields the result links, skipping
    links whose ``bbs_id`` is listed in ``BANNED_BBSID``.
    """
    WAIT_TIME = 3
    PER_PAGE = 10  # 10 news per page.
    # bbs_id values whose links are not yielded.
    BANNED_BBSID = {
        10044,
        10040,
        10129,
        10046,
        10037
    }

    def __init__(self, driver: WebDriver):
        """
        :param driver: Chrome driver
        """
        super().__init__(driver)
        self.reset()

    def reset(self):
        """Nothing to reset: every search navigates to a full URL."""
        pass

    @staticmethod
    def _quote(keyword: str) -> str:
        """Percent-encode ``keyword`` so spaces, '&', '#' etc. cannot break the query string."""
        from urllib.parse import quote
        return quote(keyword, safe='')

    def search(self, keyword: str):
        """
        Open the first search result page for ``keyword``.
        :param keyword: search term (URL-encoded here)
        """
        self.page.get(CHOSUN_SEARCH_BASE_LINK.format(kwd=self._quote(keyword)))
        self.page.implicitly_wait(self.WAIT_TIME)
        self.page.maximize_window()

    def get_page_count(self):
        """
        Read the result counter ("N건") shown on the search page.

        Despite the name, the value is used by ``get_news_genexpr`` as a number of
        results, which it then converts to pages.
        :return: int, 0 when the counter element or the number cannot be found
        """
        elem = self.get_element_by_selector('#main > div.search-option > div.flex.flex--justify-space-between.flex--align-items-center.box--pad-bottom-sm.box--border.box--border-horizontal.box--border-horizontal-bottom > div:nth-child(1) > p')
        if elem is None:
            return 0
        # Accept thousands separators ("1,234건") instead of reading only the last group.
        m = re.search(r'(\d[\d,]*)건', elem.text)
        if m is None:
            return 0
        return int(m.group(1).replace(',', ''))

    def get_page_for_kwd(self, kwd: str, subidx: int):
        """
        Open result page ``subidx`` for ``kwd``; ``subidx == 0`` leaves the browser as is.
        :param kwd: search term (URL-encoded here)
        :param subidx: page number
        """
        if subidx == 0:
            return
        sub_search_page = f'https://www.chosun.com/nsearch/?query={self._quote(kwd)}&page={subidx}&siteid=bemil,bemil_news&sort=1'
        self.page.get(sub_search_page)
        self.page.implicitly_wait(self.WAIT_TIME)

    def get_news_subidx(self, kwd: str, subidx: int):
        """
        Yield the result links of page ``subidx``.

        Result slots that do not exist (e.g. on a short last page) are skipped instead of
        aborting the generator; links with a banned ``bbs_id`` are skipped as well.
        :param kwd: search term
        :param subidx: page number
        :return: generator of href strings
        """
        self.get_page_for_kwd(kwd, subidx)
        for idx in range(1, 11):
            xpath = f'//*[@id="main"]/div[4]/div[{idx}]/div/div[1]/div[2]/div[1]/div/a'
            try:
                # get_element_by_xpath raises rather than returning None.
                elem = self.get_element_by_xpath(xpath)
            except NoSuchElementException:
                continue
            href = elem.get_attribute('href')
            if not href:
                continue
            m = re.search(r'bbs_id=(\d+)', href)
            # A link without bbs_id cannot be on the ban list, so it is kept.
            if m is not None and int(m.group(1)) in self.BANNED_BBSID:
                continue
            yield href

    def get_news_genexpr(self, keyword: str, maxidx: int = -1):
        """
        Search ``keyword`` and yield result links page by page.
        :param keyword: search term
        :param maxidx: maximum number of result pages; values <= 0 mean no cap
        :return: generator of href strings
        """
        self.search(keyword)
        result_count = self.get_page_count()
        # Ceiling division so a final, partially filled page is visited too.
        total_pages = -(-result_count // self.PER_PAGE)
        if total_pages > maxidx > 0:
            total_pages = maxidx
        for subidx in range(1, total_pages + 1):
            yield from self.get_news_subidx(keyword, subidx)

In [7]:
import requests
# We can ignore empty fields
# just join at http://www.riss.kr/search/Search.do? with &
# isDetailSearch : N
# searchGubun : true
# viewYn : OP
# strQuery : $keyword.replace(' ', '+')
# order : /DESC
# onHanja : false
# strSort : RANK
# iStartCount : $pageScale * ($pageNumber - 1)
# fsearchMethod : search
# sflag : 1
# isFDetailSearch : N
# pageNumber : $page
# resultKeyword : $keyword.replace(' ', '+')
# icate : bib_t
# colName : bib_t | re_a_kor - if academic, use bib_t
# pageScale : $pageScale oneOf(10, 100)
# isTab : Y
# query : $keyword.replace(' ', '+')

class WriteableRecord:
    """A row of values that can write itself to a csv writer."""

    def __init__(self, *args, **kwargs):
        """
        usage : WriteableRecord(*[0,1,2,3]).write_to_csv(csv_writer)
                WriteableRecord([0,1,2,3]).write_to_csv(csv_writer)  # same row
        :param args: cell values, or a single list/tuple holding the cell values
        :param kwargs: extra cell values, appended after ``args`` in keyword order
        """
        self.listed_args = args
        self.dicted_args = kwargs

    def write_to_csv(self, csv_writer: csv.writer):
        """
        Write the record as ONE row: positional values first, then keyword values.

        ``writerow`` takes a single sequence, so the row is passed as one list (it used to
        be splatted into separate arguments, which only worked for a single list argument).
        :param csv_writer: object returned by ``csv.writer``
        """
        cells = self.listed_args
        # Keep supporting WriteableRecord(some_list): a lone list/tuple IS the row.
        if len(cells) == 1 and isinstance(cells[0], (list, tuple)):
            cells = cells[0]
        csv_writer.writerow(list(cells) + list(self.dicted_args.values()))

class RISSSearchManager(PageManager):
    """
    RISS Search Manager

    Result pages are fetched with ``requests`` and parsed with BeautifulSoup; selenium is
    not used for the search itself (the driver is only stored by the base class).
    """
    SEARCH_BASE_LINK = r'http://www.riss.kr/index.do'
    WAIT_TIME = 3
    # Seconds before an HTTP request is abandoned, so a stalled server cannot hang the notebook.
    REQUEST_TIMEOUT = 30

    def __init__(self, driver: WebDriver):
        """
        :param driver: Chrome driver, stored by the base class
        """
        super().__init__(driver)

    def wait(self):
        """Set the driver's implicit wait to ``WAIT_TIME`` seconds."""
        self.page.implicitly_wait(RISSSearchManager.WAIT_TIME)

    def get_page_for_search(self, keyword:str, is_academic:bool=False, page_scale:int=10, page_idx:int=1):
        """
        Download one search result page and parse it.
        :param keyword: search term; spaces are replaced by '+'
        :param is_academic: selects colName 'bib_t' when True, 're_a_kor' otherwise
        :param page_scale: results per page
        :param page_idx: 1-based page number
        :return: BeautifulSoup of the result page
        :raises Exception: when the HTTP status is not 200
        """
        _query = keyword.replace(' ', '+')
        _start_count = page_scale * (page_idx - 1)
        _col_name = 'bib_t' if is_academic else 're_a_kor'
        _page_scale = page_scale
        _page_number = page_idx
        search_page = fr'http://www.riss.kr/search/Search.do?isDetailSearch=N&searchGubun=true&viewYn=OP&query={_query}&queryText=&iStartCount={_start_count}&iGroupView=5&icate=all&colName={_col_name}&exQuery=&exQueryText=&order=%2FDESC&onHanja=false&strSort=RANK&pageScale={_page_scale}&orderBy=&fsearchMethod=search&isFDetailSearch=N&sflag=1&searchQuery={_query}&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&resultKeyword={_query}&pageNumber={_page_number}&p_year1=&p_year2=&dorg_storage=&mat_type=&mat_subtype=&fulltext_kind=&t_gubun=&learning_type=&language_code=&ccl_code=&language=&inside_outside=&fric_yn=&image_yn=&regnm=&gubun=&kdc=&ttsUseYn='
        request = requests.get(search_page, timeout=self.REQUEST_TIMEOUT)
        if request.status_code != 200:
            raise Exception('Request failed, status code : ' + str(request.status_code) + ', url : ' + search_page)
        return BeautifulSoup(request.text, 'html.parser')

    def find_result_number(self, soup:BeautifulSoup):
        """
        Find the total number of results on a result page.
        :param soup: parsed result page
        :return: value of the first ``<span class="num">`` that is a number
                 (commas removed), or 0 when there is none
        """
        for candidate in soup.find_all('span', class_='num'):
            digits = candidate.getText().replace(',', '')
            if digits.isdigit():
                return int(digits)
        return 0

    def search_and_parse(self, keyword:str, is_academic:bool=False, page_scale:int=10, max_results:int=1000):
        """
        Run a search and collect the parsed results.
        :return: list of records, see ``search_and_parse_genexpr``
        """
        return list(self.search_and_parse_genexpr(keyword, is_academic, page_scale, max_results))

    def search_and_parse_genexpr(self, keyword:str, is_academic:bool=False, page_scale:int=10, max_results:int=1000):
        """
        Run a search and yield the parsed results one by one.
        :param keyword: search term
        :param is_academic: forwarded to ``get_page_for_search``
        :param page_scale: results per page
        :param max_results: stop after this many results
        :return: generator of [title, writer, year, degree, abstract] lists
        """
        first_page = self.get_page_for_search(keyword, is_academic, page_scale, 1)
        total_expected_pages = self.get_page_count(first_page, page_scale)
        # Ceiling division: max_results=5 with page_scale=10 still needs one page.
        pages_needed = -(-max_results // page_scale)
        total_expected_pages = min(total_expected_pages, pages_needed)
        yielded = 0
        for page_idx in tqdm(range(1, total_expected_pages + 1)):
            # Page 1 is already downloaded; do not request it a second time.
            soup = first_page if page_idx == 1 else self.get_page_for_search(keyword, is_academic, page_scale, page_idx)
            # Iterate over the entries actually present: the last page may hold fewer
            # than page_scale results.
            for item in self._result_items(soup)[:page_scale]:
                yield self._parse_item(item)
                yielded += 1
                if yielded >= max_results:
                    return

    def save_search(self, keyword:str, is_academic:bool=False, page_scale:int=10, max_results:int=1000,
                    filename:str='result.csv', encoding:str='utf-8'):
        """
        Run a search and write every result as one CSV row.
        :param filename: output path; must not exist yet
        :param encoding: text encoding of the output file
        :return: None
        :raises FileExistsError: when ``filename`` already exists
        """
        if os.path.exists(filename):
            raise FileExistsError(f'File {filename} already exists')
        with open(filename, 'w', encoding=encoding, newline='') as f:
            csv_writer = csv.writer(f)
            for _result in self.search_and_parse_genexpr(keyword, is_academic, page_scale, max_results):
                WriteableRecord(_result).write_to_csv(csv_writer)

    def get_page_count(self, soup:BeautifulSoup, page_scale:int=10):
        """
        :param soup: parsed result page
        :param page_scale: results per page
        :return: number of pages needed for all results (rounded up)
        """
        total_results = self.find_result_number(soup)
        full_pages, remainder = divmod(total_results, page_scale)
        return full_pages + 1 if remainder != 0 else full_pages

    @staticmethod
    def _result_items(result_soup:BeautifulSoup):
        """Return the ``<li>`` result entries (those holding a ``div.cont``) of a page."""
        container = result_soup.find('div', class_='srchResultListW')
        if container is None:
            return []
        return [li for li in container.find_all('li') if li.find('div', class_='cont') is not None]

    @staticmethod
    def _parse_item(item):
        """Turn one result entry into [title, writer, year, degree, abstract]."""
        title = item.find('p', class_='title')
        writer = item.find('span', class_='writer')
        # year and degree are the 3rd and 4th <span> of <p class="etc">
        etc = item.find('p', class_='etc')
        spans = etc.find_all('span') if etc is not None else []
        year = spans[2] if len(spans) > 2 else None
        degree = spans[3] if len(spans) > 3 else None
        abstract = item.find('p', class_='preAbstract')
        # A missing field becomes '' (instead of being dropped) so CSV columns stay aligned.
        return [field.getText() if field is not None else ''
                for field in (title, writer, year, degree, abstract)]

    @staticmethod
    def parse_result_list(result_soup:BeautifulSoup, list_idx:int=0):
        """
        Parse the ``list_idx``-th result entry of a result page.
        :param result_soup: parsed result page
        :param list_idx: 0-based position of the entry on the page
        :return: [title, writer, year, degree, abstract]; missing fields are ''
        :raises IndexError: when the page has no entry at ``list_idx``
        """
        entries = RISSSearchManager._result_items(result_soup)
        return RISSSearchManager._parse_item(entries[list_idx])


In [8]:
# Runs a RISS search and writes the rows to save_search()'s default file, 'result.csv'.
# save_search() has no return statement, so `result` is None, and it raises when
# 'result.csv' already exists - remove that file before re-running this cell.
result = RISSSearchManager(page).save_search(keyword='간호 AND 안전', is_academic=True, page_scale=10, max_results=5)

  6%|▌         | 9/147 [00:34<08:52,  3.86s/it]


In [9]:
class AsyncJupyterChecker:
    """
    Thin proxy around an asyncio event loop.

    ``run_until_complete`` replaces a closed loop with a fresh one before running;
    every other attribute is forwarded to the wrapped loop. The notebook calls
    ``nest_asyncio.apply()`` earlier, because run_until_complete is otherwise not
    allowed inside a Jupyter notebook.
    """

    def __init__(self, loop: asyncio.AbstractEventLoop):
        """
        :param loop: event loop to wrap
        """
        self.loop = loop

    def run_until_complete(self, coroutine):
        """
        Run ``coroutine`` to completion on the wrapped loop and return its result.
        :param coroutine: coroutine (or future) to run
        :return: the coroutine's result
        """
        if self.loop.is_closed():
            self.loop = asyncio.new_event_loop()
        # run_until_complete() wraps a coroutine in a task itself, so the former
        # "create_task first" branch was equivalent to this single call.
        return self.loop.run_until_complete(coroutine)

    @staticmethod
    def is_jupyter():
        """
        :return: True when the code runs inside an IPython shell (which includes Jupyter
                 kernels); False when IPython is missing or merely installed
        """
        try:
            from IPython import get_ipython
        except ImportError:
            return False
        return get_ipython() is not None

    def __getattr__(self, item):
        # Only reached for names not found normally, i.e. never for run_until_complete.
        # Guard 'loop' itself: without it, an instance lacking the attribute would
        # recurse forever.
        if item == 'loop':
            raise AttributeError(item)
        return getattr(self.loop, item)

class NewsData:
    """One crawled article, stored as (date, title, content, url)."""

    def __init__(self, date: str, title: str, content: str, url: str):
        self.date, self.title, self.content, self.url = date, title, content, url

    def write_to_csv(self, csv_writer: csv.writer):
        """Write this record through ``csv_writer`` as one row: date, title, content, url."""
        row = [getattr(self, field) for field in ('date', 'title', 'content', 'url')]
        csv_writer.writerow(row)



class PageParser:
    """
    Base parser: url -> NewsData.

    Downloads ``url`` with aiohttp and hands the HTML to ``parse``, which subclasses
    override. The event loop wrapper and the semaphore are shared by all parsers.
    """
    ASYNC_LOOP = AsyncJupyterChecker(asyncio.get_event_loop())  # we will use this loop for async
    semaphores = asyncio.Semaphore(10)  # only allow 10 concurrent requests

    def __init__(self, url):
        """
        :param url: address of the page to download and parse
        """
        if PageParser.ASYNC_LOOP.is_closed():
            PageParser.ASYNC_LOOP = AsyncJupyterChecker(asyncio.new_event_loop())
        self.url = url
        self.content = None

    def parse(self, text: str) -> NewsData | None:
        """
        Turn the downloaded HTML into a NewsData. The base implementation returns None.
        :param text: HTML of the page
        :return: NewsData, or None when the page cannot be parsed
        """
        return None

    async def get_html(self) -> str:
        """
        Download ``self.url``.
        :return: response body as text
        :raises ConnectionError: when the HTTP status is not 200
        """
        async with self.semaphores:
            async with aiohttp.ClientSession() as session:
                async with session.get(self.url) as response:
                    if response.status != 200:
                        raise ConnectionError(f'HTTP {response.status} while fetching {self.url}')
                    return await response.text()

    async def get_content(self) -> NewsData:
        """Download the page and parse it."""
        text = await self.get_html()
        return self.parse(text)

    async def _get_content_or_none(self) -> NewsData | None:
        """Like ``get_content``, but report a failed download and return None instead of raising."""
        try:
            return await self.get_content()
        except (aiohttp.ClientError, ConnectionError, asyncio.TimeoutError) as exc:
            print(f'Error fetching {self.url}: {exc!r}')
            return None

    def get_content_sync(self) -> NewsData:
        """Blocking variant of ``get_content``; download errors are raised."""
        return self.ASYNC_LOOP.run_until_complete(self.get_content())

    @classmethod
    def get_content_sync_from_url(cls, url: str) -> NewsData:
        """Download and parse a single ``url``, blocking until done."""
        return cls(url).get_content_sync()

    @classmethod
    def get_content_sync_from_urls(cls, urls: [str]) -> [NewsData]:
        """Generator: download and parse ``urls`` one after another."""
        for url in urls:
            yield cls.get_content_sync_from_url(url)

    @classmethod
    async def get_content_async_from_urls(cls, urls: [str]) -> [NewsData]:
        """
        Download and parse all ``urls`` concurrently.

        A URL whose download fails is reported and contributes None, so one bad link does
        not discard the rest of the batch. Results are in the order of ``urls``.
        :param urls: iterable of page addresses
        :return: list with one NewsData or None per url
        """
        jobs = [cls(url)._get_content_or_none() for url in urls]
        return list(await asyncio.gather(*jobs))

    @classmethod
    def get_contents_from_urls(cls, urls: [str]) -> [NewsData]:
        """Blocking wrapper around ``get_content_async_from_urls``."""
        batch = cls.get_content_async_from_urls(urls)
        return cls.ASYNC_LOOP.run_until_complete(batch)

class KookbangPageParser(PageParser):
    """
    Parses kookbang page
    """
    def __init__(self, url):
        """
        :param url: article address, e.g.
                    https://kookbang.dema.mil.kr/newsWeb/20230411/4/ATCE_CTGR_0010010000/view.do
                    from which the date (20230411) and the index (4) are taken
        """
        super().__init__(url)
        segments = url.split('/')
        self.date = segments[4]
        self.index = segments[5]

    def parse(self, text: str) -> NewsData | None:
        """
        Build a NewsData from the page's Open Graph meta tags.

        title   : <meta property="og:title" content="$content">
        content : every <meta property="og:description" content="$content">, concatenated
        The date always comes from the url, even if the content was edited later.
        :param text: HTML of the article page
        :return: NewsData, or None when no usable og:title is present
        """
        soup = BeautifulSoup(text, 'html.parser')
        title_tag = soup.find('meta', property='og:title')
        # .get() so a meta tag without a content attribute is reported, not a KeyError.
        title = title_tag.get('content') if title_tag is not None else None
        if title is None:
            print(f'Error parsing {self.url}')
            return None
        # find_all() returns an empty result (never None) when nothing matches.
        descriptions = soup.find_all('meta', property='og:description')
        merged_string = ''.join(tag.get('content', '') for tag in descriptions)
        return NewsData(self.date, title, merged_string, self.url)

class ChosunPageParser(PageParser):
    """
    Parses chosun page
    """
    def __init__(self, url):
        """
        :param url: article address; the date is filled in by ``parse``
        """
        super().__init__(url)
        self.date = None

    def parse(self, text: str) -> NewsData | None:
        """
        Build a NewsData from a Chosun board page.

        title   : <div class="conSubject">$content</div>
        date    : second space-separated token of <div class="wdate"> (text is "입력 $date")
        content : text of the children of <div class="board_body">, commas removed, up to
                  the first child containing 'https://' or '대표 이미지'
        :param text: HTML of the article page
        :return: NewsData, or None when the title or the date element is missing
        """
        soup = BeautifulSoup(text, 'html.parser')
        title_tag = soup.find('div', class_='conSubject')
        if title_tag is None:
            print(f'Error parsing {self.url}')
            return None
        date_tag = soup.find('div', class_='wdate')
        if date_tag is None:
            print(f'Error parsing {self.url} (date)')
            return None
        title = title_tag.get_text(strip=True)
        date_text = date_tag.get_text(strip=True)
        date_parts = date_text.split(' ')
        # Keep the whole text when there is no space to split on, instead of an IndexError.
        self.date = date_parts[1] if len(date_parts) > 1 else date_text
        body = soup.find('div', class_='board_body')
        pieces = []
        if body is not None:
            for child in body.contents:
                chunk = child.get_text(strip=True)
                chunk = chunk.replace(",", "")
                if 'https://' in chunk or '대표 이미지' in chunk:
                    break
                if chunk not in ['\n', '', ' ', '\t']:
                    pieces.append(chunk)
        return NewsData(self.date, title, ''.join(pieces), self.url)

In [10]:
# example url
# example_url = 'https://kookbang.dema.mil.kr/newsWeb/20230417/2/ATCE_CTGR_0010040000/view.do'
# usage -> PageParser(url).get_content_sync()
# test_parser = PageParser(example_url)
# content = test_parser.get_content_sync()
# print(content.content)
def crawl_kookbang(kwd, max_page:int = -1):
    """
    Search the Kookbang site for ``kwd`` and save the articles to ``kookbang{kwd}.csv``.

    The file is written as UTF-8, like the one produced by ``crawl_chosun``, so Korean
    text does not depend on the platform's default encoding.
    :param kwd: keyword to search; also part of the output file name
    :param max_page: forwarded to ``KookbangPageManager.get_news_genexpr`` as ``maxidx``
    """
    with open(f'kookbang{kwd}.csv', 'w', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(['date', 'title', 'content', 'url'])
        page_manager = KookbangPageManager(page)
        for content in KookbangPageParser.get_contents_from_urls(page_manager.get_news_genexpr(kwd, max_page)):
            # Parsers return None for pages they could not parse.
            if content is not None:
                content.write_to_csv(csv_writer)

def crawl_chosun(kwd, max_page:int = -1):
    """
    Search Chosun for ``kwd`` and save the articles to ``chosun{kwd}.csv`` (UTF-8).
    :param kwd: keyword to search; also part of the output file name
    :param max_page: forwarded to ``ChosunMillitaryPageManager.get_news_genexpr`` as ``maxidx``
    """
    with open(f'chosun{kwd}.csv', 'w', encoding='utf-8', newline='') as out_file:
        # use delimiter = '\t' for tab separated
        writer = csv.writer(out_file, dialect='excel')
        writer.writerow(['date', 'title', 'content', 'url'])
        manager = ChosunMillitaryPageManager(page)
        links = manager.get_news_genexpr(kwd, max_page)
        articles = ChosunPageParser.get_contents_from_urls(links)
        for article in articles:
            # entries that could not be parsed come back as None
            if article is None:
                continue
            article.write_to_csv(writer)

In [11]:
#crawl_chosun('USV') # searches USV and saves as chosunUSV.csv

In [12]:
#crawl_kookbang('USV') # searches USV and saves as kookbangUSV.csv