# Configuration

In [9]:
import os
import time
import json
import numpy as np
from collections import defaultdict
from urllib import request, parse
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

# Use the current working directory as the base path; the configuration below
# joins the data directory onto `__file__`.
__file__ = os.getcwd()

In [None]:
# Root of all crawled data, relative to the base path
FDIR_DATA = os.path.join(__file__, 'data')

# One directory for the per-date URL lists, one for the downloaded articles
FDIR_URL_LIST = os.path.join(FDIR_DATA, 'news/hyundai_motors/url_list/')
FDIR_ARTICLE = os.path.join(FDIR_DATA, 'news/hyundai_motors/article/')

# make_dir_soft() creates the dirname of its argument; thanks to the trailing
# '/' that dirname is the directory itself
make_dir_soft(FDIR_URL_LIST)
make_dir_soft(FDIR_ARTICLE)

# Web Crawling

In [13]:
def make_dir_soft(fpath):
    """Create the parent directory of ``fpath`` if it does not exist yet.

    Args:
        fpath: Path to a file. Only its directory part (``os.path.dirname``)
            is created, so a path ending in a separator creates that
            directory itself.

    Returns:
        None.

    Raises:
        OSError: If the directory cannot be created, e.g. FileExistsError
            when the path exists but is not a directory.
    """
    dirpath = os.path.dirname(fpath)

    # A bare file name has no directory part: os.makedirs('') would raise and
    # there is nothing to create anyway.
    if not dirpath:
        return None

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(dirpath, exist_ok=True)

    return None

def remove_date_sep(date):
    """Return ``date`` with every '.' separator removed.

    Args:
        date: Date string such as '2023.07.01'.

    Returns:
        The same string without dots, e.g. '20230701'.
    """
    # The original body read the undefined name `data` instead of `date`.
    return date.replace('.', '')

def calculate_date_list(date_start, duration):
    """Build the list of consecutive dates beginning at ``date_start``.

    Args:
        date_start: First date, formatted as '%Y.%m.%d'.
        duration: Number of days to generate, counting the first date.

    Returns:
        A list of ``duration`` date strings in '%Y.%m.%d' format.
    """
    date_format = '%Y.%m.%d'
    first_day = datetime.strptime(date_start, date_format)

    return [
        (first_day + timedelta(days=offset)).strftime(date_format)
        for offset in range(duration)
    ]

def request_for_soup(url, headers):
    """Fetch ``url`` and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address to request.
        headers: Mapping of HTTP request headers handed to ``request.Request``.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        'lxml' parser.

    Raises:
        Whatever ``request.urlopen`` raises for a failed request (e.g.
        ``urllib.error.URLError``); nothing is caught here.
    """
    req = request.Request(url=url, headers=headers)

    # Close the HTTP response deterministically instead of leaving the
    # connection open until garbage collection.
    with request.urlopen(req) as response:
        html = response.read()

    return BeautifulSoup(html, 'lxml')

def is_list_page_empty(soup):
    """Tell whether a parsed page contains the 'not_found02' block.

    Args:
        soup: Parsed page exposing a ``find`` method.

    Returns:
        True if ``soup.find('div', class_='not_found02')`` yields a truthy
        result, False otherwise.
    """
    not_found_block = soup.find('div', class_='not_found02')
    return bool(not_found_block)
    
def get_sleep_time_random():
    """Draw a positive random delay centred on 1.

    A sample is taken from ``np.random.normal(1, 0.1)``; a negative sample is
    mirrored to its positive counterpart.

    Returns:
        The absolute value of the sample, or 1 when the sample is neither
        greater nor smaller than 0.
    """
    sample = np.random.normal(1, 0.1)

    # Neither strictly positive nor strictly negative: fall back to 1
    if not (sample > 0 or sample < 0):
        return 1

    return abs(sample)

def parse_article(soup):
    """Extract title, date and body text from a parsed article page.

    Args:
        soup: Parsed article page exposing a ``find`` method.

    Returns:
        A ``(title, date, content)`` tuple: the text of the first 'h2', the
        first of the three whitespace-separated tokens of the
        'media_end_head_info_datestamp_time' span, and the text of the div
        with id 'dic_area'.

    Raises:
        AttributeError: If one of the elements is missing (``find`` returned
            None).
        ValueError: If the datestamp text does not split into exactly three
            tokens.
    """
    headline = soup.find('h2')
    title = headline.get_text()

    stamp = soup.find('span', class_='media_end_head_info_datestamp_time')
    stamp_tokens = stamp.get_text().split()
    # Exactly three tokens are required; only the first one is kept
    date, _, _ = stamp_tokens

    body = soup.find('div', id='dic_area')
    content = body.get_text()

    return title, date, content

def import_url_dict(fdir):
    """Load every per-date URL list stored as ``<date>.json`` in ``fdir``.

    Args:
        fdir: Directory holding one JSON file per date.

    Returns:
        A dict mapping each file name without its '.json' extension to the
        decoded JSON content, inserted in ascending file-name order.
    """
    url_dict = {}

    # os.listdir() returns names in arbitrary order; sort so the dates are
    # always visited in the same (ascending) order.
    for fname in sorted(os.listdir(fdir)):
        date, ext = os.path.splitext(fname)
        fpath = os.path.join(fdir, fname)

        # Skip anything that is not a JSON file (sub-directories, editor or
        # OS metadata) instead of failing on it.
        if ext != '.json' or not os.path.isfile(fpath):
            continue

        with open(fpath, 'r', encoding='utf-8') as f:
            url_dict[date] = json.load(f)

    return url_dict

In [4]:
# Search-result URL template; placeholders in order: query, start date,
# end date, result offset
url_base = 'https://search.naver.com/search.naver?where=news&sm=tab_pge&query={}&sort=1&photo=0&field=0&pd=3&ds={}&de={}&start={}'
# Request headers sent with every page fetch
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
}

# Search keyword, percent-encoded for use inside the URL
QUERY = '현대자동차'
QUERY_PARSED = parse.quote(QUERY)

# Crawl window: DURATION consecutive days beginning at DATE_START
DATE_START = '2023.07.01'
DURATION = 30
DATE_LIST = calculate_date_list(DATE_START, DURATION)

print(f'Query parsed: {QUERY_PARSED}')
print('Date list: ' + '\n'.join(DATE_LIST))

Query parsed: %ED%98%84%EB%8C%80%EC%9E%90%EB%8F%99%EC%B0%A8
Date list: 2023.07.01
2023.07.02
2023.07.03
2023.07.04
2023.07.05
2023.07.06
2023.07.07
2023.07.08
2023.07.09
2023.07.10
2023.07.11
2023.07.12
2023.07.13
2023.07.14
2023.07.15
2023.07.16
2023.07.17
2023.07.18
2023.07.19
2023.07.20
2023.07.21
2023.07.22
2023.07.23
2023.07.24
2023.07.25
2023.07.26
2023.07.27
2023.07.28
2023.07.29
2023.07.30


In [109]:
URL_COUNT = 0

for date in DATE_LIST:
    print('--'*30)
    print(f'Date: {date}')

    # A new day starts with no URLs and the first result offset
    url_list = []
    URL_START_IDX = 1

    # Page through the search results until the "not found" page shows up
    while True:
        url_list_page = url_base.format(QUERY_PARSED, date, date, URL_START_IDX)
        soup = request_for_soup(url_list_page, headers)
        if is_list_page_empty(soup):
            break

        # Keep the href of every 'info' anchor that contains '네이버뉴스'
        url_list += [
            anchor.get('href')
            for anchor in soup.find_all('a', class_='info')
            if '네이버뉴스' in anchor
        ]

        # Pause for a random interval, then advance the offset by 10
        SLEEP_TIME = get_sleep_time_random()
        time.sleep(SLEEP_TIME)
        URL_START_IDX += 10

    # Write the day's URLs to <date without dots>.json
    date_for_dirname = remove_date_sep(date)
    FNAME_URL_LIST = f'{date_for_dirname}.json'
    FPATH_URL_LIST = os.path.join(FDIR_URL_LIST, FNAME_URL_LIST)
    with open(FPATH_URL_LIST, 'w', encoding='utf-8') as f:
        json.dump(url_list, f)

    # Running totals
    URL_COUNT += len(url_list)
    print(f'URL list (current day): {len(url_list)}')
    print(f'URL list (total)      : {URL_COUNT}')

print('=='*30)
print(f'Done: {URL_COUNT}')

------------------------------------------------------------
Date: 2023.07.01
URL list (current day): 21
URL list (total)      : 21
------------------------------------------------------------
Date: 2023.07.02
URL list (current day): 62
URL list (total)      : 83
------------------------------------------------------------
Date: 2023.07.03
URL list (current day): 193
URL list (total)      : 276
------------------------------------------------------------
Date: 2023.07.04
URL list (current day): 153
URL list (total)      : 429
------------------------------------------------------------
Date: 2023.07.05
URL list (current day): 223
URL list (total)      : 652
------------------------------------------------------------
Date: 2023.07.06
URL list (current day): 126
URL list (total)      : 778
------------------------------------------------------------
Date: 2023.07.07
URL list (current day): 47
URL list (total)      : 825
------------------------------------------------------------
Date: 

In [12]:
url_dict = import_url_dict(FDIR_URL_LIST)

# Total number of article URLs over all dates (the original counted the
# number of dates, which made the 'Total' line of the summary misleading)
url_count = sum(len(urls) for urls in url_dict.values())
article_count_total = 0
errors = []

for url_date, url_list in url_dict.items():
    # The article id is the URL's position in the date's list, so the same URL
    # always maps to the same file. Previously the id only advanced after a
    # successful save, so on a re-run the first existing file matched every
    # remaining URL of that date and none of them was ever downloaded.
    # NOTE: files written by earlier runs that numbered only the successful
    # downloads may not line up with this positional numbering.
    for article_idx, url_article in enumerate(url_list):
        article_id = '{:04d}'.format(article_idx)
        FNAME_ARTICLE = '{}/{}.json'.format(remove_date_sep(url_date), article_id)
        FPATH_ARTICLE = os.path.join(FDIR_ARTICLE, FNAME_ARTICLE)

        # Already downloaded on a previous run: count it and move on
        if os.path.isfile(FPATH_ARTICLE):
            article_count_total += 1
            continue

        # Fetch and parse the article page
        soup = request_for_soup(url_article, headers)

        try:
            title, date, content = parse_article(soup)
        except Exception:
            # Pages that do not match the expected layout are recorded and
            # skipped. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt can still stop the crawl.
            errors.append(url_article)
            continue

        article = {
            'title': title,
            'date': date,
            'content': content
        }

        # Save article
        make_dir_soft(FPATH_ARTICLE)
        with open(FPATH_ARTICLE, 'w', encoding='utf-8') as f:
            json.dump(article, f)
        article_count_total += 1

        # Pause for a random interval before the next request
        SLEEP_TIME = get_sleep_time_random()
        time.sleep(SLEEP_TIME)

    # Print status
    print('Date: {} / Total Success: {:,} / Errors: {:,}'.format(url_date, article_count_total, len(errors)))

# Print result
print('=='*30)
print('Done')
print('Total   : {:,}'.format(url_count))
print('Articles: {:,}'.format(article_count_total))
print('Errors  : {:,}'.format(len(errors)))
print('--'*30)
print(errors)

Date: 20230701 / Total Success: 21 / Errors: 0
Date: 20230702 / Total Success: 83 / Errors: 0
Date: 20230703 / Total Success: 276 / Errors: 0
Date: 20230704 / Total Success: 429 / Errors: 0
Date: 20230705 / Total Success: 652 / Errors: 0
Date: 20230706 / Total Success: 774 / Errors: 4
Date: 20230707 / Total Success: 818 / Errors: 7
Date: 20230708 / Total Success: 835 / Errors: 13
Date: 20230709 / Total Success: 905 / Errors: 14
Date: 20230710 / Total Success: 1,051 / Errors: 17
Date: 20230711 / Total Success: 1,164 / Errors: 22
Date: 20230712 / Total Success: 1,292 / Errors: 23
Date: 20230713 / Total Success: 1,508 / Errors: 25
Date: 20230714 / Total Success: 1,604 / Errors: 25
Date: 20230715 / Total Success: 1,612 / Errors: 25
Date: 20230716 / Total Success: 1,662 / Errors: 25
Date: 20230717 / Total Success: 1,716 / Errors: 25
Date: 20230718 / Total Success: 1,852 / Errors: 25
Date: 20230719 / Total Success: 1,949 / Errors: 27
Date: 20230720 / Total Success: 2,161 / Errors: 27
Date: 2

# Data Partition

In [15]:
from sklearn.model_selection import train_test_split

In [1]:
def iter_fpath_article(fdir_article):
    """Yield the path of every article file below ``fdir_article``.

    The directory is expected to contain one sub-directory per date, each
    holding the article files of that date.

    Args:
        fdir_article: Root article directory.

    Yields:
        File paths of the form ``<fdir_article>/<date>/<fname>``, ordered by
        date directory name and then by file name.
    """
    # os.listdir() returns names in arbitrary order; sorting keeps the article
    # order identical across runs and machines, which anything index- or
    # seed-based downstream relies on.
    for date in sorted(os.listdir(fdir_article)):
        fdir_article_date = os.path.join(fdir_article, date)

        # Stray files next to the date directories cannot be listed; skip them
        if not os.path.isdir(fdir_article_date):
            continue

        for fname in sorted(os.listdir(fdir_article_date)):
            yield os.path.join(fdir_article_date, fname)
            
def load_articles(fdir_article):
    """Load every article JSON file found by ``iter_fpath_article``.

    Each loaded article gets an extra 'id' key of the form
    ``<parent directory>/<file name without extension>``.

    Args:
        fdir_article: Root article directory, passed on to
            ``iter_fpath_article``.

    Returns:
        A list of the loaded articles in iteration order.
    """
    articles = []
    for fpath in iter_fpath_article(fdir_article):
        # Normalise Windows separators, then keep only '<dir>/<file>'
        path_parts = fpath.replace('\\', '/').split('/')
        relative_name = '/'.join(path_parts[-2:])
        # Exactly one '.' is expected here; anything else raises ValueError
        article_id, _ = relative_name.split('.')

        with open(fpath, 'r', encoding='utf-8') as f:
            article = json.load(f)

        article['id'] = article_id
        articles.append(article)

    return articles

train_test_split

- test_size: 전체 데이터에서 test 데이터의 비율
- random_state: 데이터를 train과 test로 나눌 때 사용하는 난수 시드(seed) --> 입력 데이터와 그 순서가 같고 random_state가 동일하면 항상 동일하게 분할됨

# Data Labeling

데이터 라벨링을 해봅시다.
- 1: 자동차 판매 관련 기사
- 0: 그 외

In [75]:
import pandas as pd

자동차 판매 관련 기사의 label을 1로 변경한 뒤 "\~\~\_after_<이름>.xlsx"의 파일명으로 저장
- 김가영: 0~99
- 김서연: 100~199
- 김현태: 200~299
- 민경제: 300~399
- 박경훈: 400~499
- 박규리: 500~599
- 박유민: 600~699
- 변성준: 700~799
- 서유탁: 800~899
- 서준혁: 900~999
- 성무선: 1,000~1,099
- 성해준: 1,100~1,199
- 신서빈: 1,200~1,299
- 이석희: 1,300~1,399
- 이주연: 1,400~1,499
- 한승우: 1,500~1,599
- 홍정화: 1,600~1,699

---
파일명 예시: "\~/from20230701to20230730_after_홍길동.xlsx"