# 기사 크롤링

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from newspaper import Article
from konlpy.tag import Okt
import pandas as pd
import time
from tqdm import tqdm
import nltk

# Download NLTK stopwords.
# NOTE(review): nothing in the visible part of this script reads the NLTK
# stopword list afterwards - confirm whether this download is still needed.
nltk.download('stopwords')

# Create a single KoNLPy Okt tagger instance; get_keywords() calls
# okt.nouns() on it for every article.
okt = Okt()

# Selenium options: Chrome preferences that set a fixed download directory
# and disable the per-download save prompt.
options = Options()
prefs = {
    "download.default_directory": "/home/dev_ws/eda",
    "download.prompt_for_download": False
}
options.add_experimental_option('prefs', prefs)

# Path to the ChromeDriver executable. It is a relative path, so it is
# resolved against the working directory the script is started from.
webdriver_service = Service("../eda/driver/chromedriver")

# Start the WebDriver. The resulting Chrome session stays open until
# driver.quit() is called.
driver = webdriver.Chrome(service=webdriver_service, options=options)

# Naver news search URL. The query is a percent-encoded Korean keyword and
# the ds/de/nso parameters restrict results to 2020.01.01 - 2024.06.01.
search_url = """https://search.naver.com/search.naver?where=news&query=%EA%B3%A0%EC%98%81&sm=tab_opt&sort=2&photo=0&field=0&pd=3&ds=2020.01.01&de=2024.06.01&docid=&related=0&mynews=0&office_type=0&office_section_code=0&news_office_checked=&nso=so%3Ar%2Cp%3Afrom20200101to20240601&is_sug_officeid=0&office_category=0&service_area=0"""
# Load the page
driver.get(search_url)
time.sleep(0.1)  # Short pause after navigation before measuring the page height

# Scroll down until all articles are loaded.
# Seconds to wait after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 0.1
# Number of consecutive checks with an unchanged height required before we
# conclude that nothing more will load. A single unchanged check is not
# enough: if the next batch of results takes longer than SCROLL_PAUSE_TIME to
# arrive, stopping immediately would silently truncate the article list.
MAX_STALLED_CHECKS = 5

last_height = driver.execute_script("return document.body.scrollHeight")
stalled_checks = 0

while stalled_checks < MAX_STALLED_CHECKS:
    # Scroll to the bottom to trigger loading of the next batch
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the new page segment to load
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate the new scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")

    if new_height == last_height:
        # No growth yet: count the stalled check and try again
        stalled_checks += 1
    else:
        # The page grew, so more content arrived: reset the stall counter
        stalled_checks = 0
        last_height = new_height

# Use BeautifulSoup to extract article information.
# The rendered HTML is captured first and the browser is closed in a
# `finally`, so the Chrome session is released even if reading the page
# source fails; the parsing below works on the captured HTML only.
try:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close the browser
    driver.quit()

all_articles = []
for item in tqdm(soup.select('.news_area'), desc='Extracting articles'):
    title_tag = item.select_one('.news_tit')
    # select_one() returns None when nothing matches, and the anchor may lack
    # an href. Without a link the article cannot be fetched later, so skip
    # the card instead of letting one malformed entry abort the whole run.
    link = title_tag.get('href') if title_tag else None
    if not link:
        continue
    title = title_tag.get_text(strip=True)

    press_tag = item.select_one('.info_group > .press')
    press = press_tag.get_text(strip=True) if press_tag else ''

    # First <span> directly under .info_group; its text is parsed as the
    # article date further down (unparseable values are dropped there).
    date_tag = item.select_one('.info_group > span')
    date = date_tag.get_text(strip=True) if date_tag else ''

    all_articles.append({
        'title': title,
        'press': press,
        'date': date,
        'link': link
    })

# Download one article and return its body text
def get_article_content(url):
    """Fetch the article at ``url`` and return its extracted text.

    The page is downloaded and parsed as a Korean-language article. Any
    exception raised along the way is reported on stdout and turned into a
    ``None`` return value, so a single bad URL never stops the crawl.

    Args:
        url: Address of the article page.

    Returns:
        The article text, or ``None`` if downloading or parsing failed.
    """
    text = None
    try:
        page = Article(url, language='ko')
        page.download()
        page.parse()
        text = page.text
    except Exception as err:
        print(f"Error fetching article: {err}")
    return text

# Turn article text into a comma-separated noun list
def get_keywords(content):
    """Return the nouns found in ``content`` as one ``', '``-joined string.

    Nouns are extracted with the module-level ``okt`` tagger; nouns of a
    single character are discarded. Order and duplicates are kept as the
    tagger produced them.

    Args:
        content: Article text to analyse.

    Returns:
        A string of nouns separated by ``', '`` (empty if none qualify).
    """
    return ', '.join(noun for noun in okt.nouns(content) if len(noun) > 1)

# Enrich every article record with its body text and derived keywords.
# Articles whose content could not be fetched (None or empty) get
# keywords=None.
for article in tqdm(all_articles, desc='Fetching article contents'):
    content = get_article_content(article['link'])
    article.update(
        content=content,
        keywords=get_keywords(content) if content else None,
    )

# Build one DataFrame row per article record
df = pd.DataFrame(all_articles)

# Convert a scraped date string into a date object (None when unparseable)
def parse_date(date_str):
    """Parse ``date_str`` with pandas and return only its calendar date.

    Args:
        date_str: Date text as scraped from the search result.

    Returns:
        The result of calling ``.date()`` on the parsed timestamp, or
        ``None`` when pandas raises ``ValueError`` for the input.
    """
    try:
        parsed = pd.to_datetime(date_str)
        return parsed.date()
    except ValueError:
        return None

# Replace the raw date strings with parsed dates, then discard every row
# whose date could not be parsed
df = df.assign(date=df['date'].apply(parse_date)).dropna(subset=['date'])

# Write the cleaned table to an Excel workbook (no index column)
excel_file = 'kohyoung1.xlsx'
df.to_excel(excel_file, index=False, engine='openpyxl')

print(f'Saved article information to {excel_file}')


[nltk_data] Downloading package stopwords to /home/jh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Extracting articles: 100%|██████████| 4000/4000 [00:01<00:00, 2832.35it/s]
Fetching article contents:   3%|▎         | 101/4000 [01:11<3:38:54,  3.37s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. on URL https://www.hankyung.com/finance/article/202001133536L


Fetching article contents:   3%|▎         | 105/4000 [01:13<1:10:35,  1.09s/it]

Error fetching article: Article `download()` failed with Response ended prematurely on URL http://yna.kr/AKR20200113121500527?did=1195m


Fetching article contents:   3%|▎         | 114/4000 [01:30<4:46:00,  4.42s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. on URL https://www.hankyung.com/finance/article/202001134031L


Fetching article contents:   3%|▎         | 130/4000 [01:48<3:28:22,  3.23s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. on URL https://www.hankyung.com/finance/article/202001145107L


Fetching article contents:   4%|▎         | 149/4000 [02:03<2:32:50,  2.38s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. (read timeout=7) on URL https://www.hankyung.com/finance/article/202001147305L


Fetching article contents:   4%|▍         | 157/4000 [02:10<1:15:12,  1.17s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/567058 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/567058


Fetching article contents:   5%|▍         | 187/4000 [02:37<4:24:29,  4.16s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. on URL https://www.hankyung.com/finance/article/202001150224L


Fetching article contents:   5%|▍         | 188/4000 [02:44<5:26:17,  5.14s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. on URL https://www.hankyung.com/finance/article/202001150223L


Fetching article contents:   7%|▋         | 265/4000 [03:32<2:26:53,  2.36s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. (read timeout=7) on URL https://www.hankyung.com/finance/article/202001200721L


Fetching article contents:   9%|▉         | 369/4000 [04:30<3:14:56,  3.22s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200131_0000904752&cID=10813&pID=10800


Fetching article contents:  13%|█▎        | 502/4000 [05:47<2:30:45,  2.59s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.hankyung.com', port=443): Read timed out. (read timeout=7) on URL https://www.hankyung.com/finance/article/202002134666L


Fetching article contents:  13%|█▎        | 538/4000 [06:14<43:17,  1.33it/s]  

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/568565 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/568565


Fetching article contents:  20%|█▉        | 786/4000 [08:39<4:44:03,  5.30s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200311_0000951055&cID=10401&pID=10400


Fetching article contents:  26%|██▋       | 1051/4000 [12:36<34:36,  1.42it/s]  

Error fetching article: Article `download()` failed with HTTPConnectionPool(host='www.newsprime.co.kr', port=80): Max retries exceeded with url: /news/article.html?no=498810 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7343ddbb2290>: Failed to establish a new connection: [Errno 111] Connection refused')) on URL http://www.newsprime.co.kr/news/article.html?no=498810


Fetching article contents:  38%|███▊      | 1535/4000 [20:07<4:13:05,  6.16s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200507_0001017037&cID=10401&pID=10400


Fetching article contents:  45%|████▍     | 1790/4000 [24:19<39:51,  1.08s/it]  

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://www.hankyung.com/economy/article/202005286716i on URL https://www.hankyung.com/economy/article/202005286716i


Fetching article contents:  50%|████▉     | 1987/4000 [27:50<3:40:31,  6.57s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200618_0001064053&cID=10401&pID=10400


Fetching article contents:  51%|█████▏    | 2059/4000 [28:58<19:21,  1.67it/s]  

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/29/2020062902480.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/29/2020062902480.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2065/4000 [29:02<18:41,  1.72it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063000590.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063000590.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2066/4000 [29:03<16:26,  1.96it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063000593.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063000593.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2067/4000 [29:03<17:00,  1.89it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001020.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001020.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2068/4000 [29:04<16:18,  1.97it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001090.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001090.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2069/4000 [29:04<16:40,  1.93it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001131.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001131.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2070/4000 [29:05<20:25,  1.58it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001173.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063001173.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2073/4000 [29:06<14:05,  2.28it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063003101.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063003101.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2074/4000 [29:06<13:32,  2.37it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063003533.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/06/30/2020063003533.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2078/4000 [29:08<12:17,  2.61it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070100955.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070100955.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2079/4000 [29:09<12:17,  2.61it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070101361.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070101361.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2083/4000 [29:10<12:24,  2.57it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070101748.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070101748.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2088/4000 [29:14<16:26,  1.94it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070102829.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070102829.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2090/4000 [29:23<1:30:28,  2.84s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200701_0001080273&cID=10401&pID=10400


Fetching article contents:  52%|█████▏    | 2091/4000 [29:24<1:12:56,  2.29s/it]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070103253.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/01/2020070103253.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2098/4000 [29:29<30:07,  1.05it/s]  

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070200573.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070200573.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▏    | 2099/4000 [29:29<24:37,  1.29it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070200576.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070200576.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  52%|█████▎    | 2100/4000 [29:29<21:57,  1.44it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070201440.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/02/2020070201440.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2104/4000 [29:32<16:52,  1.87it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070300657.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070300657.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2105/4000 [29:32<16:20,  1.93it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070300966.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070300966.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2106/4000 [29:33<15:58,  1.98it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070301090.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070301090.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2107/4000 [29:33<16:28,  1.91it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070301458.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070301458.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2108/4000 [29:34<17:04,  1.85it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070302371.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/03/2020070302371.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2111/4000 [29:35<15:45,  2.00it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600403.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600403.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2112/4000 [29:36<19:04,  1.65it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600400.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600400.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2113/4000 [29:37<16:49,  1.87it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600607.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600607.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2115/4000 [29:38<16:33,  1.90it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600880.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070600880.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2116/4000 [29:38<14:53,  2.11it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070601011.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070601011.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2117/4000 [29:39<15:58,  1.96it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070602202.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070602202.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2118/4000 [29:39<14:42,  2.13it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070602245.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/06/2020070602245.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2127/4000 [30:02<3:03:37,  5.88s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20200707_0001085965&cID=13001&pID=13000


Fetching article contents:  53%|█████▎    | 2132/4000 [30:03<40:11,  1.29s/it]  

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070700942.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070700942.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  53%|█████▎    | 2133/4000 [30:04<37:32,  1.21s/it]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070700954.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070700954.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▎    | 2144/4000 [30:11<10:08,  3.05it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070702486.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070702486.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▎    | 2147/4000 [30:14<21:45,  1.42it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070702858.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/07/2020070702858.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2154/4000 [30:19<22:08,  1.39it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800574.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800574.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2155/4000 [30:20<20:06,  1.53it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800828.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800828.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2156/4000 [30:20<19:17,  1.59it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800829.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800829.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2157/4000 [30:21<17:30,  1.75it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800882.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070800882.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2159/4000 [30:21<15:14,  2.01it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070801113.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070801113.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2160/4000 [30:22<14:00,  2.19it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070802038.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070802038.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2161/4000 [30:23<17:15,  1.78it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070802762.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/08/2020070802762.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2166/4000 [30:26<17:37,  1.73it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900545.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900545.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2168/4000 [30:27<14:32,  2.10it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900794.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900794.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2169/4000 [30:27<13:35,  2.24it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900796.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070900796.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2171/4000 [30:28<11:41,  2.61it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070901334.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070901334.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2173/4000 [30:28<12:35,  2.42it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902062.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902062.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2174/4000 [30:29<15:50,  1.92it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902113.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902113.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2175/4000 [30:30<14:26,  2.11it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902125.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902125.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2176/4000 [30:30<14:40,  2.07it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902216.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902216.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2177/4000 [30:30<13:38,  2.23it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902401.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902401.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2178/4000 [30:31<12:56,  2.35it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902506.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902506.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  54%|█████▍    | 2179/4000 [30:31<13:36,  2.23it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902522.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902522.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2180/4000 [30:32<13:02,  2.33it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902521.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902521.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2181/4000 [30:32<13:46,  2.20it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902546.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/09/2020070902546.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2184/4000 [30:34<16:04,  1.88it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000417.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000417.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2185/4000 [30:34<14:59,  2.02it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000420.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000420.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2186/4000 [30:35<18:17,  1.65it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000936.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071000936.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2187/4000 [30:36<16:10,  1.87it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001407.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001407.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2188/4000 [30:36<15:41,  1.92it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001406.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001406.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2189/4000 [30:37<15:13,  1.98it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001412.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071001412.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2191/4000 [30:37<11:59,  2.52it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071002209.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/10/2020071002209.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2194/4000 [30:39<15:44,  1.91it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/13/2020071300383.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/13/2020071300383.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▍    | 2196/4000 [30:40<18:16,  1.64it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/13/2020071302510.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/13/2020071302510.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▌    | 2210/4000 [30:46<14:23,  2.07it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400430.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400430.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▌    | 2212/4000 [30:47<10:50,  2.75it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400427.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400427.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▌    | 2216/4000 [30:48<08:24,  3.53it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400786.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071400786.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  55%|█████▌    | 2219/4000 [30:49<08:43,  3.40it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401370.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401370.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2220/4000 [30:49<13:46,  2.15it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401690.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401690.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2221/4000 [30:50<14:34,  2.03it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401903.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071401903.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2222/4000 [30:51<14:33,  2.04it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071402553.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071402553.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2224/4000 [30:51<11:03,  2.68it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071402995.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/14/2020071402995.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2227/4000 [30:53<17:00,  1.74it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500686.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500686.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2228/4000 [30:54<16:02,  1.84it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500904.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500904.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2229/4000 [30:54<16:07,  1.83it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500994.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071500994.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2230/4000 [30:55<14:55,  1.98it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071501036.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071501036.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2231/4000 [30:55<14:30,  2.03it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071501234.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071501234.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  56%|█████▌    | 2233/4000 [30:56<12:52,  2.29it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071502326.html?utm_source=naver&utm_medium=original&utm_campaign=biz on URL https://biz.chosun.com/site/data/html_dir/2020/07/15/2020071502326.html?utm_source=naver&utm_medium=original&utm_campaign=biz


Fetching article contents:  62%|██████▏   | 2469/4000 [34:01<2:33:29,  6.02s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20201113_0001233409&cID=10404&pID=15000


Fetching article contents:  64%|██████▎   | 2549/4000 [34:42<10:46,  2.24it/s]  

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/584210 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/584210


Fetching article contents:  64%|██████▍   | 2551/4000 [34:56<1:23:05,  3.44s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20201207_0001260133&cID=10401&pID=10400


Fetching article contents:  65%|██████▍   | 2590/4000 [35:11<05:46,  4.07it/s]  

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://www.medicaltimes.com/Users/Inform/index.html?mode=View&ID=9385 on URL http://www.medicaltimes.com/Users/Inform/index.html?mode=View&ID=9385


Fetching article contents:  66%|██████▌   | 2631/4000 [35:27<08:14,  2.77it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsen.com', port=443): Max retries exceeded with url: /news_view.php?uid=202101180500027410 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL https://www.newsen.com/news_view.php?uid=202101180500027410


Fetching article contents:  67%|██████▋   | 2676/4000 [35:43<08:34,  2.57it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsen.com', port=443): Max retries exceeded with url: /news_view.php?uid=202102041436037410 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL https://www.newsen.com/news_view.php?uid=202102041436037410


Fetching article contents:  68%|██████▊   | 2733/4000 [36:14<52:11,  2.47s/it]

Error fetching article: Article `download()` failed with HTTPConnectionPool(host='www.nongmin.com', port=80): Read timed out. (read timeout=7) on URL http://www.nongmin.com/article/ar_detail.htm?ar_id=604427


Fetching article contents:  70%|██████▉   | 2786/4000 [36:53<1:20:22,  3.97s/it]

Error fetching article: Article `download()` failed with HTTPConnectionPool(host='www.nongmin.com', port=80): Read timed out. (read timeout=7) on URL http://www.nongmin.com/article/ar_detail.htm?ar_id=605029


Fetching article contents:  70%|███████   | 2817/4000 [37:18<1:06:38,  3.38s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20210413_0001404340&cID=10401&pID=10400


Fetching article contents:  74%|███████▎  | 2941/4000 [38:59<05:58,  2.95it/s]  

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/592908 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/592908


Fetching article contents:  75%|███████▍  | 2987/4000 [39:31<42:10,  2.50s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20210506_0001432174&cID=15001&pID=15000


Fetching article contents:  75%|███████▍  | 2992/4000 [39:51<1:46:47,  6.36s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20210506_0001432449&cID=15001&pID=15000


Fetching article contents:  77%|███████▋  | 3080/4000 [40:41<51:25,  3.35s/it]  

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20210531_0001458483&cID=10401&pID=10400


Fetching article contents:  77%|███████▋  | 3086/4000 [40:56<52:05,  3.42s/it]  

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISI20210601_0017513204


Fetching article contents:  79%|███████▉  | 3160/4000 [42:04<1:21:17,  5.81s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISI20210624_0017595804


Fetching article contents:  79%|███████▉  | 3165/4000 [42:12<28:18,  2.03s/it]  

Error fetching article: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.mydaily.co.kr/new_yk/html/read.php?newsid=202106241548467605&ext=na&utm_campaign=naver_news&utm_source=naver&utm_medium=related_news on URL http://www.mydaily.co.kr/new_yk/html/read.php?newsid=202106241548467605&ext=na&utm_campaign=naver_news&utm_source=naver&utm_medium=related_news


Fetching article contents:  80%|███████▉  | 3187/4000 [42:29<44:44,  3.30s/it]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.newsis.com', port=443): Read timed out. on URL http://www.newsis.com/view/?id=NISX20210628_0001492462&cID=10401&pID=10400


Fetching article contents:  83%|████████▎ | 3315/4000 [43:29<04:23,  2.60it/s]

Error fetching article: Article `download()` failed with 404 Client Error: Not Found for url: https://isplus.com/news/article/article.asp?total_id=24113413 on URL http://isplus.live.joins.com/news/article/article.asp?total_id=24113413


Fetching article contents:  84%|████████▍ | 3368/4000 [43:55<03:42,  2.84it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.vop.co.kr', port=443): Max retries exceeded with url: /A00001593708.html (Caused by SSLError(SSLError(1, '[SSL: DH_KEY_TOO_SMALL] dh key too small (_ssl.c:1007)'))) on URL https://www.vop.co.kr/A00001593708.html


Fetching article contents:  85%|████████▍ | 3395/4000 [44:28<27:27,  2.72s/it]

Error fetching article: Article `download()` failed with HTTPConnectionPool(host='www.kyosu.net', port=80): Read timed out. (read timeout=7) on URL http://www.kyosu.net/news/articleView.html?idxno=76156


Fetching article contents:  90%|█████████ | 3604/4000 [46:00<02:09,  3.06it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/604239 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/604239


Fetching article contents:  91%|█████████ | 3622/4000 [46:10<02:42,  2.32it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/604692 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/604692


Fetching article contents:  91%|█████████ | 3636/4000 [46:15<02:31,  2.41it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/605143 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/605143


Fetching article contents:  91%|█████████ | 3647/4000 [46:23<03:29,  1.69it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/605534 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/605534


Fetching article contents:  92%|█████████▏| 3668/4000 [46:39<02:06,  2.61it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/604240 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/604240
Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/604697 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/604697
Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/604695 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certifica

Fetching article contents:  93%|█████████▎| 3706/4000 [47:32<02:28,  1.98it/s]

Error fetching article: Article `download()` failed with HTTPSConnectionPool(host='www.sentv.co.kr', port=443): Max retries exceeded with url: /news/view/607294 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1007)'))) on URL http://www.sentv.co.kr/news/view/607294


Fetching article contents: 100%|██████████| 4000/4000 [50:08<00:00,  1.33it/s]


Saved article information to kohyoung1.xlsx


# date 타입 바꾸고 NaT 값 제거

In [None]:
import pandas as pd

# Load the crawled article spreadsheet.
# NOTE(review): the crawl step's captured output reports saving "kohyoung1.xlsx",
# while this cell reads "kohyoung.xlsx" -- confirm this is the intended input file.
df = pd.read_excel("kohyoung.xlsx")

# Parse the 'date' column; values that cannot be parsed become NaT
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Show the rows whose date could not be parsed (optional, for inspection)
invalid_dates = df[df['date'].isna()]
if not invalid_dates.empty:
    print("유효하지 않은 날짜를 가진 행:")
    print(invalid_dates)

# Drop the NaT rows. The explicit .copy() makes df_cleaned an independent
# frame, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['date']).copy()

# Keep only the calendar date (drop the time-of-day component)
df_cleaned['date'] = df_cleaned['date'].dt.date

# Sort chronologically (optional)
df_cleaned = df_cleaned.sort_values('date')

# Print the result
print(df_cleaned)

# Save to a new Excel file
df_cleaned.to_excel('kohyoung2.xlsx', index=False)

print("처리된 데이터를 'kohyoung2.xlsx' 파일로 저장했습니다.")
print(f"총 {len(df_cleaned)} 개의 유효한 행이 저장되었습니다.")
# iloc[0] / iloc[-1] raise IndexError on an empty frame, so only report the
# date range when at least one valid row remains.
if not df_cleaned.empty:
    print(f"첫 번째 날짜: {df_cleaned['date'].iloc[0]}, 마지막 날짜: {df_cleaned['date'].iloc[-1]}")

유효하지 않은 날짜를 가진 행:
                                       title       press date  \
19                  [고영의 문헌 속 ‘밥상’]고사리 꺾은 뜻은        경향신문  NaT   
134             [이주의 관.종]고영, HBM 검사장비로 영토 확장       아시아경제  NaT   
169   도쿄일렉트론·고영테크…'산단 지정 1년] 용인에 400개 기업 몰렸다        서울경제  NaT   
281                 고영"명품만 생존…K의료장비도 최고 될 것"  한국경제언론사 선정  NaT   
524                  반도체벨트 효과…소부장 기업들 용인에 둥지       아시아경제  NaT   
...                                      ...         ...  ...   
3728         조중생 前 교수·고광일 대표 ‘자랑스러운 서울인상’ 선정        문화일보  NaT   
3731                          [부음] 이동칠씨 별세 외        매일경제  NaT   
3732                           [부음] 송한경 별세 외        조선일보  NaT   
3736         조중생 前 교수·고광일 대표 ‘자랑스러운 서울인상’ 선정        문화일보  NaT   
3741         조중생 前 교수·고광일 대표 ‘자랑스러운 서울인상’ 선정        문화일보  NaT   

                                                   link  \
19    https://www.khan.co.kr/opinion/column/article/...   
134   https://view.asiae.co.kr/article/2023121914161...   
169         https://www.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['date'] = df_cleaned['date'].dt.date


처리된 데이터를 'kohyoung2.xlsx' 파일로 저장했습니다.
총 3846 개의 유효한 행이 저장되었습니다.
첫 번째 날짜: 2022-03-06, 마지막 날짜: 2024-06-01


# unwanted data 처리

In [None]:
import pandas as pd
import re

# Source workbook for the unwanted-content filter; parsed with the openpyxl engine
excel_file = 'kohyoung3.xlsx'
df = pd.read_excel(io=excel_file, engine='openpyxl')

# Markers of boilerplate or off-topic article bodies. Compiled once here so the
# per-row check below does not rebuild the pattern list on every call.
_UNWANTED_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'저작권', r'재배포', r'출처', r'사진', r'카카오스토리', r'시인', r'독서',
        r'희망이', r'소개', r'네이버', r'로그인', r'고양이', r'식단',  # Korean keywords
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',  # Email pattern
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',  # URL pattern
    )
)


def has_unwanted_content(text):
    """Return True if *text* contains any unwanted keyword, e-mail address or URL.

    Args:
        text: Cell value to inspect. It is converted with ``str()`` first, so
            non-string values (e.g. NaN) are checked by their string form.

    Returns:
        bool: True when at least one unwanted pattern is found, else False.
    """
    haystack = str(text)  # convert once instead of once per pattern
    return any(pattern.search(haystack) for pattern in _UNWANTED_PATTERNS)

# Keep only rows whose 'content' is free of unwanted markers, then drop rows
# with any missing value and renumber the index from zero.
keep_mask = ~df['content'].apply(has_unwanted_content)
filtered_meere_df = df[keep_mask].dropna().reset_index(drop=True)

# Write the surviving rows to a new Excel workbook (xlsx, despite the variable name)
filtered_csv_file = 'kohyoung4.xlsx'
filtered_meere_df.to_excel(filtered_csv_file, index=False)

print(f'Filtered articles saved to {filtered_csv_file}.')

Filtered articles saved to kohyoung4.xlsx.


In [None]:
import ast
# Article bodies that are only share-widget / comment-plugin placeholder text
PLACEHOLDER_CONTENTS = [
    '카카오스토리(으)로 기사보내기 URL복사(으)로 기사보내기 이메일(으)로 기사보내기 다른 공유 찾기 기사스크랩하기',
    "가장 많이 읽힌 뉴스를 제공합니다. 집계 기준에 따라 최대 3일 전 기사까지 제공될 수 있습니다.",
    "카카오톡(으)로 기사보내기 네이버밴드(으)로 기사보내기 네이버블로그(으)로 기사보내기 네이버라인(으)로 기사보내기 핀터레스트(으)로 기사보내기 URL복사(으)로 기사보내기",
    "라이브리 댓글 작성을 위해 JavaScript를 활성화해주세요",
    "이 기사를 공유합니다",
]
# Press outlets whose articles are excluded
EXCLUDED_PRESS = ["대구신문", '뉴시스']
# Rows whose whole 'keyword' cell equals one of these values are excluded
EXCLUDED_KEYWORDS = ["네이버", "카카오스토리", '시인', '영화', '보험']

df = pd.read_excel('kohyoung3.xlsx')
df.dropna(subset=['title', 'date', 'content', 'keyword'], inplace=True)

# Series.isin performs the same exact-equality membership test as the chained
# `x == ... or x == ...` lambdas, but vectorized and always yielding a boolean
# mask (missing values compare as "not a member").
df = df[~df['content'].isin(PLACEHOLDER_CONTENTS)]
df = df[~df['press'].isin(EXCLUDED_PRESS)]
df = df[~df['keyword'].isin(EXCLUDED_KEYWORDS)]

df.to_excel('kohyoung4.xlsx', index=False, engine='openpyxl')


# pipeline 연결하고 데이터 정리

In [None]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# Continue with your other imports and script logic

import pandas as pd
from transformers import pipeline
import torch
# Load the filtered articles
df = pd.read_excel("kohyoung4.xlsx")

# Pick the device index for the pipeline: 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Load the sentiment-analysis pipeline
classifier = pipeline(
    task='sentiment-analysis',
    model='snunlp/KR-FinBert-SC',
    device=device,
)

# Title and body joined by a single space; this is the text classified per article
df['combined_text'] = df['title'] + ' ' + df['content']
def analyze_sentiment(text):
    """Classify *text* with the module-level ``classifier`` pipeline.

    Args:
        text: Text to classify. NaN/None and blank strings are not sent to
            the model.

    Returns:
        str: The ``label`` of the first pipeline result, or ``'Unknown'`` for
        missing or blank input.
    """
    if pd.isna(text):
        return 'Unknown'  # missing value
    text = str(text)  # guard against non-string cells (e.g. numeric values)
    if text.strip() == '':
        return 'Unknown'  # blank string
    # The 512-character slice keeps the input short, but characters are not
    # tokens: the tokenized sequence (plus special tokens) can still exceed the
    # model limit. Let the tokenizer truncate as well so long inputs cannot
    # overflow the model's maximum sequence length.
    result = classifier(text[:512], truncation=True, max_length=512)
    return result[0]['label']

# Run sentiment analysis and store the labels in new columns
df['senti_ctext'] = df['combined_text'].apply(analyze_sentiment)
df['senti_keyword'] = df['keyword'].apply(analyze_sentiment)

# match_status: 0 when the two sentiment labels agree, 1 when they differ.
# Vectorized comparison instead of a row-wise df.apply(..., axis=1).
df['match_status'] = (df['senti_ctext'] != df['senti_keyword']).astype(int)
df['date'] = pd.to_datetime(df['date']).dt.date

# Write only the selected columns, in this order, to the output workbook
df[['date', 'title', 'content', 'link', 'keyword', 'senti_ctext', 'senti_keyword', 'match_status']].to_excel("kohyoung5.xlsx", index=False, engine='xlsxwriter')
print("Excel file created Successfully")
print(df.head(1))

2024-07-05 17:28:49.609693: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 17:28:49.715677: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 17:28:49.716307: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-05 17:28:49.891701: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
You seem to be using the pipelines sequentially on G

Excel file created Successfully
                               title   press        date  \
0  넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프'  핀포인트뉴스  2022-03-06   

                                                link  \
0  http://www.pinpointnews.co.kr/news/articleView...   

                                             content  \
0  코스닥 종목 중 넥슨지티 종목이 한주간 시총 순위가 가장 크게 뛰어 올랐다.\n\n...   

                                      keyword  \
0  종목, 계단, 코스닥, 순위, 위로, 바이오, 넥슨, 주간, 한국, 시가총액   

                                       combined_text senti_ctext  \
0  넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프' 코스닥 종목 중 넥슨지...    positive   

  senti_keyword  match_status  
0       neutral             1  


# 데이터베이스에 저장

In [None]:
import mysql.connector

# NOTE(review): connection credentials are written inline in the notebook;
# consider loading them from environment variables or a config file instead.
conn = mysql.connector.connect(
    host = "database-1.cvqcay0g4c1o.ap-northeast-2.rds.amazonaws.com",
    port = 3306,
    user = "jook",
    password = "************",
    database = "antking"
)
try:
    cursor = conn.cursor(buffered=True)
    try:
        df = pd.read_excel('kohyoung5.xlsx')

        # NOTE(review): insert_data_to_article_and_sentiment is defined in a
        # later cell of this notebook; that cell must be run before this one.
        insert_data_to_article_and_sentiment(conn, cursor, 'NAVER', '고영', df)
    finally:
        # Release the cursor even if reading the workbook or inserting fails
        cursor.close()
finally:
    # Always close the connection so a failure does not leak it
    conn.close()

Inserted row 1 into article: 넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프' (ID: 21298)
Inserted sentiment data for article ID 21298
Inserted keyword data for article ID 21298
Inserted row 2 into article: 넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프' (ID: 21299)
Inserted sentiment data for article ID 21299
Inserted keyword data for article ID 21299
Inserted row 3 into article: 넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프' (ID: 21300)
Inserted sentiment data for article ID 21300
Inserted keyword data for article ID 21300
Inserted row 4 into article: 넥슨지티, 코스닥시장 시가총액 순위 한주간 17계단 '점프' (ID: 21301)
Inserted sentiment data for article ID 21301
Inserted keyword data for article ID 21301
Inserted row 5 into article: [코스닥 마감]기관·외국인 쌍끌이 매도…880선 턱걸이 (ID: 21302)
Inserted sentiment data for article ID 21302
Inserted keyword data for article ID 21302
Inserted row 6 into article: 한국비엔씨 6.19% 급등 ...코스닥150 지수 구성종목 중 주가상승률 '선두' (ID: 21303)
Inserted sentiment data for article ID 21303
Inserted keyword data for article ID 21303
Inserted row 

In [None]:
def insert_data_to_article_and_sentiment(conn, cursor, search_platform, search_keyword, df):
    """Insert each row of *df* into the article, sentiment and keyword tables.

    For every row, one record is inserted into ``article``; the id generated
    for it is then used as ``article_id`` for one ``sentiment`` record and one
    ``keyword`` record. The transaction is committed after each row's three
    inserts.

    Args:
        conn: Open database connection; used to commit.
        cursor: Cursor on *conn* used to execute the INSERT statements.
        search_platform: Value stored in ``article.search_platform``.
        search_keyword: Value stored in ``article.search_keyword``.
        df: DataFrame providing the columns read below: ``date``, ``title``,
            ``content``, ``link``, ``senti_ctext``, ``senti_keyword``,
            ``match_status`` and ``keyword``.

    In the visible code, a ``mysql.connector.Error`` stops the loop and is
    reported with ``print`` rather than re-raised; rows committed before the
    error stay committed.
    """
    try:
        # Insert into the article table
        for i, row in df.iterrows():
            sql_article = "INSERT INTO article (search_platform, search_keyword, pub_date, title, content, link) VALUES (%s, %s, %s, %s, %s, %s)"
            values_article = (search_platform, search_keyword, row['date'], row['title'], row['content'], row['link'])
            cursor.execute(sql_article, values_article)
            article_id = cursor.lastrowid  # id of the article row that was just inserted
            # i is the DataFrame index label; i+1 is a 1-based row number only
            # when the index is the default 0..n-1 range -- TODO confirm
            print(f"Inserted row {i+1} into article: {row['title']} (ID: {article_id})")
            # Insert into the sentiment table
            sql_sentiment = "INSERT INTO sentiment (article_id, sentiment_from_ctext, sentiment_from_keyword, match_status) VALUES (%s, %s, %s, %s)"
            values_sentiment = (article_id, row['senti_ctext'], row['senti_keyword'], row['match_status'])
            cursor.execute(sql_sentiment, values_sentiment)
            print(f"Inserted sentiment data for article ID {article_id}")
            # Insert into the keyword table
            sql_keyword = "INSERT INTO keyword (article_id, keyword_set) VALUES (%s, %s)"
            values_keyword = (article_id, row['keyword'])
            cursor.execute(sql_keyword, values_keyword)
            print(f"Inserted keyword data for article ID {article_id}")
            # Commit once per row, after all three inserts for that row
            conn.commit()
    except mysql.connector.Error as error:
        print(f"Error inserting data: {error}")