In [77]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
from tqdm import tqdm

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}

In [78]:
def makeUrl(search, start_date, end_date, start_pg=1, end_pg=2):
    date_part = "&pd=3&ds=" + start_date + "&de=" + end_date
    return ["https://search.naver.com/search.naver?where=news&sm=tab_pge&query=" + search + date_part + "&sort=2&start=" + str((i - 1) * 10 + 1) for i in range(start_pg, end_pg + 1)]

In [79]:
def get_naver_articles(urls):
    articles, titles, dates, press = [], [], [], []
    
    for url in urls:
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, "html.parser")

        article_links = [link.attrs['href'] for link in soup.select("div.group_news > ul.list_news > li div.news_area > div.news_info > div.info_group > a.info")]
        article_titles = [title.text for title in soup.select(".news_tit")]
        article_infos = [info.text for info in soup.select(".info")]

        new_urls = []
        idx = 0

        while idx < len(article_links) - 1:
            if 'news.naver.com' in article_links[idx]:
                new_urls.append(article_links[idx])
                idx += 1
            elif 'news.naver.com' in article_links[idx + 1]:
                new_urls.append(article_links[idx + 1])
                idx += 2
            else:
                new_urls.append(article_links[idx])
                idx += 1

        # 마지막 요소 처리
        if idx == len(article_links) - 1 and 'news.naver.com' not in new_urls[-1]:
            new_urls.append(article_links[idx])

        date_pattern = re.compile(r"\d{4}\.\d{2}\.\d{2}")

        current_dates = []
        current_press = []

        for idx, info in enumerate(article_infos):
            match = date_pattern.search(info)
            if match:
                current_dates.append(match.group(0))
                
                if idx > 0:  # 인덱스 0 앞에는 데이터가 없으므로 idx > 0 조건 추가
                    current_press.append(article_infos[idx - 1])

        articles.extend(new_urls)
        titles.extend(article_titles)
        dates.extend(current_dates)
        press.extend(current_press)

        time.sleep(1)
        
    return articles, titles, dates, press

In [80]:
def get_article_contents(article_urls):
    contents = []
    article_time = []  # 이름 변경
    for url in tqdm(article_urls):
        news = requests.get(url, headers=headers)
        soup = BeautifulSoup(news.text, "html.parser")
        
        content = soup.select("#dic_area") or soup.select("#articeBody")
        cleaned_content = re.sub('<[^>]*>', '', ''.join(str(item) for item in content)).replace("flash 오류를 우회하기 위한 함수 추가function _flash_removeCallback() {}", '')
        contents.append(cleaned_content)
        
        try:
            date = soup.select_one("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span").attrs['data-date-time']
        except AttributeError:
            date = re.sub('<[^>]*>', '', str(soup.select_one("#content > div.end_ct > div > div.article_info > span > em")))
        article_time.append(date)  # 이름 변경
    
    return contents, article_time  # 이름 변경


In [82]:
def main():
    search = input("검색할 키워드를 입력해주세요:")
    start_date = input("\n크롤링할 시작 날짜를 입력해주세요. ex)2022.01.01:")
    end_date = input("\n크롤링할 종료 날짜를 입력해주세요. ex)2022.12.31:")

    urls = makeUrl(search, start_date, end_date)
    article_urls, article_titles, article_dates, article_press = get_naver_articles(urls)
    
    contents, article_time = get_article_contents(article_urls)  # 이름 변경
    
    df = pd.DataFrame({'date': article_dates, 'time': article_time, 'title': article_titles, 'content': contents,'press': article_press, 'link': article_urls })  # 이름 변경
    df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    return df

if __name__ == "__main__":
    result_df = main()
    print(result_df)

100%|██████████| 20/20 [00:11<00:00,  1.72it/s]

          date    press                                               link  \
0   2017.01.01     전자신문  https://n.news.naver.com/mnews/article/030/000...   
1   2017.01.01   39면 1단  https://n.news.naver.com/mnews/article/030/000...   
2   2017.01.01   울산매일신문                              http://www.iusm.co.kr   
3   2017.01.02   4면 TOP  https://n.news.naver.com/mnews/article/029/000...   
4   2017.01.03     이데일리  https://n.news.naver.com/mnews/article/018/000...   
5   2017.01.03   지디넷코리아  https://n.news.naver.com/mnews/article/092/000...   
6   2017.01.03     IT조선                              http://it.chosun.com/   
7   2017.01.03   아이뉴스24  https://n.news.naver.com/mnews/article/031/000...   
8   2017.01.03     국방일보                        http://kookbang.dema.mil.kr   
9   2017.01.03   A2면 1단  https://n.news.naver.com/mnews/article/015/000...   
10  2017.01.04    오마이뉴스  https://n.news.naver.com/mnews/article/047/000...   
11  2017.01.05     이데일리  https://n.news.naver.com/mnews/article/




In [84]:
#데이터 프레임 저장
filename = '{}_{}_{}.csv'.format(search, start_date.replace(".", ""), end_date.replace(".", ""))
result_df.to_csv(filename, encoding='utf-8-sig', index=False)

In [75]:
search = input("검색할 키워드를 입력해주세요:")
start_date = input("\n크롤링할 시작 날짜를 입력해주세요. ex)2022.01.01:")
end_date = input("\n크롤링할 종료 날짜를 입력해주세요. ex)2022.12.31:")

urls = makeUrl(search, start_date, end_date)
article_urls, article_titles, article_dates, article_press = get_naver_articles(urls)

# 여기서 각 리스트의 길이를 확인합니다.
print("Length of article_urls:", len(article_urls))
print("Length of article_titles:", len(article_titles))
print("Length of article_dates:", len(article_dates))
print("Length of article_press:", len(article_press))


contents, article_time = get_article_contents(article_urls)  # 이름 변경

# 길이가 동일한지 확인
if len(article_titles) == len(article_dates)== len(article_press) == len(article_urls) == len(contents):
    df = pd.DataFrame({'title': article_titles, 'dates': article_dates,'press': article_press, 'link': article_urls, 'content': contents, 'time' : article_time})
    df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
    print(df)
else:
    print("The lengths of the lists are not equal. Dataframe cannot be created.")


  0%|          | 0/20 [00:00<?, ?it/s]

Length of article_urls: 20
Length of article_titles: 20
Length of article_dates: 20
Length of article_press: 20


100%|██████████| 20/20 [00:12<00:00,  1.58it/s]

                                              title       dates    press  \
0                           [2017 국제 핫이슈]블록체인 기술 확산  2017.01.01     전자신문   
1                       [신년기획]블록체인, 금융권 넘어 IT기업도 관심  2017.01.01   39면 1단   
2   [2017 신년특집] 세계 금융의 대변화-화폐없는 시대가 온다 디지털 가상화폐 ...  2017.01.01   울산매일신문   
3                 블록체인 등 최신 핀테크 기술 도입… `금융 한류` 일으키자  2017.01.02   4면 TOP   
4                         비트코인, 3년만에 처음으로 1000달러 돌파  2017.01.03     이데일리   
5                       '암호화 화폐' 비트코인, 3년만에 1천달러 돌파  2017.01.03   지디넷코리아   
6                  비즈니스로 진화한 사이버 범죄, ‘한국형 랜섬웨어’ 현실로  2017.01.03     IT조선   
7                             KISA, 암호기술팀·블록체인TF 신설  2017.01.03   아이뉴스24   
8                 교황청의 문장.잉글랜드 국기 모두 전쟁에서 만들어진 ‘심벌’  2017.01.03     국방일보   
9                                   비트코인 1000달러 넘었다  2017.01.03   A2면 1단   
10                          김기춘 위기에 빠트린 사건 세 가지의 결말  2017.01.04    오마이뉴스   
11                        비트코인 사상최고가…中 자본통제에 `풍선효과`  2017.01.05     이데일리   
12          


