In [1]:
# 필요한 라이브러리 import
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.common.keys import Keys
import time
import warnings
warnings.filterwarnings('ignore')

In [5]:
base_url = "https://www.bbc.com"
start_url = "https://www.bbc.com/news/business"

# 해당 페이지에서 각 기사들의 url 뽑아오는 함수
def get_articles_urls(soup):
    return [base_url + li.a["href"] for li in soup.select("li.lx-stream__post-container")], [li.find('article').find('div').find('div').find('time').find_all('span')[1].text for li in soup.select("li.lx-stream__post-container")]


# 해당 페이지 기사들의 정보 뽑아오는 코드
def get_page_articles(driver):
    driver_source = driver.page_source
    soup = BeautifulSoup(driver_source, 'html.parser')
    article_urls, article_times = get_articles_urls(soup)

    article_titles = []
    article_time_save = []
    article_contents = []
    article_relateds = []
    for article_url, article_time in zip(article_urls, article_times):
        article_title, article_content, article_related = extract_single_article_content(article_url)
        if article_title and article_content:
            article_titles.append(article_title)
            article_time_save.append(article_time)
            article_contents.append(article_content)
            article_relateds.append(article_related)
    return article_titles, article_time_save, article_contents, article_relateds, soup


# 한 기사의 제목, 본문, 태그들 가져오는 함수
def extract_single_article_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # 기사 제목 가져오기
    title = soup.select_one("h1")
    if title:
        title = title.get_text(strip=True)
    else:
        return None, None, None
    
    # 기사 내용 가져오기
    paragraphs = []
    p_elements = soup.select("div.ssrcss-7uxr49-RichTextContainer.e5tfeyi1 > p")
    
    if not p_elements: # 비어있는 경우를 확인하여 에러를 방지합니다.
        return None, None, None
    
    last_p_element = p_elements[-1]
    # div.ssrcss-7uxr49-RichTextContainer.e5tfeyi1 클래스 하위의 p 요소들을 가져오기
    for p in p_elements:
        # 마지막 문단의 a, i에 태그가 나올 땐 날려버려야 하는 경우가 있다.
        if p == last_p_element:
            # p 요소 안에 있는 모든 a와 i 태그 삭제
            for tag in p.find_all(["i", "a"]):
                tag.decompose()
        # p 요소의 텍스트만 추출하여 paragraphs에 추가
        paragraphs.append(p.text.strip())

    content = "\n".join(paragraphs)
    
    # 기사 태그 가져오기
    tags = soup.select('div.ssrcss-1qmkvfu-TopicListWrapper.etw6iwl1 > div.ssrcss-1szabdv-StyledTagContainer.ed0g1kj1 > div.ssrcss-17ehax8-Cluster.e1ihwmse1 > ul.ssrcss-1ujonwb-ClusterItems.e1ihwmse0 > li')
    if not tags:
        return None, None, None
    
    related = ', '.join([tag.get_text() for tag in tags])
       

    return title, content, related



In [None]:
# 크롬 드라이버 설치, 연결
s = Service("D:\chromedriver.exe")
driver = webdriver.Chrome(service=s)

# 브라우저 화면 크기 변경하기
driver.maximize_window()

# 웹 페이지 열기
url = "https://www.bbc.com/news/business"
driver.get(url)

# 데이터 프레임 초기화
df_business = pd.DataFrame(columns=["Title", "Time", "Content", "Related"])

# title과 content를 가져오기 위해 get_page_articles 함수 호출
article_titles, article_time_save, article_contents, article_relateds, soup = get_page_articles(driver)

body = driver.find_elements('css selector', 'body')[0]
for i in range(17):
    body.send_keys(Keys.PAGE_DOWN)
    
btn = driver.find_elements('css selector', 'div > div.gel-icon.gel-icon--next')[0]

while True:
    try:
        # 데이터 프레임에 저장
        for title, time_save, content, related in zip(article_titles, article_time_save, article_contents, article_relateds):
            if title in df_business:
                continue
            else:
                df_business = df_business.append({"Title": title, "Time": time_save, "Content": content, "Related": related}, ignore_index=True)
        
        # 500개 까지만 모으기
        if len(df_business) > 500:
                break

        print('모인 기사 수: ' + str(len(df_business)))

        time.sleep(1)

        btn.click()

        # 페이지를 변경한 후에도 다음 뉴스 목록을 가져올 수 있도록 변경된 페이지에 대한 soup 객체 생성
        driver_source = driver.page_source
        page_soup = BeautifulSoup(driver_source, 'html.parser')

        # 변경된 페이지에 대한 정보를 크롤링하고 순환하는 데 사용
        article_titles, article_time_save, article_contents, aricle_relateds, soup = get_page_articles(driver)

    except:
        btn.click()
    

# 웹 드라이버 종료
driver.quit()

# 데이터 프레임 출력
df_business

모인 기사 수: 17
모인 기사 수: 34
모인 기사 수: 49


In [22]:
# 중복 기사 제거
df_business = df_business.drop_duplicates()

In [25]:
df_business

Unnamed: 0,Title,Time,Content,Related
0,Meta OKs deleting Threads without losing Insta...,20:22,Changes are being rolled out to the Threads ap...,"Social media, Meta, Instagram, Social media re..."
1,Pay rises outstrip inflation by most for two y...,19:37,Wages have risen faster than inflation by the ...,"Unemployment, Employment, UK economy, Pay, Off..."
2,Why businesses are pulling billions in profits...,13:00,Foreign businesses have been pulling money out...,"Companies, China-US relations"
3,"Cost-of-living payments: What are they, who ge...",9:03,Millions of low-income households across the U...,"Pensioners, Money, Personal finance, Energy in..."
4,"'We go to work to serve customers, not to be a...",1:34,"Jo Crumplin, a team leader at a convenience st...","Companies, Co-operative Group, Retailing, Crime"
...,...,...,...,...
476,Mortgage rates: The full pain of this chaos is...,6:01 24 Jun,In the run-up to this week's shock interest ra...,"UK Finance, Money, Personal finance, Housing m..."
477,Sunny weather sees people splash out on new cl...,21:05 23 Jun,Shoppers have been splashing out on new outfit...,"Tourism, Retailing, London"
478,Furby: Toy giant Hasbro brings back iconic rob...,20:11 23 Jun,US toy giant Hasbro has brought back the iconi...,"Personal finance, Inflation, Cost of living, U..."
479,Interest rates: Bank of England boss denies wa...,2:59 23 Jun,The Bank of England boss has denied trying to ...,"US economy, Airbnb"


In [26]:
df_business.to_excel('business.xlsx', index=False)