In [1]:
import csv
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests # type: ignore
from bs4 import BeautifulSoup # type: ignore

def get_article_info(article_url, timeout=10):
    """Fetch one article page and extract its basic metadata.

    Args:
        article_url: Absolute URL of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        A dict with the keys ``title``, ``author``, ``summary``,
        ``published_date``, ``reading_time`` and ``url``, or ``None`` when
        the page could not be retrieved (the error is printed).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        print(f"Failed to retrieve article: {article_url}. Error: {e}")
        return None

    # Raw bytes are handed to BeautifulSoup, which detects the encoding
    # itself; overriding response.encoding would only affect response.text.
    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.find("h1", class_="headline__text")
    title = title.get_text(strip=True) if title else "No title found"

    # Prefer the visible byline. A <meta name="author"> tag has no text, so
    # its value must be read from the "content" attribute, not get_text().
    author = ""
    byline = soup.find("span", class_="byline__name")
    if byline:
        author = byline.get_text(strip=True)
    if not author:
        meta_author = soup.find("meta", {"name": "author"})
        if meta_author:
            author = (meta_author.get("content") or "").strip()
    if not author:
        author = "No author listed"

    date = soup.find("div", class_="timestamp vossi-timestamp")
    if date:
        date_text = date.get_text(strip=True)
        # Matches e.g. "Sat October 26, 2024" inside the timestamp text.
        date_match = re.search(r"\b[A-Za-z]{3}\s[A-Za-z]+\s\d{1,2},\s\d{4}\b", date_text)
        date = date_match.group(0) if date_match else "No date found"
    else:
        date = "No date found"

    # The summary is simply the text of the first two <p> elements on the page.
    paragraphs = soup.find_all("p")
    summary = " ".join(p.get_text(strip=True) for p in paragraphs[:2])

    reading_time = soup.find("div", class_="headline__sub-description")
    reading_time = reading_time.get_text(strip=True) if reading_time else ""

    return {
        "title": title,
        "author": author,
        "summary": summary,
        "published_date": date,
        "reading_time": reading_time,
        "url": article_url
    }

def scrape_cnn_articles(timeout=10):
    """Collect article URLs linked from headlines on the CNN homepage.

    Args:
        timeout: Seconds to wait for the homepage response before giving up.

    Returns:
        A list of absolute article URLs in the order they first appear on
        the page, with duplicates removed. Returns an empty list when the
        homepage cannot be retrieved (the error is printed).
    """
    base_url = "https://edition.cnn.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }

    try:
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to retrieve homepage. Error: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    headlines = soup.find_all("span", class_="container__headline-text")

    article_links = []
    seen = set()
    for headline in headlines:
        parent_link = headline.find_parent("a", href=True)
        if not parent_link or not parent_link["href"]:
            continue
        # urljoin resolves root-relative, protocol-relative and plain
        # relative hrefs against the site root; absolute hrefs pass through.
        article_url = urljoin(base_url, parent_link["href"])
        # The same story can be linked from several homepage sections;
        # keep only its first occurrence so it is not scraped twice.
        if article_url in seen:
            continue
        seen.add(article_url)
        article_links.append(article_url)

    return article_links

def save_to_csv(data, filename="cnn_articles.csv"):
    """Write article records to a UTF-8 CSV file, overwriting any existing file.

    Args:
        data: Iterable of dicts keyed by the column names listed below.
        filename: Destination path of the CSV file.
    """
    columns = ["title", "author", "summary", "published_date", "reading_time", "url"]
    # newline='' lets the csv module control line endings itself.
    with open(filename, "w", newline='', encoding="utf-8") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        for record in data:
            sheet.writerow(record)

if __name__ == "__main__":
    # Script entry point: gather homepage links, scrape them on a small
    # thread pool, and write whatever was scraped successfully to CSV.
    max_articles = 100
    scraped_data = []
    article_links = scrape_cnn_articles()
    targets = article_links[:max_articles]  # cap the number of pages fetched

    with ThreadPoolExecutor(max_workers=5) as executor:
        # Map each pending future back to the URL it is fetching.
        pending = {}
        for link in targets:
            pending[executor.submit(get_article_info, link)] = link

        # Results are handled in completion order, not submission order.
        for finished in as_completed(pending):
            article_info = finished.result()
            if not article_info:
                # Failed fetches return None and are skipped.
                continue
            scraped_data.append(article_info)
            print(f"Scraped article: {article_info['title']}")

    if not scraped_data:
        print("No articles were scraped.")
    else:
        save_to_csv(scraped_data)
        print(f"Scraped {len(scraped_data)} articles and saved to cnn_articles.csv.")

Scraped article: Can New York Yankees recover from last-gasp World Series Game 1 loss against Los Angeles Dodgers in Game 2?
Scraped article: Israel launched strikes on Iran in a retaliatory attack. Here’s what we know
Scraped article: Russian strike on Ukrainian hospital used to treat soldiers kills at least five
Scraped article: Airline cracks down on ‘gate lice’ who skip the line
Scraped article: Georgia’s pro-Western president condemns ‘troubling incidents of violence’ at polling stations with crucial vote underway
Scraped article: The rape claims that tie the Menendez case to Menudo: ‘There might be other victims,’ doc producer says
Scraped article: With worst-case scenario averted for now, White House and Harris campaign still face a serious risk in Middle East conflict
Scraped article: Israel launched strikes on Iran in a retaliatory attack. Here’s what we know
Scraped article: No title found
Scraped article: Retired colonel explains significance of Israel’s strike against Iran
