In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

In [3]:
def scrape_cnn_sport_article(url):
    """Scrape the headline metadata of a single CNN article page.

    Args:
        url: Address of the article page to download.

    Returns:
        A one-element list holding ``[timestamp, title, first_paragraph,
        author]``. Any field whose element is not found in the page is the
        string ``'N/A'``.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    def text_or_na(tag):
        # Look each element up once and fall back to the placeholder.
        return tag.get_text(strip=True) if tag else 'N/A'

    # Extract title
    title = text_or_na(soup.find('h1', class_='headline__text'))

    # Extract publication time, dropping the leading "Published" label
    timestamp_div = soup.find('div', class_='timestamp')
    timestamp = timestamp_div.get_text(strip=True).replace('Published', '').strip() if timestamp_div else 'N/A'

    # Extract author: the name span lives inside the byline container
    author_div = soup.find('div', class_='byline__names')
    author = text_or_na(author_div.find('span', class_='byline__name') if author_div else None)

    # Extract first paragraph
    first_paragraph = text_or_na(soup.find('p', class_='paragraph'))

    data = [[timestamp, title, first_paragraph, author]]
    return data
# Single-article demo: scrape one known story and export it as a one-row sheet.
url = 'https://www.cnn.com/2024/07/23/sport/paris-olympics-security-threats-spt-intl/index.html'
data = scrape_cnn_sport_article(url)

# Tabulate the scraped row under fixed headers, then write it without the index column.
df = pd.DataFrame(
    data=data,
    columns=['Date Time', 'News Title', 'First Paragraph', 'Author'],
)
df.to_excel('sport.xlsx', index=False)

print("Data scraping and saving to Excel file completed.")

Data scraping and saving to Excel file completed.


In [8]:
def scrape_article_details(url):
    """Download one article page and pull out its headline metadata.

    Args:
        url: Address of the article page to fetch.

    Returns:
        ``[timestamp, title, first_paragraph, author]``. A field is ``'N/A'``
        when its element is not found in the page; all four are ``'N/A'``
        when the request fails (the error is printed rather than raised).
    """
    missing = 'N/A'

    # Fetch first; any requests-level failure short-circuits to a placeholder row.
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        html = page.content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return [missing] * 4

    soup = BeautifulSoup(html, 'html.parser')

    # Headline
    headline = soup.find('h1', class_='headline__text')
    title = headline.get_text(strip=True) if headline else missing

    # Publication time, with the leading "Published" label removed
    stamp_node = soup.find('div', class_='timestamp')
    if stamp_node:
        timestamp = stamp_node.get_text(strip=True).replace('Published', '').strip()
    else:
        timestamp = missing

    # Author name: a span nested inside the byline container
    byline = soup.find('div', class_='byline__names')
    name_span = byline.find('span', class_='byline__name') if byline else None
    author = name_span.get_text(strip=True) if name_span else missing

    # Opening paragraph
    lead = soup.find('p', class_='paragraph')
    first_paragraph = lead.get_text(strip=True) if lead else missing

    return [timestamp, title, first_paragraph, author]

def scrape_cnn_sport():
    """Scrape every 2024 sport article linked from the CNN sport front page.

    Returns:
        A list with one row per distinct article link, in the order the
        links first appear on the page. Each row is whatever
        ``scrape_article_details`` returns for that link.

    Raises:
        requests.exceptions.RequestException: If the front page cannot be
            fetched (connection problem, timeout, or HTTP error status).
    """
    url = 'https://www.cnn.com/sport'
    # Bound the wait and fail loudly on an error status, so an error page is
    # never mistaken for "no articles found".
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect article links. A dict is used as an ordered set so that a story
    # linked several times on the page is only scraped once.
    unique_links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/2024') and '/sport' in href:
            unique_links.setdefault('https://www.cnn.com' + href, None)
    article_links = list(unique_links)

    data = []
    for link in article_links:
        data.append(scrape_article_details(link))
        time.sleep(1)  # Wait for 1 second before next request to avoid overwhelming the server

    return data

# Collect one row per article linked from the sport front page.
data = scrape_cnn_sport()

# Tabulate the rows under fixed headers and write them out without the index column.
df = pd.DataFrame(
    data=data,
    columns=['Date Time', 'News Title', 'First Paragraph', 'Author'],
)
df.to_excel('sport.xlsx', index=False)

print("Data scraping and saving to Excel file completed.")

Data scraping and saving to Excel file completed.


In [None]:
def scrape_article_details(url):
    """Download one article page and pull out its headline metadata.

    Args:
        url: Address of the article page to fetch.

    Returns:
        ``[timestamp, title, first_paragraph, author]``. A field is ``'N/A'``
        when its element is not found in the page; all four are ``'N/A'``
        when the request fails (the error is printed rather than raised).
    """
    missing = 'N/A'

    # Fetch first; any requests-level failure short-circuits to a placeholder row.
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
        html = page.content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return [missing] * 4

    soup = BeautifulSoup(html, 'html.parser')

    # Headline
    headline = soup.find('h1', class_='headline__text')
    title = headline.get_text(strip=True) if headline else missing

    # Publication time, with the leading "Published" label removed
    stamp_node = soup.find('div', class_='timestamp')
    if stamp_node:
        timestamp = stamp_node.get_text(strip=True).replace('Published', '').strip()
    else:
        timestamp = missing

    # Author name: a span nested inside the byline container
    byline = soup.find('div', class_='byline__names')
    name_span = byline.find('span', class_='byline__name') if byline else None
    author = name_span.get_text(strip=True) if name_span else missing

    # Opening paragraph
    lead = soup.find('p', class_='paragraph')
    first_paragraph = lead.get_text(strip=True) if lead else missing

    return [timestamp, title, first_paragraph, author]

def scrape_cnn_football():
    """Scrape every 2024 football article linked from the CNN football page.

    Returns:
        A list with one row per distinct article link, in the order the
        links first appear on the page. Each row is whatever
        ``scrape_article_details`` returns for that link.

    Raises:
        requests.exceptions.RequestException: If the football page cannot be
            fetched (connection problem, timeout, or HTTP error status).
    """
    url = 'https://www.cnn.com/sport/football'
    # Bound the wait and fail loudly on an error status, so an error page is
    # never mistaken for "no articles found".
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect article links. A dict is used as an ordered set so that a story
    # linked several times on the page is only scraped once.
    unique_links = {}
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.startswith('/2024') and '/football' in href:
            unique_links.setdefault('https://www.cnn.com' + href, None)
    article_links = list(unique_links)

    data = []
    for link in article_links:
        data.append(scrape_article_details(link))
        time.sleep(1)  # Wait for 1 second before next request to avoid overwhelming the server

    return data

# File path for the existing data
file_path = 'sport.xlsx'

# Scrape data from the football category
new_data = scrape_cnn_football()
new_df = pd.DataFrame(new_data, columns=['Date Time', 'News Title', 'First Paragraph', 'Author'])

# Load existing data if the file exists
if os.path.exists(file_path):
    # keep_default_na=False: by default pandas parses the string 'N/A' as NaN,
    # which would turn the placeholders written by earlier runs into blanks.
    existing_df = pd.read_excel(file_path, keep_default_na=False)
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    combined_df = new_df

# Drop exact repeats so re-running this cell does not keep appending the same
# articles (this also collapses repeated all-'N/A' rows from failed fetches).
combined_df = combined_df.drop_duplicates().reset_index(drop=True)

# Save the combined data to Excel
combined_df.to_excel(file_path, index=False)

print("Data scraping and saving to Excel file completed.")