#### Libraries used

 Requests & BeautifulSoup: Used for fetching and parsing HTML pages.
     
 CSV & JSON: For saving scraped data.

 Logging: Logs important messages and errors.
     
 Threading and Time: `time` measures execution time. `threading` is imported for issuing multiple requests concurrently, but the code shown here does not use it yet.

 Selenium and WebDriver: Used for handling JavaScript-rendered pages.

#### Logging:

Sets up logging to a file named 'scraper.log'

Logs INFO level messages and above.

It includes timestamp, log level, and message


In [34]:
import csv
import json
import logging
import threading
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Route log records of level INFO and above to the file 'scraper.log'.
# Each record is written as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    filename='scraper.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
)



In [35]:
def fetch_page(url, use_selenium=False):
    """
    Return the HTML of ``url`` as text, or None when the download fails.

    With ``use_selenium=True`` the work is delegated to
    ``fetch_with_selenium`` (for pages rendered by JavaScript). Otherwise
    a plain HTTP GET is issued with ``requests``, which is enough for
    static pages.
    """
    # Dynamic pages: hand off to the browser-based fetcher right away.
    if use_selenium:
        return fetch_with_selenium(url)

    # A browser-like User-Agent header is sent with the request.
    browser_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        reply = requests.get(url, headers=browser_headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions so they are logged below.
        reply.raise_for_status()
        return reply.text
    except requests.exceptions.RequestException as err:
        logging.error(f"Failed to fetch {url}: {err}")
        return None

##### Main function to fetch page content.
##### Uses regular requests by default and switches to Selenium if needed (for JavaScript content).
##### Requests works if the page is static, while Selenium works if the page loads its content dynamically using JavaScript.
##### Uses a user-agent header to mimic a real browser.
##### Implements error handling to log failures.

In [36]:
def fetch_with_selenium(
    url,
    wait_selector="[data-testid='edinburgh-card'], [data-testid='london-card']",
    timeout=10,
):
    """Return the JavaScript-rendered HTML of ``url`` using headless Chrome.

    Args:
        url: Address of the page to load.
        wait_selector: CSS selector that must match at least one element
            before the page source is captured. The default covers both
            card types that ``parse_articles`` looks for.
        timeout: Maximum number of seconds to wait for ``wait_selector``.

    Returns:
        The page source as a string, or None if the browser could not be
        started, the page failed to load, or the wait timed out. Errors are
        logged rather than raised.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")  # Run without opening a window
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=options)
        driver.get(url)

        # Block until the article cards have been inserted into the DOM.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
        )

        return driver.page_source
    except Exception as e:
        logging.error(f"Selenium error: {e}")
        return None
    finally:
        # Always shut the browser down, including when loading or waiting
        # failed; otherwise the Chrome process is left running.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                logging.error(f"Selenium error while closing the browser: {e}")


##### Uses Selenium WebDriver to fetch pages requiring JavaScript rendering.
##### Runs in headless mode (without opening a browser window).
##### Waits until specific elements ([data-testid='edinburgh-card']) are loaded.
##### The driver opens the webpage and waits until news articles appear before extracting content.
##### Returns page HTML source.
##### Properly closes the browser and handles errors


In [37]:
#Extracts title, summary, date, and link from the HTML content.
#Uses BeautifulSoup to find elements with specific data attributes.

def parse_articles(html, base_url='https://www.bbc.com'):
    """Extract article records from BBC News listing HTML.

    Args:
        html: HTML text of a listing page. A falsy value (None or an empty
            string) yields an empty list.
        base_url: URL against which relative article links are resolved.

    Returns:
        A list of dicts with the keys 'title', 'date', 'summary' and
        'link'. 'date' and 'summary' are empty strings when the card does
        not provide them. Cards without a headline or without a usable link
        are skipped.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')
    articles = []

    # Card containers are identified by their data-testid attribute.
    for article in soup.select("[data-testid='edinburgh-card'], [data-testid='london-card']"):
        try:
            title_elem = article.select_one("h2[data-testid='card-headline']")
            link_elem = article.select_one("a[data-testid='internal-link']")
            if title_elem is None or link_elem is None:
                continue

            href = link_elem.get('href')
            if not href:
                continue

            summary_elem = article.select_one("p[data-testid='card-description']")
            time_elem = article.select_one("time")

            articles.append({
                'title': title_elem.get_text(strip=True),
                'date': time_elem.get('datetime', '') if time_elem is not None else '',
                'summary': summary_elem.get_text(strip=True) if summary_elem is not None else '',
                # urljoin leaves absolute URLs untouched and correctly
                # resolves root-relative, path-relative and
                # protocol-relative ("//host/...") hrefs.
                'link': urljoin(base_url, href),
            })
        except Exception as e:
            # Best effort: one malformed card must not abort the whole page.
            logging.error(f"Error parsing article: {e}")
            continue

    return articles


##### Parses HTML using BeautifulSoup.
##### Finds article elements using CSS selectors.
##### Handles missing elements.
##### Logs parsing errors.

##### Extracts:

Title

Link (converts to absolute URL if relative)

Summary (if available)

Publication date (if available).

In [38]:
#Pagination handling
#Looks for the "Next" button in the page navigation. Extracts the next page’s URL to continue scraping.

def get_next_page_url(soup, current_url):
    """Return the absolute URL of the next results page, or None.

    Args:
        soup: Parsed page; it is queried with ``select_one`` for an anchor
            whose ``aria-label`` is "Next".
        current_url: URL of the page ``soup`` was parsed from. Relative
            hrefs are resolved against it.

    Returns:
        The next page's URL as a string, or None when there is no "Next"
        link or the link has no (or an empty) ``href``.
    """
    next_button = soup.select_one('a[aria-label="Next"]')
    if next_button is None:
        return None

    href = next_button.attrs.get('href')
    if not href:
        # An empty href would resolve to the current page itself.
        return None

    # urljoin handles absolute URLs, root-relative paths, path-relative
    # links and query-only links such as "?page=2".
    return urljoin(current_url, href)


##### Finds the "Next" page button in the pagination.
##### Constructs absolute URL if the href is relative.
##### Returns None if no next page exists.

In [39]:
#Loops through multiple pages to collect articles.
#Tries requests first, then switches to Selenium if no articles are found.

def scrape_news(base_url, max_pages=5):
    """Scrape article records from ``base_url`` and the pages that follow it.

    Each page is first fetched with a plain HTTP request. If that HTML
    contains no articles, the page is fetched again through Selenium. When
    the Selenium attempt fails, the statically fetched HTML is kept so that
    pagination can still be followed.

    Args:
        base_url: URL of the first listing page.
        max_pages: Maximum number of pages to process.

    Returns:
        A list with the article dicts of all processed pages, in the order
        they were encountered. Empty if the first page cannot be fetched.
    """
    all_articles = []
    visited = set()
    current_url = base_url
    pages_scraped = 0

    while current_url and pages_scraped < max_pages:
        if current_url in visited:
            # Pagination points back to a page that was already processed.
            logging.warning("Pagination loop detected at %s; stopping.", current_url)
            break
        visited.add(current_url)
        logging.info("Scraping %s", current_url)

        # First try with requests
        html = fetch_page(current_url)
        if not html:
            break

        # Parse once and reuse the result instead of re-parsing the page.
        articles = parse_articles(html)

        # If no articles found, try with Selenium
        if not articles:
            rendered_html = fetch_page(current_url, use_selenium=True)
            if rendered_html:
                html = rendered_html
                articles = parse_articles(rendered_html)

        all_articles.extend(articles)
        pages_scraped += 1

        # Get next page URL
        soup = BeautifulSoup(html, 'html.parser')
        current_url = get_next_page_url(soup, current_url)

    return all_articles

##### Main scraping logic.
##### Handles pagination up to max_pages.
##### First tries regular requests, falls back to Selenium if no articles found.
##### Collects all articles in a list.
##### Logs progress.

In [40]:
#Saves scraped data as CSV or JSON.
#Writes column headers for CSV.

def save_data(data, filename, format='csv'):
    """Write ``data`` to ``filename`` as CSV or JSON.

    Args:
        data: List of dict records. When it is empty (or otherwise falsy) a
            warning is logged and nothing is written.
        filename: Path of the output file; it is overwritten if it exists.
        format: 'csv' or 'json' (case-insensitive). Any other value is
            logged as an error and no file is written.

    Returns:
        None.
    """
    if not data:
        logging.warning("No data to save!")
        return

    fmt = str(format).lower()

    # Files are written as UTF-8 so special characters are preserved.
    if fmt == 'csv':
        # Use the union of all keys (in first-seen order) as the header so
        # that records with extra or missing fields can still be written.
        fieldnames = list(dict.fromkeys(key for row in data for key in row))
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
    elif fmt == 'json':
        with open(filename, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable instead of
            # emitting \uXXXX escapes.
            json.dump(data, f, indent=4, ensure_ascii=False)
    else:
        logging.error(f"Unsupported format {format!r}; expected 'csv' or 'json'.")

##### Saves data in either CSV or JSON format.
##### Handles UTF-8 encoding properly.
##### Creates proper headers in CSV.
##### Pretty-prints JSON with indentation.
##### Warns if no data to save.

In [41]:
#Main Execution Block

#Measures execution time.
if __name__ == "__main__":
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    base_url = "https://www.bbc.com/news"

    # Check initial access. The timeout keeps an unresponsive server from
    # hanging the script, and a network failure is reported, not raised.
    try:
        response = requests.get(
            base_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10
        )
        print(f"Initial request status code: {response.status_code}")
    except requests.exceptions.RequestException as e:
        logging.error(f"Initial request to {base_url} failed: {e}")
        print(f"Initial request failed: {e}")

    # Start scraping
    print("Starting scraping process...\n")
    articles = scrape_news(base_url, max_pages=2)  # Start with 2 pages

    if articles:
        # Print scraped articles in a readable format
        print(f"\nSuccessfully scraped {len(articles)} articles:\n")
        print("-" * 80)
        for i, article in enumerate(articles, 1):
            print(f"ARTICLE {i}:")
            print(f"Title: {article['title']}")
            if article['date']:
                print(f"Date: {article['date']}")
            if article['summary']:
                print(f"Summary: {article['summary']}")
            print(f"Link: {article['link']}")
            print("-" * 80)

        # Save to files
        save_data(articles, "bbc_news_articles.csv", format='csv')
        save_data(articles, "bbc_news_articles.json", format='json')
        print("\nData saved to 'bbc_news_articles.csv' and 'bbc_news_articles.json'")
    else:
        print("\nNo articles were scraped. Check the log file for errors.")

    # Compute the duration once so the console and the log report the same value.
    elapsed = time.perf_counter() - start_time
    print(f"\nScraping completed in {elapsed:.2f} seconds.")
    logging.info(f"Scraping completed in {elapsed:.2f} seconds.")

2025-03-25 14:25:55,038 - INFO - Scraping https://www.bbc.com/news


Initial request status code: 200
Starting scraping process...



2025-03-25 14:25:55,874 - INFO - Scraping completed in 1.32 seconds.



Successfully scraped 2 articles:

--------------------------------------------------------------------------------
ARTICLE 1:
Title: Bitcoin in the bush - the crypto mine in remote Zambia
Summary: Bitcoin miners will go to remote locations to take advantage of cheap electricity.
Link: https://www.bbc.com/news/articles/cly4xe373p4o
--------------------------------------------------------------------------------
ARTICLE 2:
Title: Trump bemoans a portrait of him - but gets a new one from Putin
Summary: A "distorted" portrait in the US was removed after the president complained, but he was "touched" by a Russian gift.
Link: https://www.bbc.com/news/articles/c62xyrr20dxo
--------------------------------------------------------------------------------

Data saved to 'bbc_news_articles.csv' and 'bbc_news_articles.json'

Scraping completed in 1.32 seconds.


##### Entry point when script is run directly.
##### Times the scraping process.
##### Prints progress and results to console.
##### Saves data to both CSV and JSON.
##### Provides detailed output of scraped articles.
##### Includes performance metrics.