In [None]:
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# Site root used to build both listing-page and article URLs.
base_url = 'https://www.reuters.com'
# Path of the archive listing; the page number is appended right after "page=".
page_endpoint = '/news/archive/goldMktRpt?page='

In [None]:
# Function to perform HTTP requests
def send_http_request(url, timeout=30):
    """Fetch `url` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole scrape
            indefinitely.

    Returns:
        str | None: The response text, or ``None`` when the request fails
        (connection error, timeout, or a non-2xx status). Failures are
        reported on stdout rather than raised so callers can skip the URL.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"HTTP Request Error: {e}")
        return None

In [None]:
# Function to scrape a page URL and return a list of article links
def scrape_page(page_number):
    """Collect the article URLs listed on one archive page.

    Args:
        page_number: Number appended to ``base_url + page_endpoint`` to form
            the listing URL.

    Returns:
        list[str]: Absolute article URLs in page order. Empty when the page
        cannot be downloaded or contains no ``div.story-content`` links.
    """
    page_url = f"{base_url}{page_endpoint}{page_number}"
    page_html = send_http_request(page_url)
    if not page_html:
        return []

    page_soup = BeautifulSoup(page_html, 'html.parser')
    article_links = []
    for article in page_soup.find_all('div', class_='story-content'):
        anchor = article.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # A story block without a usable link must not abort the page.
            continue
        # urljoin keeps absolute hrefs intact and resolves relative ones
        # against the site root, unlike plain string concatenation.
        article_links.append(urljoin(base_url, href))
    return article_links

In [None]:
# Function to scrape an article link and return article information
def _extract_article_text(article_soup):
    """Return the article body text from a parsed page, or '' if none is found."""
    article_pre_tag = article_soup.find('pre')
    if article_pre_tag:
        return article_pre_tag.text.replace('\n', '').strip()

    article_body_wrapper = article_soup.find('div', class_='ArticleBodyWrapper')
    if article_body_wrapper is None:
        return ''
    article_text_paragraphs = article_body_wrapper.find_all(
        'p', class_='Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x'
    )
    return ' '.join(
        paragraph.get_text(strip=True) for paragraph in article_text_paragraphs
    ).strip()


def scrape_article(article_link):
    """Download one article page and extract its metadata and body text.

    Metadata is read from the page's first ``application/ld+json`` script
    tag. The body text comes from a ``<pre>`` tag when present, otherwise
    from the paragraphs inside the ``ArticleBodyWrapper`` div.

    Args:
        article_link: URL of the article to scrape.

    Returns:
        dict | None: A dict with the keys ``date``, ``time``, ``link``,
        ``headline``, ``datePublished``, ``author``, ``type_of_author``,
        ``publisher``, ``type_of_publisher`` and ``text``. ``None`` when the
        page cannot be downloaded or its metadata is missing or malformed
        (the reason is printed).
    """
    article_html = send_http_request(article_link)
    if not article_html:
        return None

    article_soup = BeautifulSoup(article_html, 'html.parser')

    article_json_meta_data = article_soup.find('script', type="application/ld+json")
    if article_json_meta_data is None or not article_json_meta_data.string:
        print(f"No JSON-LD metadata found: {article_link}")
        return None

    try:
        json_data = json.loads(article_json_meta_data.string)

        # The timestamp must come from the parsed metadata; the result dict
        # has not been populated yet at this point.
        dt_obj = pd.to_datetime(json_data['datePublished'])

        article_info = {
            'date': str(dt_obj.date()),
            'time': str(dt_obj.time()),
            'link': article_link,
            'headline': json_data['headline'],
            'datePublished': json_data['datePublished'],
            'author': json_data['author']['name'],
            'type_of_author': json_data['author']['@type'],
            'publisher': json_data['publisher']['name'],
            'type_of_publisher': json_data['publisher']['@type'],
        }
    except (KeyError, TypeError, ValueError) as e:
        # ValueError also covers invalid JSON and unparseable dates.
        print(f"Malformed article metadata for {article_link}: {e}")
        return None

    # Stored under its own key so it no longer overwrites 'type_of_publisher'.
    article_info['text'] = _extract_article_text(article_soup)
    return article_info


In [None]:
# Function to read the last scraped page number from a file
def read_last_page_number(filename, default=1):
    """Read the checkpointed page number stored in `filename`.

    Args:
        filename: Path of the checkpoint file; expected to hold one integer,
            optionally surrounded by whitespace.
        default: Page number to return when there is no usable checkpoint.

    Returns:
        int: The stored page number, or `default` when the file does not
        exist or does not contain an integer (e.g. it is empty because a
        previous write was interrupted).
    """
    try:
        with open(filename, 'r') as file:
            return int(file.read().strip())
    except FileNotFoundError:
        return default  # No checkpoint yet
    except ValueError:
        print(f"Checkpoint file {filename!r} does not contain a page number; using page {default}.")
        return default

# Function to write the last scraped page number to a file
def write_last_page_number(filename, page_number):
    """Overwrite `filename` with the string form of `page_number`."""
    with open(filename, 'w') as checkpoint:
        checkpoint_text = str(page_number)
        checkpoint.write(checkpoint_text)

In [None]:
# Function to scrape articles from a specified range of pages and save them to CSV
def scrape_pages_and_save_csv(start_page, end_page, batch_size=10):
    """Scrape archive pages from `start_page` down to `end_page` and save them.

    Pages are visited in descending order, both bounds inclusive. Scraped
    articles are buffered and handed to ``save_to_csv`` whenever the buffer
    reaches `batch_size`; whatever is left at the end is saved as well. After
    each successfully listed page its number is written to
    ``last_scraped_page.txt`` so a later run can resume from it.

    Args:
        start_page: First (highest) page number to scrape.
        end_page: Last (lowest) page number to scrape.
        batch_size: Number of buffered articles that triggers a CSV save.
    """
    # NOTE(review): save_to_csv is not defined in the visible cells - confirm
    # it is defined before this function is called.
    article_info_list = []
    last_page_processed = None

    for page_number in range(start_page, end_page - 1, -1):
        article_links = scrape_page(page_number)
        if not article_links:
            print(f"Failed to scrape page {page_number}. Skipping...")
            continue

        for article_link in article_links:
            article_info = scrape_article(article_link)
            if not article_info:
                continue
            article_info_list.append(article_info)
            if len(article_info_list) >= batch_size:
                save_to_csv(article_info_list, f'articles_page_{page_number}.csv')
                article_info_list = []

        # Checkpoint once per page, after all of its articles were attempted.
        write_last_page_number('last_scraped_page.txt', page_number)
        last_page_processed = page_number

    # Save any remaining articles under the last page that was processed.
    # NOTE(review): this can reuse the file name of a batch saved during that
    # same page - confirm whether save_to_csv appends or overwrites.
    if article_info_list:
        save_to_csv(article_info_list, f'articles_page_{last_page_processed}.csv')

In [None]:
# Request a scrape from the checkpointed page down to page 1.
# NOTE: when no checkpoint file exists, read_last_page_number falls back to
# page 1 (not 394), so only page 1 is scraped on a first run.
scrape_pages_and_save_csv(
    start_page=read_last_page_number('last_scraped_page.txt'),
    end_page=1,
)