In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Base url 
base_url = "https://www.reuters.com"

In [3]:
# Endpoint to the page containing articles
page_endpoint = "/news/archive/goldMktRpt?page="

# Here focus is on one page however there are 394 more pages to be considered
page_number = 60

# Full page url created by combining the base url, endpoint and the page number
# Pagination
page_url = f"{base_url}{page_endpoint}{page_number}"

# Send a request to the page url
response = requests.get(page_url)
if response.status_code != 200:
    print(f"Failed to retrieve page {page_number}")

# Soup object for the page and parsing it
page_soup = BeautifulSoup(response.text, 'html.parser')

# Saving the contents of the page for offline access and inspection 
with open("news_page.html", "w", encoding = 'utf-8') as file:
    ## prettify the soup object and convert it into a string  
    file.write(str(page_soup.prettify()))

In [4]:
# Data structure to store article infor
article_infor = {}
articles = page_soup.find_all('div', class_='story-content')
for article in articles:
    print(f"{base_url}{article.a.attrs['href']}")
    print(article.a.text.strip())
    print(article.find('time').text)

https://www.reuters.com/article/global-precious/precious-gold-gains-for-third-week-on-banking-sector-tension-idUSL4N35P00X
PRECIOUS-Gold gains for third week on banking sector tension

Mar 16 2023

https://www.reuters.com/article/global-precious/precious-gold-prices-hold-firm-as-banking-worries-persist-idUSL4N35O3T6
PRECIOUS-Gold prices hold firm as banking worries persist

Mar 16 2023

https://www.reuters.com/article/global-precious/precious-gold-steady-but-off-6-wk-peak-as-credit-suisse-lifeline-lifts-sentiment-idUSL4N35O2BE
PRECIOUS-Gold steady but off 6-wk peak as Credit Suisse lifeline lifts sentiment

Mar 16 2023

https://www.reuters.com/article/global-precious/precious-gold-stalls-as-traders-strap-in-for-more-banking-news-idUSL1N35O03T
PRECIOUS-Gold stalls as traders strap in for more banking news

Mar 16 2023

https://www.reuters.com/article/global-precious/precious-gold-prices-rise-as-global-banking-crisis-spurs-safe-haven-demand-idUSL4N35N52B
PRECIOUS-Gold prices rise as glob

In [79]:
# Article link is parsed next
article_link = f"{base_url}{articles[0].a.attrs['href']}"
article_infor["link"] = article_link
article_link

[32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m

In [80]:
# Sending http request to article link
response = requests.get(article_link)
if response.status_code != 200:
    print(f"Failed to retrieve article with link :  {article_link}")

article_soup = BeautifulSoup(response.text, 'html.parser')
# open the file in w mode
# set encoding to UTF-8
with open("article_page.html", "w", encoding = 'utf-8') as file:
    
    # prettify the soup object and convert it into a string  
    file.write(str(article_soup.prettify()))

In [81]:
# Meta data contains most of the information about an article
article_json_meta_data = article_soup.find('script', type="application/ld+json")
article_json_meta_data

[1m<[0m[1;95mscript[0m[39m [0m[33mtype[0m[39m=[0m[32m"application[0m[32m/ld+json"[0m[39m>[0m[1;39m{[0m[32m"@context"[0m[39m:[0m[32m"http://schema.org"[0m[39m,[0m[32m"@type"[0m[39m:[0m[32m"NewsArticle"[0m[39m,[0m[32m"mainEntityOfPage"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"WebPage"[0m[39m,[0m[32m"@id"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[1;39m}[0m[39m,[0m[32m"headline"[0m[39m:[0m[32m"PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[39m,[0m[32m"image"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"ImageObject"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[39m,[0m[32m"width"[0m[39m:[0m[32m"800"[0m[39m,[0m[32m"height"[0m[39m:[0m[32m"800"

In [82]:
# content contains a string representation of a json on=bject containing all infor needed
article_json_meta_data.contents[0]

[32m'[0m[32m{[0m[32m"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":[0m[32m{[0m[32m"@type":"WebPage","@id":"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[32m}[0m[32m,"headline":"PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal","url":"https://www.reuters.com/article/global-precious-idUSL4N29I246","image":[0m[32m{[0m[32m"@type":"ImageObject","url":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"width":"800","height":"800"[0m[32m}[0m[32m,"thumbnailUrl":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"dateCreated":"2020-01-13T11:35:26Z","dateModified":"2020-01-13T13:05:08Z","datePublished":"2020-01-13T11:35:26Z","author":[0m[32m{[0m[32m"@type":"Person","name":"K. Sathya Narayanan"[0m[32m}[0m[32m,"publisher":[0m[32m{[0m[32m"@type":"Organization","name":"Reuters","logo":[0m[32m{[0m[32m

In [83]:
# Converting string to json object for easy extraction of data
import json

json_data = json.loads(article_json_meta_data.contents[0])
json_data


[1m{[0m
    [32m'@context'[0m: [32m'http://schema.org'[0m,
    [32m'@type'[0m: [32m'NewsArticle'[0m,
    [32m'mainEntityOfPage'[0m: [1m{[0m
        [32m'@type'[0m: [32m'WebPage'[0m,
        [32m'@id'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I246'[0m
    [1m}[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'url'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I246'[0m,
    [32m'image'[0m: [1m{[0m
        [32m'@type'[0m: [32m'ImageObject'[0m,
        [32m'url'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
        [32m'width'[0m: [32m'800'[0m,
        [32m'height'[0m: [32m'800'[0m
    [1m}[0m,
    [32m'thumbnailUrl'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
    [32m'dateCreated'[0m: [

In [84]:
# Article infor to be acquired:
#       date
#       time
#       link *
#       author
#       type_of_author
#       publisher
#       type_of_publisher
#       headline *
#       full_text


print(json_data['headline']) # Article headline
print(json_data['datePublished']) # publish date and time of article, this is in Zulu time so conversion to CAT needed
print(json_data['author']['name']) # Name of author'name'
print(json_data['author']['@type']) # Type of author
print(json_data['publisher']['name']) # Name of publisher
print(json_data['publisher']['@type']) # Type of publisher

PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal
2020-01-13T11:35:26Z
K. Sathya Narayanan
Person
Reuters
Organization


In [85]:
# Adding data into article infor dict
article_infor['headline'] = json_data['headline']
article_infor['datePublished'] = json_data['datePublished']
article_infor['author'] = json_data['author']['name']
article_infor['type_of_author'] = json_data['author']['@type']
article_infor['publisher'] = json_data['publisher']['name']
article_infor['type_of_publisher'] = json_data['publisher']['@type']

article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m
[1m}[0m

In [1]:
# Full article text is located in the only pre in the page but not on all pages
# Some of the pages the article is contained between multiple p tags 
# Check if there is a <pre> tag containing the article text
article_pre_tag = article_soup.find('pre')

if article_pre_tag:
    # If a <pre> tag is found, process the article as originally posted
    article_text = article_pre_tag.text
    article_text = article_text.replace('\n', '').strip()
else:
    # If there is no <pre> tag, process the article using paragraphs
    article_body_wrapper = article_soup.find('div', class_='ArticleBodyWrapper')
    
    # Find all <p> tags within the ArticleBodyWrapper
    article_text_paragraphs = article_body_wrapper.find_all('p', class_='Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x')
    
    # Initialize an empty list to store paragraphs of the article
    article_paragraphs = []

    # Iterate through the <p> tags and extract text
    for paragraph in article_text_paragraphs:
        paragraph_text = paragraph.get_text(strip=True)  # Get the text and remove leading/trailing whitespace
        article_paragraphs.append(paragraph_text)

    # Concatenate the paragraphs to form the full article text
    article_text = ' '.join(article_paragraphs)
    article_text = article_text.strip()

In [88]:
# Adding article text to article_info
article_info['full_text'] = article_text

In [89]:
article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m,
    [32m'full_text'[0m: [32m'[0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m    * U.S., China due to sign trade deal on Wednesday    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday    By K. Sathya Narayanan    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimismin equity markets ahead of the signing of an interim U.S.-Chinatrade deal and lack of further escalation in Middle Easttensions diminished bullion\'s s

In [90]:
# Converting the date to utc from zulu time
import pandas as pd
utc_dt = article_infor['datePublished'].replace("Z","UTC")
dt_obj = pd.to_datetime(utc_dt)
date = dt_obj.date()
time = dt_obj.time()
article_infor['date'] = str(date)
article_infor['time'] = str(time)

In [91]:
article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m,
    [32m'full_text'[0m: [32m'[0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m    * U.S., China due to sign trade deal on Wednesday    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday    By K. Sathya Narayanan    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimismin equity markets ahead of the signing of an interim U.S.-Chinatrade deal and lack of further escalation in Middle Easttensions diminished bullion\'s s