In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Base url 
base_url = "https://www.reuters.com"

In [3]:
# Endpoint to the page containing articles
page_endpoint = "/news/archive/goldMktRpt?page="

# Here focus is on one page however there are 394 more pages to be considered
page_number = 394

# Full page url created by combining the base url, endpoint and the page number
# Pagination
page_url = f"{base_url}{page_endpoint}{page_number}"

# Send a request to the page url
response = requests.get(page_url)
if response.status_code != 200:
    print(f"Failed to retrieve page {page_number}")

# Soup object for the page and parsing it
page_soup = BeautifulSoup(response.text, 'html.parser')

# Saving the contents of the page for offline access and inspection 
with open("news_page.html", "w", encoding = 'utf-8') as file:
    ## prettify the soup object and convert it into a string  
    file.write(str(page_soup.prettify()))

In [8]:
# Data structure to store article infor
# Article infor to be acquired:
#       date
#       time
#       link
#       author
#       type_of_author
#       publisher
#       type_of_publisher
#       headline
#       full_text

article_infor = {}
articles = page_soup.find_all('div', class_='story-content')
for article in articles:
    print(f"{base_url}{article.a.attrs['href']}")
    print(article.a.text.strip())

https://www.reuters.com/article/global-precious/precious-gold-slips-as-asian-equities-rise-ahead-of-sino-u-s-trade-deal-idUSL4N29I09V
PRECIOUS-Gold slips as Asian equities rise ahead of Sino-U.S. trade deal
https://www.reuters.com/article/global-precious/precious-gold-dips-on-sino-u-s-trade-optimism-firm-dollar-idUSL4N29I04S
PRECIOUS-Gold dips on Sino-U.S. trade optimism, firm dollar
https://www.reuters.com/article/global-precious/precious-gold-firms-on-renewed-u-s-iran-tensions-eyes-fifth-weekly-gain-idUSL4N29F31F
PRECIOUS-Gold firms on renewed U.S.-Iran tensions, eyes fifth weekly gain
https://www.reuters.com/article/global-precious/precious-gold-extends-fall-as-fading-middle-east-tensions-boost-risk-appetite-idUSL4N29F29I
PRECIOUS-Gold extends fall as fading Middle East tensions boost risk appetite
https://www.reuters.com/article/global-precious/precious-gold-inches-lower-amid-rising-equities-as-mideast-tensions-fade-idUSL4N29F0TM
PRECIOUS-Gold inches lower amid rising equities as M

In [10]:
article_link = f"{base_url}{articles[0].a.attrs['href']}"
article_link

[32m'https://www.reuters.com/article/global-precious/precious-gold-slips-as-asian-equities-rise-ahead-of-sino-u-s-trade-deal-idUSL4N29I09V'[0m

In [11]:

response = requests.get(article_link)
if response.status_code != 200:
    print(f"Failed to retrieve article with link :  {article_link}")

article_soup = BeautifulSoup(response.text, 'html.parser')
# open the file in w mode
# set encoding to UTF-8
with open("article_page.html", "w", encoding = 'utf-8') as file:
    
    # prettify the soup object and convert it into a string  
    file.write(str(article_soup.prettify()))

In [29]:
article_json_meta_data = article_soup.find('script', type="application/ld+json")
article_json_meta_data

[1m<[0m[1;95mscript[0m[39m [0m[33mtype[0m[39m=[0m[32m"application[0m[32m/ld+json"[0m[39m>[0m[1;39m{[0m[32m"@context"[0m[39m:[0m[32m"http://schema.org"[0m[39m,[0m[32m"@type"[0m[39m:[0m[32m"NewsArticle"[0m[39m,[0m[32m"mainEntityOfPage"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"WebPage"[0m[39m,[0m[32m"@id"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I09V"[0m[1;39m}[0m[39m,[0m[32m"headline"[0m[39m:[0m[32m"PRECIOUS-Gold slips as Asian equities rise ahead of Sino-U.S. trade deal"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I09V"[0m[39m,[0m[32m"image"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"ImageObject"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[39m,[0m[32m"width"[0m[39m:[0m[32m"800"[0m[39m,[0m[32m"height"[0m

In [30]:
article_json_meta_data.contents[0]

[32m'[0m[32m{[0m[32m"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":[0m[32m{[0m[32m"@type":"WebPage","@id":"https://www.reuters.com/article/global-precious-idUSL4N29I09V"[0m[32m}[0m[32m,"headline":"PRECIOUS-Gold slips as Asian equities rise ahead of Sino-U.S. trade deal","url":"https://www.reuters.com/article/global-precious-idUSL4N29I09V","image":[0m[32m{[0m[32m"@type":"ImageObject","url":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"width":"800","height":"800"[0m[32m}[0m[32m,"thumbnailUrl":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"dateCreated":"2020-01-13T05:00:07Z","dateModified":"2020-01-13T08:10:19Z","datePublished":"2020-01-13T05:00:07Z","author":[0m[32m{[0m[32m"@type":"Person","name":"Asha Sistla"[0m[32m}[0m[32m,"publisher":[0m[32m{[0m[32m"@type":"Organization","name":"Reuters","logo":[0m[32

In [31]:
import json

json_data = json.loads(article_json_meta_data.contents[0])
json_data


[1m{[0m
    [32m'@context'[0m: [32m'http://schema.org'[0m,
    [32m'@type'[0m: [32m'NewsArticle'[0m,
    [32m'mainEntityOfPage'[0m: [1m{[0m
        [32m'@type'[0m: [32m'WebPage'[0m,
        [32m'@id'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I09V'[0m
    [1m}[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips as Asian equities rise ahead of Sino-U.S. trade deal'[0m,
    [32m'url'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I09V'[0m,
    [32m'image'[0m: [1m{[0m
        [32m'@type'[0m: [32m'ImageObject'[0m,
        [32m'url'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
        [32m'width'[0m: [32m'800'[0m,
        [32m'height'[0m: [32m'800'[0m
    [1m}[0m,
    [32m'thumbnailUrl'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
    [32m'd

In [27]:
json_data['datePublished']
json_data['author']

[32m'2020-01-13T05:00:07Z'[0m

In [13]:
article_text = article_soup.find('pre').text
article_text

[32m' [0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m\n    * U.S., China due to sign trade deal on Wednesday\n    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday\n    * Gold specs raise bullish positions in week to Jan. 7\n\n    By Asha Sistla\n    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell on Monday as Asian\nstocks touched 19-month highs ahead of the planned signing of an\ninterim trade deal between Washington and Beijing, which has\nencouraged investors to plough back into riskier assets.\n    Spot gold        dipped 0.6% to $1,552.42 per ounce by 0756\nGMT. U.S. gold futures         fell 0.4% to $1,553.30.\n    Asian shares rose to new 19-month highs ahead of the Phase 1\ndeal due to be signed at the White House on Wednesday.\n           \n    "Risk [0m[32m([0m[32msentiment[0m[32m)[0m[32m looks pretty good in Asia, equity inflows\nare coming [0m[32m([0m[32malong[0m[32m)[0m[32m nicely, built around this trade narrative 