In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Base url 
base_url = "https://www.reuters.com"

In [4]:
# Endpoint to the page containing articles
page_endpoint = "/news/archive/goldMktRpt?page="

# Here focus is on one page however there are 394 more pages to be considered
page_number = 394

# Full page url created by combining the base url, endpoint and the page number
# Pagination
page_url = f"{base_url}{page_endpoint}{page_number}"

# Send a request to the page url
response = requests.get(page_url)
if response.status_code != 200:
    print(f"Failed to retrieve page {page_number}")

# Soup object for the page and parsing it
page_soup = BeautifulSoup(response.text, 'html.parser')

# Saving the contents of the page for offline access and inspection 
with open("news_page.html", "w", encoding = 'utf-8') as file:
    ## prettify the soup object and convert it into a string  
    file.write(str(page_soup.prettify()))

In [5]:
# Data structure to store article infor
article_infor = {}
articles = page_soup.find_all('div', class_='story-content')
for article in articles:
    print(f"{base_url}{article.a.attrs['href']}")
    print(article.a.text.strip())

https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246
PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal
https://www.reuters.com/article/global-precious/precious-gold-slips-as-asian-equities-rise-ahead-of-sino-u-s-trade-deal-idUSL4N29I09V
PRECIOUS-Gold slips as Asian equities rise ahead of Sino-U.S. trade deal
https://www.reuters.com/article/global-precious/precious-gold-dips-on-sino-u-s-trade-optimism-firm-dollar-idUSL4N29I04S
PRECIOUS-Gold dips on Sino-U.S. trade optimism, firm dollar
https://www.reuters.com/article/global-precious/precious-gold-firms-on-renewed-u-s-iran-tensions-eyes-fifth-weekly-gain-idUSL4N29F31F
PRECIOUS-Gold firms on renewed U.S.-Iran tensions, eyes fifth weekly gain
https://www.reuters.com/article/global-precious/precious-gold-extends-fall-as-fading-middle-east-tensions-boost-risk-appetite-idUSL4N29F29I
PRECIOUS-Gold extends fall as fading Middle East tensions boost risk appetite
https://www.reuters

In [15]:
# Article link is parsed next
article_link = f"{base_url}{articles[0].a.attrs['href']}"
article_infor["link"] = article_link
article_link

[32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m

In [16]:
# Sending http request to article link
response = requests.get(article_link)
if response.status_code != 200:
    print(f"Failed to retrieve article with link :  {article_link}")

article_soup = BeautifulSoup(response.text, 'html.parser')
# open the file in w mode
# set encoding to UTF-8
with open("article_page.html", "w", encoding = 'utf-8') as file:
    
    # prettify the soup object and convert it into a string  
    file.write(str(article_soup.prettify()))

In [8]:
# Meta data contains most of the information about an article
article_json_meta_data = article_soup.find('script', type="application/ld+json")
article_json_meta_data

[1m<[0m[1;95mscript[0m[39m [0m[33mtype[0m[39m=[0m[32m"application[0m[32m/ld+json"[0m[39m>[0m[1;39m{[0m[32m"@context"[0m[39m:[0m[32m"http://schema.org"[0m[39m,[0m[32m"@type"[0m[39m:[0m[32m"NewsArticle"[0m[39m,[0m[32m"mainEntityOfPage"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"WebPage"[0m[39m,[0m[32m"@id"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[1;39m}[0m[39m,[0m[32m"headline"[0m[39m:[0m[32m"PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[39m,[0m[32m"image"[0m[39m:[0m[1;39m{[0m[32m"@type"[0m[39m:[0m[32m"ImageObject"[0m[39m,[0m[32m"url"[0m[39m:[0m[32m"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[39m,[0m[32m"width"[0m[39m:[0m[32m"800"[0m[39m,[0m[32m"height"[0m[39m:[0m[32m"800"

In [17]:
# content contains a string representation of a json on=bject containing all infor needed
article_json_meta_data.contents[0]

[32m'[0m[32m{[0m[32m"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":[0m[32m{[0m[32m"@type":"WebPage","@id":"https://www.reuters.com/article/global-precious-idUSL4N29I246"[0m[32m}[0m[32m,"headline":"PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal","url":"https://www.reuters.com/article/global-precious-idUSL4N29I246","image":[0m[32m{[0m[32m"@type":"ImageObject","url":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"width":"800","height":"800"[0m[32m}[0m[32m,"thumbnailUrl":"https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800"[0m[32m,"dateCreated":"2020-01-13T11:35:26Z","dateModified":"2020-01-13T13:05:08Z","datePublished":"2020-01-13T11:35:26Z","author":[0m[32m{[0m[32m"@type":"Person","name":"K. Sathya Narayanan"[0m[32m}[0m[32m,"publisher":[0m[32m{[0m[32m"@type":"Organization","name":"Reuters","logo":[0m[32m{[0m[32m

In [18]:
# Converting string to json object for easy extraction of data
import json

json_data = json.loads(article_json_meta_data.contents[0])
json_data


[1m{[0m
    [32m'@context'[0m: [32m'http://schema.org'[0m,
    [32m'@type'[0m: [32m'NewsArticle'[0m,
    [32m'mainEntityOfPage'[0m: [1m{[0m
        [32m'@type'[0m: [32m'WebPage'[0m,
        [32m'@id'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I246'[0m
    [1m}[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'url'[0m: [32m'https://www.reuters.com/article/global-precious-idUSL4N29I246'[0m,
    [32m'image'[0m: [1m{[0m
        [32m'@type'[0m: [32m'ImageObject'[0m,
        [32m'url'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
        [32m'width'[0m: [32m'800'[0m,
        [32m'height'[0m: [32m'800'[0m
    [1m}[0m,
    [32m'thumbnailUrl'[0m: [32m'https://s1.reutersmedia.net/resources_v2/images/rcom-default.png?[0m[32mw[0m[32m=[0m[32m800[0m[32m'[0m,
    [32m'dateCreated'[0m: [

In [14]:
# Article infor to be acquired:
#       date
#       time
#       link *
#       author
#       type_of_author
#       publisher
#       type_of_publisher
#       headline *
#       full_text


print(json_data['headline']) # Article headline
print(json_data['datePublished']) # publish date and time of article, this is in Zulu time so conversion to CAT needed
print(json_data['author']['name']) # Name of author'name'
print(json_data['author']['@type']) # Type of author
print(json_data['publisher']['name']) # Name of publisher
print(json_data['publisher']['@type']) # Type of publisher

PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal
2020-01-13T11:35:26Z
K. Sathya Narayanan
Person
Reuters
Organization


In [19]:
# Adding data into article infor dict
article_infor['headline'] = json_data['headline']
article_infor['datePublished'] = json_data['datePublished']
article_infor['author'] = json_data['author']['name']
article_infor['type_of_author'] = json_data['author']['@type']
article_infor['publisher'] = json_data['publisher']['name']
article_infor['type_of_publisher'] = json_data['publisher']['@type']

article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m
[1m}[0m

In [40]:
# Full article text is located in the only pre in the page
article_text = article_soup.find('pre').text
article_text

[32m' [0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m\n    * U.S., China due to sign trade deal on Wednesday\n    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday\n\n    By K. Sathya Narayanan\n    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimism\nin equity markets ahead of the signing of an interim U.S.-China\ntrade deal and lack of further escalation in Middle East\ntensions diminished bullion\'s safe-haven appeal.\n    The U.S.-China Phase 1 agreement is due to be signed at the\nWhite House on Wednesday.             \n    Spot gold        dipped 0.5% to $1,553.60 per ounce as of\n1246 GMT, having fallen 1% to $1,546.27 earlier in the session.\n    U.S. gold futures         fell 0.4% to $1,554.50.\n    "We are struggling [0m[32m([0m[32ma[0m[32m)[0m[32m little bit with the details. It\'ll be\nquite interesting to see if there is any concrete guidance in\nthe details of the phase-one deal," said Julius Baer analy

In [41]:
# Clean article text 
article_text = article_text.replace('\n', '').strip()
article_text

[32m'[0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m    * U.S., China due to sign trade deal on Wednesday    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday    By K. Sathya Narayanan    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimismin equity markets ahead of the signing of an interim U.S.-Chinatrade deal and lack of further escalation in Middle Easttensions diminished bullion\'s safe-haven appeal.    The U.S.-China Phase 1 agreement is due to be signed at theWhite House on Wednesday.                 Spot gold        dipped 0.5% to $1,553.60 per ounce as of1246 GMT, having fallen 1% to $1,546.27 earlier in the session.    U.S. gold futures         fell 0.4% to $1,554.50.    "We are struggling [0m[32m([0m[32ma[0m[32m)[0m[32m little bit with the details. It\'ll bequite interesting to see if there is any concrete guidance inthe details of the phase-one deal," said Julius Baer analystCarsten Menke.    "Also, the ne

In [42]:
# Adding article text to article infor
article_infor['full_text'] = article_text

In [43]:
article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m,
    [32m'full_text'[0m: [32m'[0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m    * U.S., China due to sign trade deal on Wednesday    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday    By K. Sathya Narayanan    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimismin equity markets ahead of the signing of an interim U.S.-Chinatrade deal and lack of further escalation in Middle Easttensions diminished bullion\'s s

In [69]:
# Converting the date to utc from zulu time
from datetime import datetime as dt
utc_dt = article_infor['datePublished'].replace("Z","UTC")
dt_obj = pd.to_datetime(utc_dt)
date = dt_obj.date()
time = dt_obj.time()
article_infor['date'] = str(date)
article_infor['time'] = str(time)

In [70]:
article_infor


[1m{[0m
    [32m'link'[0m: [32m'https://www.reuters.com/article/global-precious/precious-gold-slips-1-ahead-of-u-s-china-trade-deal-idUSL4N29I246'[0m,
    [32m'headline'[0m: [32m'PRECIOUS-Gold slips 1% ahead of U.S.-China trade deal'[0m,
    [32m'datePublished'[0m: [32m'2020-01-13T11:35:26Z'[0m,
    [32m'author'[0m: [32m'K. Sathya Narayanan'[0m,
    [32m'type_of_author'[0m: [32m'Person'[0m,
    [32m'publisher'[0m: [32m'Reuters'[0m,
    [32m'type_of_publisher'[0m: [32m'Organization'[0m,
    [32m'full_text'[0m: [32m'[0m[32m([0m[32mUpdates prices[0m[32m)[0m[32m    * U.S., China due to sign trade deal on Wednesday    * SPDR Gold holdings fall to lowest since Sept. 16 on Friday    By K. Sathya Narayanan    Jan 13 [0m[32m([0m[32mReuters[0m[32m)[0m[32m - Gold prices fell 1% on Monday as optimismin equity markets ahead of the signing of an interim U.S.-Chinatrade deal and lack of further escalation in Middle Easttensions diminished bullion\'s s

In [None]:
# Function to scrap article_links from page url
