# Guardian API access

In [1]:
#import required libraries
import html
import json
import re
import time

import requests

In [2]:
# Load your personal API key from a separate text file so the key itself
# never has to be typed into the notebook source.
file_path = "../../private/guardian_key.txt"
# Explicit encoding keeps the read independent of the platform default.
with open(file_path, 'r', encoding='utf-8') as file:
    # strip() drops the trailing newline most editors add to the file
    key = file.read().strip()
# Show only the length as a sanity check -- never display the key itself.
len(key)

36

In [3]:
# Build a search URL.
# The query string is already percent-encoded (spaces written as %20) and
# combines its terms with AND / OR.
base_url = 'https://content.guardianapis.com/'
search_string = "(indigenous%20OR%20first%20nations%20OR%20aboriginal)%20AND%20government%20AND%20(policy%20OR%20initiative%20OR%20program%20OR%20funding%20OR%20aid%20OR%20assistance)%20AND%20(economy%20OR%20innovation%20OR%20job%20creation%20OR%20employment)%20AND%20(boost%20OR%20grow%20OR%20increase%20OR%20build%20OR%20create%20OR%20develop)"
production_office = "aus"
from_date = "2015-01-01"

# Everything except the credential, kept separate so it can be displayed safely.
query_url = base_url+f"search?q={search_string}&production-office={production_office}&from-date={from_date}&show-fields=body"
full_url = query_url+f"&api-key={key}"

# Print the URL with the key masked: cell output is saved with the notebook,
# so printing full_url would leak the secret to anyone the notebook is shared with.
print(query_url+"&api-key=<REDACTED>")

https://content.guardianapis.com/search?q=(indigenous%20OR%20first%20nations%20OR%20aboriginal)%20AND%20government%20AND%20(policy%20OR%20initiative%20OR%20program%20OR%20funding%20OR%20aid%20OR%20assistance)%20AND%20(economy%20OR%20innovation%20OR%20job%20creation%20OR%20employment)%20AND%20(boost%20OR%20grow%20OR%20increase%20OR%20build%20OR%20create%20OR%20develop)&production-office=aus&from-date=2015-01-01&show-fields=body&api-key=<REDACTED>


In [4]:
# Get data from the server.
# A timeout stops the cell from hanging indefinitely if the API never answers.
server_response = requests.get(full_url, timeout=30)
server_data = server_response.json()
resp_data = server_data.get('response','')
# Reset on every run so a failed request cannot silently leave the results
# of an earlier, successful run behind in the kernel.
results = []
# Treat both a missing 'response' envelope and an envelope that explicitly
# reports a non-ok status as failures; the latter would otherwise raise a
# KeyError on 'total' below.
if resp_data == '' or resp_data.get('status', 'ok') != 'ok':
    print("ERROR obtaining results:",server_data)
else:
    print("SUCCESS!")
    print(f"{resp_data['total']} results found available in {resp_data['pages']} pages")
    print(f"{resp_data['pageSize']} results per page")
    results = resp_data.get('results',[])

SUCCESS!
110 results found available in 11 pages
10 results per page


In [5]:
# Inspect the first search result to see which keys each result carries.
results[0]

{'id': 'australia-news/live/2023/sep/14/australia-politics-live-qantas-yes-voice-smoke-sydney-greens-federal-funding-schools-covid-cruise-ship',
 'type': 'liveblog',
 'sectionId': 'australia-news',
 'sectionName': 'Australia news',
 'webPublicationDate': '2023-09-14T08:17:23Z',
 'webTitle': 'Developer ‘deeply regrets’ comments – as it happened',
 'webUrl': 'https://www.theguardian.com/australia-news/live/2023/sep/14/australia-politics-live-qantas-yes-voice-smoke-sydney-greens-federal-funding-schools-covid-cruise-ship',
 'apiUrl': 'https://content.guardianapis.com/australia-news/live/2023/sep/14/australia-politics-live-qantas-yes-voice-smoke-sydney-greens-federal-funding-schools-covid-cruise-ship',
 'isHosted': False,
 'pillarId': 'pillar/news',
 'pillarName': 'News'}

In [6]:
# Number of result pages the API reported for this query.
num_pages = resp_data['pages']
num_pages

11

In [7]:
def articles_from_page_results(page_results):
    """Convert one page of search results into a ``{title: plain_text}`` dict.

    Args:
        page_results: iterable of result dicts. Each must contain
            'webPublicationDate' and 'webTitle'; the article HTML is read
            from ``result['fields']['body']`` when present.

    Returns:
        dict mapping ``"<webTitle> [<webPublicationDate>]"`` to the article
        body with HTML tags removed and HTML entities decoded. A result
        without a body maps to an empty string.

    Raises:
        KeyError: if a result lacks 'webPublicationDate' or 'webTitle'.
    """
    articles = {}
    for result in page_results:
        article_date = result['webPublicationDate']
        # The date is appended so two articles sharing a headline get distinct keys.
        article_title = result['webTitle']+f" [{article_date}]"
        # 'fields' / 'body' may be absent (e.g. if the request did not ask for
        # show-fields=body); fall back to an empty body instead of a KeyError.
        article_html = (result.get('fields') or {}).get('body') or ''
        # DOTALL lets '.' cross newlines, so a tag split over several lines is
        # still removed.
        article_text = re.sub(r'<.*?>','',article_html, flags=re.DOTALL)
        # Decode entities (&amp;, &nbsp;, ...) only after stripping tags, so an
        # escaped '&lt;...&gt;' in the text is never mistaken for a real tag.
        article_text = html.unescape(article_text)
        articles[article_title] = article_text
    return articles

In [8]:
def get_all_articles_for_response(response_json,full_url,delay=1,timeout=30):
    """Collect the articles from every page of a paginated search response.

    Page 1 is taken from ``response_json`` (already downloaded); pages
    2..N are requested by appending ``&page=<n>`` to ``full_url``.

    Args:
        response_json: the 'response' object of the first page; must contain
            'pages', 'total' and 'results'.
        full_url: the search URL that produced ``response_json``, without a
            ``page`` parameter.
        delay: seconds to wait between consecutive page requests.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        dict of all articles, as produced by ``articles_from_page_results``.
        Pages are merged with ``dict.update``, so entries with identical keys
        overwrite one another.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        requests.Timeout: if a page request exceeds ``timeout``.
    """
    total_pages = response_json['pages']
    total_articles = response_json['total']
    print(f"Fetching {total_articles} articles from {total_pages} pages...")
    all_articles = {}
    page1_articles = articles_from_page_results(response_json['results'])
    all_articles.update(page1_articles)
    print("Added articles for page: 1")

    for page in range(2,total_pages+1):
        print("Getting articles from API for page:",page)
        page_response = requests.get(full_url+f"&page={page}", timeout=timeout)
        # Fail with a clear HTTP error (e.g. rate limiting) rather than an
        # opaque KeyError / JSON error on the lines below.
        page_response.raise_for_status()
        page_data = page_response.json()['response']
        print("Processing results for page:",page_data['currentPage'])
        page_articles = articles_from_page_results(page_data['results'])
        print(f"Fetched {len(page_articles)} articles.")
        all_articles.update(page_articles)
        print("Added articles for page:",page)
        print(f"Status: {len(all_articles)} articles.")
        if page < total_pages:
            # make sure we're not hitting the API too hard; no need to wait
            # once the last page has been fetched
            time.sleep(delay)

    print(f"FINISHED: Fetched {len(all_articles)} articles.")
    return all_articles

In [9]:
# Download the remaining pages for the query; page 1 is reused from resp_data.
my_articles = get_all_articles_for_response(resp_data,full_url)

Fetching 110 articles from 11 pages...
Added articles for page: 1
Getting articles from API for page: 2
Processing results for page: 2
Fetched 10 articles.
Added articles for page: 2
Status: 20 articles.
Getting articles from API for page: 3
Processing results for page: 3
Fetched 10 articles.
Added articles for page: 3
Status: 30 articles.
Getting articles from API for page: 4
Processing results for page: 4
Fetched 10 articles.
Added articles for page: 4
Status: 40 articles.
Getting articles from API for page: 5
Processing results for page: 5
Fetched 10 articles.
Added articles for page: 5
Status: 50 articles.
Getting articles from API for page: 6
Processing results for page: 6
Fetched 10 articles.
Added articles for page: 6
Status: 60 articles.
Getting articles from API for page: 7
Processing results for page: 7
Fetched 10 articles.
Added articles for page: 7
Status: 70 articles.
Getting articles from API for page: 8
Processing results for page: 8
Fetched 10 articles.
Added articles f

In [10]:
print("Total Articles:",len(my_articles))
# Only the titles are needed, so iterate over the dict's keys directly
# instead of unpacking article text that is never used.
for title in my_articles:
    print(title)

Total Articles: 110
Developer ‘deeply regrets’ comments – as it happened [2023-09-14T08:17:23Z]
Bob Carr urges New Zealand not to join Aukus – as it happened [2024-04-17T08:12:08Z]
Treasurer delivers budget speech – as it happened [2024-05-14T11:55:11Z]
Dutton delivers federal budget reply speech – as it happened [2024-05-16T10:54:54Z]
Premier defends Games compensation; Black Lives Matter rally in Sydney – as it happened [2023-08-19T07:22:42Z]
Indigenous group condemns Jacinta Nampijinpa Price’s ‘denial of history’ – as it happened [2023-09-20T08:26:46Z]
Vast Aukus spending sparks calls to boost Australia’s aid budget [2023-03-29T14:00:40Z]
David Shoebridge says Julian Assange ‘may not survive’ trial and extradition – as it happened [2024-03-21T06:53:15Z]
ALP national conference day 2 – as it happened [2023-08-18T06:36:34Z]
Hastie’s defence comments ‘unhinged and misleading’, Conroy says – as it happened [2024-02-20T07:18:38Z]
Two charged over building fire – as it happened [2023-08-2

In [11]:
# Save the title -> text mapping as a JSON file next to the notebook.
with open("indigenous_articles.json", mode='w', encoding='utf-8') as out_file:
    # json.dump serialises straight to the file handle; the content is the
    # same as writing json.dumps(my_articles).
    json.dump(my_articles, out_file)