In [113]:
# Import Libraries
import requests
from bs4 import BeautifulSoup
import re
import json
import pandas as pd

In [114]:
# Download html of main page
urls = [('https://www.abc.net.au/news/business/','business'),('https://www.abc.net.au/news/sport/','sport')] # List of URL tuples with url and classification

for url in urls:
    filename = "abc_" + url[1] + "_inital_page.txt"   
    url = url[0]
    
    try:       
        page = requests.get(url) # Get main page
        
        if page.status_code == 200: # if successfull write to file
            with open(filename,mode="w",encoding="utf-8") as file:
                file.write(page.text)
                file.close()
            print('Saved to File')
        else:
            print('HTTP Error')
    except Exception as  e:
        print(e)



Saved to File
Saved to File


In [115]:
# Parse main page HTMLs

articles = {}

for url in urls:
    article_category = url[1]
    filename = "abc_" + article_category + "_inital_page.txt"  
    url = url[0]


    with open(filename,mode="r",encoding="utf-8") as file:
        page = file.read()

    soup = BeautifulSoup(page, 'html.parser')



    div = soup.find_all('h3', {"data-component":"CardHeading"}) # tag containing article heading and link

    def is_valid_target_business_link(article_link):
            # Check if the article is a business article (excludes video and radio)
            valid_link = False
            regex_strings = ['/news/\d', '/news/rural/'] # Regex stings for valid url's
            for regex_string in regex_strings:
                x = re.search(regex_string, article_link) 
                if x: valid_link = True
            return valid_link

    # Loop through all divs with articles and extract relevant details from 
    for d in div:
        a_tag = d.find('a') # Get a tag in div
        if a_tag is not None: # check if there is an a tag         
            article_link = a_tag.get('href') # Get Article link from the a_tag            
            valid_link = is_valid_target_business_link(article_link) # Check for valid business url
            if valid_link: # Add to articles list
                article_description = a_tag.contents[0]
                article = {}
                article['description'] = article_description
                article['article_category'] = article_category
                articles[article_link] = article

print('Main Page Articles Length: ', len(articles))




Main Page Articles Length:  56


In [116]:
# Download more Articles (need up to 100) from ABC API
business_api_url = ('https://www.abc.net.au/news-web/api/loader/channelrefetch?name=PaginationArticles&documentId=12785638&prepareParams={%22imagePosition%22:{%22mobile%22:%22right%22,%22tablet%22:%22right%22,%22desktop%22:%22right%22}}&loaderParams={%22pagination%22:{%22size%22:5}}&offset=5&size=100&total=250','business')
sport_api_url = ('https://www.abc.net.au/news-web/api/loader/channelrefetch?name=PaginationArticles&documentId=12785658&prepareParams=%7B%22imagePosition%22:%7B%22mobile%22:%22right%22,%22tablet%22:%22right%22,%22desktop%22:%22right%22%7D%7D&loaderParams=%7B%22pagination%22:%7B%22size%22:5%7D%7D&offset=5&size=100&total=250','sport')

api_urls = [business_api_url,sport_api_url]


for api_url in api_urls:
    article_category = api_url[1]
    filename = "abc_" + article_category + "_additional_articles_raw_json.txt"  
    api_url = api_url[0]
    try:
        response = requests.get(api_url)

        if response.status_code == 200:
            with open(filename,mode="w",encoding="utf-8") as file:
                file.write(response.text)
                file.close()
            print('Saved to File')
        else:
            print('HTTP Error')
    except Exception as  e:
        print(e)



Saved to File
Saved to File


In [117]:

for url in urls:
    article_category = url[1]
    filename = "abc_" + article_category + "_additional_articles_raw_json.txt"   

    # Open JSON with second list of articles from API
    with open(filename,mode="r",encoding="utf-8") as file:# Open file object
        abc_json = file.read() # Read objecto to string

    abc_json = json.loads(abc_json) # Convert sting to JSON

    # Loop through JSON list, extract article details and apprend to "article" list
    for article in abc_json['collection']:
        #print(article['title']['children'])

        article_link = article['link']['to'] # Get Article link from the a_tag            
        valid_link = is_valid_target_business_link(article_link) # Check for valid business url
        if valid_link:
            article_description = article['title']['children']
            article = {}
            article['description'] = article_description
            article['article_category'] = article_category
            articles[article_link] = article

    print('Updated Articles Length: ', len(articles))

    

    # Save Articles to file  

    with open("abc_articles.txt",mode="w",encoding="utf-8") as file:
            file.write(json.dumps(articles))
            file.close()
    print('Saved to File')        

    


Updated Articles Length:  153
Saved to File
Updated Articles Length:  235
Saved to File


Psuedo Code

- Get first business or sport page
- Extract all articles - titles and links from this html page
- Generate URL to download - titles and links from API
- Extract all articles - titles and links from json
- Save to disk

- Load from disk
- For each article:
    - Follow Link
    - Parse content
- Save content 



In [118]:
# Download article text for all articles in list and save to disk
with open("abc_articles.txt",mode="r",encoding="utf-8") as file:
    articles = json.load(file)# Open file object

base_url = 'https://www.abc.net.au'


print('Articles to process: ', len(articles))
for idx, article in enumerate(articles):
    url = base_url + article # Create Article URL
    
    try:
        page = requests.get(url)
        #pagetest = "TEST"
        if page.status_code == 200:
            #article_raw_html = page.text
            #article = {}
            #article['raw_html'] = article_raw_html
            articles[article]['raw_html'] = page.text
        else:
            print('Server Error:', article)

    except Exception as  e:
        print(e)


# Save Articles to file        
with open("abc_articles_with_raw.txt",mode="w",encoding="utf-8") as file:
        file.write(json.dumps(articles))
        file.close()
print('Saved to File') 




Articles to process:  235
Saved to File


In [131]:
# Parse downloaded HTML
# Load Articles        
with open("abc_articles_with_raw.txt",mode="r",encoding="utf-8") as file:
    articles = json.load(file)# Open file object

print('Articles to process: ', len(articles))
for idx, article in enumerate(articles):
    article_text = ''
    page = articles[article]['raw_html']
    soup = BeautifulSoup(page, 'html.parser') 
    div = soup.find_all('div', {'data-component':'LayoutContainer'}) #,{"class":"_1HzXw"} 
    p_tag = div[0].find_all('p')
    try:
        for p in p_tag:
            current_tag_contents = ' '
            has_child = len(p.find_all('a')) != 0
            if has_child:
                a_tag = p.find()
                if isinstance(a_tag.contents[0] ,str):
                    current_tag_contents = a_tag.contents[0]
            else:
                if isinstance(p.contents[0] ,str):
                    current_tag_contents = p.contents[0]

            article_text = ' '.join([article_text,current_tag_contents])
    except Exception as  e:
        print(e)
        print(article)
    article_text = article_text.replace(u'\xa0', u' ') # Replace Unicode non-breaking space with regular space
    articles[article]['article_text'] = article_text # Add extractd article text to json
    del articles[article]['raw_html'] # Remove raw html
    

# Save Articles to file        
with open("abc_articles_with_cleaned.txt",mode="w",encoding="utf-8") as file:
        file.write(json.dumps(articles, ensure_ascii=False))
        file.close()
print('Saved to File') 
    

Articles to process:  235
list index out of range
/news/2021-11-30/australia-matildas-draw-with-usa-in-newcastle/100658892
list index out of range
/news/2021-11-28/teofimo-lopez-vs-george-kambosos-jr-updates-boxing-blog/100655884
list index out of range
/news/2021-11-27/usa-beat-australia-matildas-3-0-sydney/100631628
list index out of range
/news/2021-11-26/afl-rookie-and-preseason-draft-live-blog/100653554
Saved to File


In [136]:
# Parse downloaded HTML
# Save Articles to file        
with open("abc_articles_with_cleaned.txt",mode="r",encoding="utf-8") as file:
    articles = json.load(file)# Open file object
   
    
# Convert to CSV and save final output for use in NLP
articles_df = pd.read_json("abc_articles_with_cleaned.txt", orient='index')
articles_df.rename_axis("uri",inplace=True)
articles_df.reset_index(inplace=True)
articles_df.head()
articles_df.to_csv("abc_articles_df.csv", index=False, sep='|', encoding='utf-16')
#print(articles[next(iter(articles))]['article_text'])

# Sandbox

In [135]:
articles['/news/2021-11-26/nats-egypt-unveils-renovated-avenue-of-the-sphinxes-in-luxor/100653682']['article_text']


" Egyptian authorities have unveiled a renovated ancient promenade in the city of Luxor that dates back 3,000 years.  It's the latest government project undertaken to highlight the country's archaeological treasures. Egypt has struggled to revive its tourism industry, battered by years of political turmoil after the 2011 popular uprising that toppled longtime autocrat Hosni Mubarak, and more recently, the COVID-19 pandemic. The ancient walkway — known as the Avenue of the Sphinxes, but also dubbed the Way of the Rams and the Path of the Gods — connects the famous Karnak and Luxor temples in what was the city of Thebes, which was Egypt's capital in antiquity. It is believed to have been the path that pilgrims trod to visit the temples and pay tribute to their deities. Lined with statues of rams and sphinxes on pedestals, the ancient road in Luxor — which sits on the banks of the Nile River and is located about 650 kilometres south of Cairo — stretches for several kilometres and had been

In [130]:
# Parse downloaded HTML
# Load Articles        
with open("abc_articles_with_raw.txt",mode="r",encoding="utf-8") as file:
    articles_test = json.load(file)# Open file object

print(articles_test['/news/2021-11-26/nats-egypt-unveils-renovated-avenue-of-the-sphinxes-in-luxor/100653682'])

    
    
print('Articles to process: ', len(articles))
for idx, article in enumerate(articles):
    article_text = ''
    page = articles_test[article]['raw_html']
    soup = BeautifulSoup(page, 'html.parser') 
    div = soup.find_all('div', {'data-component':'LayoutContainer'}) #,{"class":"_1HzXw"} 
    p_tag = div[0].find_all('p')
    try:
        for p in p_tag:
            current_tag_contents = ' '
            has_child = len(p.find_all('a')) != 0
            if has_child:
                a_tag = p.find()
                if isinstance(a_tag.contents[0] ,str):
                    current_tag_contents = a_tag.contents[0]
            else:
                if isinstance(p.contents[0] ,str):
                    current_tag_contents = p.contents[0]
            article_text = article_text.replace(u'\xa0', u' ')
            article_text = ' '.join([article_text,current_tag_contents])
    except Exception as  e:
        print(e)
        print(article)
    articles_test[article]['article_text'] = article_text # Add extractd article text to json
    del articles_test[article]['raw_html'] # Remove raw html
    
    
    
print(articles_test['/news/2021-11-26/nats-egypt-unveils-renovated-avenue-of-the-sphinxes-in-luxor/100653682'])




Articles to process:  235


list index out of range
/news/2021-11-30/australia-matildas-draw-with-usa-in-newcastle/100658892
list index out of range
/news/2021-11-28/teofimo-lopez-vs-george-kambosos-jr-updates-boxing-blog/100655884
list index out of range
/news/2021-11-27/usa-beat-australia-matildas-3-0-sydney/100631628
list index out of range
/news/2021-11-26/afl-rookie-and-preseason-draft-live-blog/100653554
{'description': 'Egypt unveils 3,000-year-old renovated Avenue of the Sphinxes in Luxor', 'article_category': 'business', 'article_text': " Egyptian authorities have unveiled a renovated ancient promenade in the city of Luxor that dates back 3,000 years.  It's the latest government project undertaken to highlight the country's archaeological treasures. Egypt has struggled to revive its tourism industry, battered by years of political turmoil after the 2011 popular uprising that toppled longtime autocrat Hosni Mubarak, and more recently, the COVID-19 pandemic. The ancient walkway — known as the Avenue of the