In [1]:
import os
from urllib.request import Request, urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm


In [2]:
# Root URL of the NDIS website; the sitemap path and every site-relative
# href collected from it are appended to this prefix.
base_url = "https://www.ndis.gov.au"

In [3]:
# get all links from NDIS webpage
#
# Result: `links` is a list of absolute URLs (duplicates removed, first-seen
# order kept). It is defined on BOTH paths below so that later cells work
# whether or not the cache file already exists.
try:
    # The cache is a single-column CSV with one URL per row, so the whole
    # first column is read (`.loc[0]` would return only the first row).
    cached = pd.read_csv('docs/NDIS_sitemap.csv').iloc[:, 0].dropna().astype(str)
    links = list(dict.fromkeys(cached))

except (FileNotFoundError, pd.errors.EmptyDataError):
    print('NDIS sitemap could not be found, loading sitemap now...')

    req = Request(base_url + "/sitemap", headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(req) as html_page:
        soup = BeautifulSoup(html_page, "lxml")

    # `href=True` skips anchors without an href attribute. Only site-relative
    # paths are kept; protocol-relative links ("//host/...") point to other
    # hosts and must not be prefixed with base_url.
    scraped = [
        base_url + anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith('/') and not anchor['href'].startswith('//')
    ]
    # The same page can be linked several times; keep each URL once.
    links = list(dict.fromkeys(scraped))

    print(links[:10])
    # Make sure the cache directory exists before writing into it.
    os.makedirs('docs', exist_ok=True)
    pd.DataFrame(links).to_csv('docs/NDIS_sitemap.csv', index = False)

# `sitemap` is kept as an alias of `links`.
sitemap = links

In [4]:
def extract_text_from(url, timeout=30):
    """Fetch ``url`` and return its text content with blank lines removed.

    The page is downloaded with ``requests.get`` using a browser-like
    User-Agent header, parsed with BeautifulSoup's ``html.parser`` and reduced
    to the string returned by ``get_text()``. Every line is stripped of
    surrounding whitespace and lines that end up empty are dropped.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled server would block
        the call indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The remaining non-empty lines joined with ``"\\n"``.

    Raises
    ------
    Any exception raised by ``requests.get`` (for example on a timeout or a
    connection failure) is propagated unchanged.
    """
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, features="html.parser")
    text = soup.get_text()

    # Strip each line first so whitespace-only lines are filtered out as well.
    stripped_lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in stripped_lines if line)

In [5]:
# get raw text from all NDIS pages
#
# Result: `df` has one row per page with the columns 'text' and 'source'.
# It is assigned on BOTH paths below so the final `df.head()` also works on
# the first run, when the cache file does not exist yet.
try:
    df = pd.read_csv('docs/NDIS_site_content.csv', index_col = 0)

except (FileNotFoundError, pd.errors.EmptyDataError):
    print('NDIS site content could not be found, loading pages now...')

    # Create the cache directory up front so that a missing folder is reported
    # before the pages are downloaded rather than after.
    os.makedirs('docs', exist_ok=True)

    # One record per page: the extracted text and the URL it came from.
    pages = [
        {'text': extract_text_from(link), 'source': link}
        for link in tqdm(links)
    ]

    # Explicit columns keep the frame's layout stable even if `links` is empty.
    df = pd.DataFrame(pages, columns=['text', 'source'])
    df.to_csv('docs/NDIS_site_content.csv')

df.head()

Unnamed: 0,text,source
0,National Disability Insurance Scheme (NDIS)\nS...,https://www.ndis.gov.au/
1,Languages | NDIS\nSkip to main content\nSkip t...,https://www.ndis.gov.au/languages
2,National Disability Insurance Scheme (NDIS)\nS...,https://www.ndis.gov.au/
3,Understanding the NDIS | NDIS\nSkip to main co...,https://www.ndis.gov.au/understanding
4,What is the NDIS? | NDIS\nSkip to main content...,https://www.ndis.gov.au/understanding/what-ndis
