# Development of data ingestion code

Note the code in this file is not used directly in the end solution, but large sections are copied into the .py files in the /src folder.

I think Notebooks are a good way to prototype and iterate on code, particularly where it involves data (so you can visualise what's going on and prevent headaches down the line) or processes you're not completely familiar with (so you can try things rapidly until they work). This is also a good place to include extra comments along the way, which may be covered in a blog post or simply used by someone who wants to understand the code better.

In [53]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime

In [None]:
# will get a 403 forbidden error unless we look like a browser
# to get an appropriate user-agent value:
# - Open Google Chrome and go to the website you're interested in (in your case, http://www.bom.gov.au/nsw/forecasts/sydney.shtml).
# - Press F12 or right-click on the page and select Inspect to open the Developer Tools.
# - Go to the Network tab.
# - Reload the page (you can press F5 or click the reload button in the browser).
# - In the Network tab, look for the first request that is made (usually listed at the top, with the name of the page you're visiting, e.g., sydney.shtml).
# - Click on that request to view its details.
# - On the right, you should see a Headers tab. Under the Request Headers section, look for the User-Agent value.
# - Copy that User-Agent string and use it in your Python code.

def get_page_source(url):
    '''returns page source from url'''

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36'
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    else:
        raise Exception(f"Failed to fetch page: {response.status_code}")

# Sydney Forecast

In [64]:
url = "http://www.bom.gov.au/nsw/forecasts/sydney.shtml"
src = get_page_source(url)
soup = BeautifulSoup(src, 'html.parser') # recommended to include html.parser here to ensure consistent cross-platform results

In [None]:
# use this to have an initial look at the html code
#print(soup.prettify())

### Development

In [65]:
sections = soup.find_all(class_="day", limit=3) # limit=1 # for testing
#print([section.text for section in sections])

results = []
for section in sections:
    #print(section.prettify())
    date = section.find('h2').text.strip()
    print(date)
    
    rain_section = section.find_next('dd', class_="rain")
    # print(rain_section) # while testing
    # [<dd class="rain">Chance of any rain: <em class="pop">5%
    # 					<img alt="" height="10" src="/images/ui/weather/rain_5.gif" width="69"/></em></dd>]
    # [<dd class="rain">Chance of any rain: <em class="pop">10%
    # 					<img alt="" height="10" src="/images/ui/weather/rain_10.gif" width="69"/></em></dd>]
    # [<dd class="rain">Possible rainfall: <em class="rain">0 to 3 mm</em></dd>, <dd class="rain">Chance of any rain: <em class="pop">50%
    # 					<img alt="" height="10" src="/images/ui/weather/rain_50.gif" width="69"/></em></dd>]
    # Note that when rain chance exceeds a threshold % the layout is different

    rain_mm_low = 0
    rain_mm_high = 0
    if "Possible rainfall" in rain_section.text:
        rain_mm = rain_section.find_next('em', class_="rain").text.strip()
        match = re.search(r"(\d+)\s*to\s*(\d+)", rain_mm)
        if match:
            rain_mm_low = int(match.group(1))
            rain_mm_high = int(match.group(2))

    rain_chance = rain_section.find_next('em', class_="pop").text.strip()
    rain_chance = float(rain_chance.strip('%')) / 100

    # print('rain_mm_low', rain_mm_low) # while testing
    # print('rain_mm_high', rain_mm_high)
    # print('rain_chance', rain_chance)

    results.append([date, rain_chance, rain_mm_low, rain_mm_high])
print(results)

Forecast for the rest of Wednesday
Thursday 20 March
Friday 21 March
[['Forecast for the rest of Wednesday', 0.05, 0, 0], ['Thursday 20 March', 0.1, 0, 0], ['Friday 21 March', 0.5, 0, 3]]


### Functions for script

This will be placed in src and will be called from the database.py and/or data_ingestion.py so should return the data in an array format

In [78]:
def convert_to_datetime(date_str, current_date=None):
    '''Converts string from html (e.g. 'Friday 21 March') into datetime (e.g. 21/3/25)
    
    Keyword args:
    current_date -- used for testing the function, defaults to today's date

    Returns:
    date_forecast_was_made, date_forecast_applies_to -- both in d/m/y string format
    '''

    if not current_date:
        current_date = datetime.today().date()

    # forecast for the remainder of the day on which the forecast is made
    if "Forecast" in date_str:
        forecast_date = current_date

    else:    
        month_day = date_str.split(' ', 1)[1] # remove the weekday portion
        current_year = current_date.year

        forecast_date = datetime.strptime(f"{month_day} {current_year}", "%d %B %Y").date() # read in as d mmm y
        if forecast_date < current_date:
            forecast_date = forecast_date.replace(year=current_year + 1)

    # format as d/m/y
    return current_date.strftime('%d/%m/%y'), forecast_date.strftime('%d/%m/%y')

print(convert_to_datetime('Friday 19 March', datetime.today().date()))
print(convert_to_datetime('Friday 21 March', datetime.today().date()))
print(convert_to_datetime('Forecast for the rest of today', datetime.today().date())) # check it reads the forecast row correctly
print(convert_to_datetime('Friday 1 January', datetime.today().date())) # check it reads dates at the start of next year correctly

('19/03/25', '19/03/25')
('19/03/25', '21/03/25')
('19/03/25', '19/03/25')
('19/03/25', '01/01/26')


In [82]:
def forecast_data(soup):
    '''Extract rainfall chance and amounts from the Beautiful Soup class'''

    sections = soup.find_all(class_="day")
    results = []
    for section in sections:

        # extract the date (which is implied by the current date and this being a forecast for the coming week, includes rest of today)
        date_forecast_applies_to = section.find('h2').text.strip()
        date_forecast_was_made, date_forecast_applies_to = convert_to_datetime(date_forecast_applies_to)

        rain_section = section.find_next('dd', class_="rain")
        rain_mm_low = 0
        rain_mm_high = 0
        if "Possible rainfall" in rain_section.text: # rainfall mm is only shown when rainfall chance exceeds some threshold
            rain_mm = rain_section.find_next('em', class_="rain").text.strip()
            match = re.search(r"(\d+)\s*to\s*(\d+)", rain_mm) # convert 0 to 3 mm into values 0 and 3 using regular expressions
            if match:
                rain_mm_low = int(match.group(1))
                rain_mm_high = int(match.group(2))

        rain_chance = rain_section.find_next('em', class_="pop").text.strip()
        rain_chance = float(rain_chance.strip('%')) / 100 # convert from text % to float now (might be easier than doing so later on)

        results.append([date_forecast_was_made, date_forecast_applies_to, rain_chance, rain_mm_low, rain_mm_high])

    return results

forecast_data_raw = forecast_data(soup=soup)
print(forecast_data_raw)

[['19/03/25', '19/03/25', 0.05, 0, 0], ['19/03/25', '20/03/25', 0.1, 0, 0], ['19/03/25', '21/03/25', 0.5, 0, 3], ['19/03/25', '22/03/25', 0.4, 0, 1], ['19/03/25', '23/03/25', 0.6, 0, 5], ['19/03/25', '24/03/25', 0.5, 0, 2], ['19/03/25', '25/03/25', 0.5, 0, 5]]


In [85]:
# convert data to dataframe for ease of filtering etc
df = pd.DataFrame(forecast_data_raw, columns=['date_forecast_was_made', 'date_forecast_applies_to', 'rain_chance', 'rain_mm_low', 'rain_mm_high'])
df['date_forecast_was_made'] = pd.to_datetime(df['date_forecast_was_made'], format='%d/%m/%y')
df['date_forecast_applies_to'] = pd.to_datetime(df['date_forecast_applies_to'], format='%d/%m/%y')
print(df)

  date_forecast_was_made date_forecast_applies_to  rain_chance  rain_mm_low  \
0             2025-03-19               2025-03-19         0.05            0   
1             2025-03-19               2025-03-20         0.10            0   
2             2025-03-19               2025-03-21         0.50            0   
3             2025-03-19               2025-03-22         0.40            0   
4             2025-03-19               2025-03-23         0.60            0   
5             2025-03-19               2025-03-24         0.50            0   
6             2025-03-19               2025-03-25         0.50            0   

   rain_mm_high  
0             0  
1             0  
2             3  
3             1  
4             5  
5             2  
6             5  


# Sydney Historical

### Development

In [86]:
p_stn_num = 66037 # Sydney airport
url = f"http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_stn_num={p_stn_num}"
src = get_page_source(url)
soup = BeautifulSoup(src, 'html.parser') # recommended to include html.parser here to ensure consistent cross-platform results

In [None]:
<table class="climatedata tdtooltip" id="dataTable" summary="Daily data, with a column for each month. After the table headings, the first row contains links to graphs of the data.">
        <thead>
         <tr>
          <th scope="col">
           2025
          </th>
...
        <tbody>
         <tr class="graphcell">
          <th scope="row">
           Graph
          </th>
...
         <tr>
          <th scope="row">
           1st
          </th>
          <td class="no-qc">
           0
          </td>
          <td class="no-qc">
           0
          </td>
          <td class="no-qc">
           0
          </td>
          <td>
         </tr>
...
         <tr>
          <th scope="row">
           3rd
          </th>
          <td class="no-qc">
           4.8
          </td>
          <td class="no-qc">
           0
          </td>
          <td class="no-qc">
           0
          </td>
          <td>
          </td>
...

In [62]:
# Extract the year (from th with scope="col")
year = soup.find('th', {'scope': 'col'}).text.strip()

# Initialize the list for storing data rows
data = []

# Regular expressions pattern for checking to see if value is 1st .. 31st
pattern = re.compile(r'^\d{1,2}(st|nd|rd|th)$')

# Iterate over each table row in the tbody
for row in soup.find_all('tr')[1:]:  # Skip the first row (graph row)
    row_data = []

    # check row exists and is one of the data rows (1st to 31st in the first column)
    th = row.find('th', {'scope': 'row'})
    if th:
        th = th.text.strip()
        if pattern.match(th):
            row_data.append(th)

            # extract the row data
            cells = row.find_all('td', class_='no-qc')
            for cell in cells:
                cell_text = cell.text.strip()
                if cell_text: # only append non-empty cells
                    row_data.append(cell_text)

            data.append(row_data)

# Print the year and the data
print("Year:", year)
print("Data:", data)

# convert data to a list with dates in the format dd/mm/yy
daily_data = []
for i, (day, *values) in enumerate(data):
    day_number = int(day[:-2]) # remove 'st', 'nd' etc
    for month_number, value in enumerate(values):
        date = datetime(int(year), month_number + 1, day_number).strftime('%d/%m/%y') # add year and format as dd/mm/yy
        daily_data.append([date, value])

# convert data to dataframe for ease of filtering etc
df = pd.DataFrame(daily_data, columns=['date', 'rainfall_mm'])
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y')
df = df.sort_values(by='date', ascending=True)
print(df)


Year: 2025
Data: [['1st', '0', '0', '0'], ['2nd', '0', '0', '0'], ['3rd', '4.8', '0', '0'], ['4th', '0.2', '0', '3.0'], ['5th', '0', '0', '3.6'], ['6th', '0', '0.6', '0'], ['7th', '14.0', '0.2', '0'], ['8th', '0.4', '0', '0.2'], ['9th', '37.2', '13.0', '5.0'], ['10th', '1.0', '0.2', '0'], ['11th', '23.6', '6.8', '20.4'], ['12th', '0.2', '0.2', '7.2'], ['13th', '0', '5.0', '0.2'], ['14th', '0', '0', '0'], ['15th', '0', '22.8', '0'], ['16th', '35.6', '0.2', '0'], ['17th', '1.6', '0', '0'], ['18th', '11.0', '0', '0'], ['19th', '5.4', '0', '0'], ['20th', '0', '0'], ['21st', '0', '0'], ['22nd', '0', '4.0'], ['23rd', '0', '0'], ['24th', '0', '0'], ['25th', '0', '0'], ['26th', '0', '0'], ['27th', '0', '0'], ['28th', '7.6', '0'], ['29th', '8.2'], ['30th', '0.6'], ['31st', '1.0']]
         date rainfall_mm
0  2025-01-01           0
3  2025-01-02           0
6  2025-01-03         4.8
9  2025-01-04         0.2
12 2025-01-05           0
..        ...         ...
44 2025-03-15           0
47 2025-0

### Functions for script

This will be placed in src and will be called from the database.py and/or data_ingestion.py so should return the data in an array format

In [88]:
def extract_historical_data(soup):

    # Extract the year (from th with scope="col")
    year = soup.find('th', {'scope': 'col'}).text.strip()

    # Initialize the list for storing data rows
    data = []

    # Regular expressions pattern for checking to see if value is 1st .. 31st
    pattern = re.compile(r'^\d{1,2}(st|nd|rd|th)$')

    # Iterate over each table row in the tbody
    for row in soup.find_all('tr')[1:]:  # Skip the first row (graph row)
        row_data = []

        # check row exists and is one of the data rows (1st to 31st in the first column)
        th = row.find('th', {'scope': 'row'})
        if th:
            th = th.text.strip()
            if pattern.match(th):
                row_data.append(th)

                # extract the row data
                cells = row.find_all('td', class_='no-qc')
                for cell in cells:
                    cell_text = cell.text.strip()
                    if cell_text: # only append non-empty cells
                        row_data.append(cell_text)

                data.append(row_data)

    # convert data to a list with dates in the format dd/mm/yy
    daily_data = []
    for i, (day, *values) in enumerate(data):
        day_number = int(day[:-2]) # remove 'st', 'nd' etc
        for month_number, value in enumerate(values):
            date = datetime(int(year), month_number + 1, day_number).strftime('%d/%m/%y') # add year and format as dd/mm/yy
            daily_data.append([date, value])

    # convert data to dataframe for ease of filtering etc
    df = pd.DataFrame(daily_data, columns=['date', 'rainfall_mm'])
    df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y')
    df = df.sort_values(by='date', ascending=True)
    return df

df = extract_historical_data(soup=soup)
print(df)

         date rainfall_mm
0  2025-01-01           0
3  2025-01-02           0
6  2025-01-03         4.8
9  2025-01-04         0.2
12 2025-01-05           0
..        ...         ...
44 2025-03-15           0
47 2025-03-16           0
50 2025-03-17           0
53 2025-03-18           0
56 2025-03-19           0

[78 rows x 2 columns]


In [None]:
# placeholder in case we need to use pickle to save some test data for the database notebook to play with
# will try and set up import and use / test the src version in the first instance
if False:
        
    import pickle

    # save file for database.ipynb to play with
    with open('data/df_forecast.pkl', 'wb') as f:
        pickle.dump(df_forecast, f)

    # Load DataFrame from pickle file
    with open('df_forecast.pkl', 'rb') as f:
        loaded_df = pickle.load(f)