# Open Dataset Parsing

## VIC & NSW School Holiday Dates

In [8]:
import requests
import pandas as pd

In [116]:
nsw_sholiday_url = "https://www.nsw.gov.au/about-nsw/school-holidays"
vic_sholiday_url = "https://www.vic.gov.au/school-term-dates-and-holidays-victoria"

nsw_sholiday_resp = requests.get(nsw_sholiday_url)
vic_sholiday_resp = requests.get(vic_sholiday_url)

nsw_sholiday = pd.read_html(nsw_sholiday_resp.text)
vic_sholiday = pd.read_html(vic_sholiday_resp.text)

nsw_sholiday_2025, nsw_sholiday_2026 = nsw_sholiday[0:2]

nsw_sholiday_2025 = nsw_sholiday_2025.rename(columns={"Season": "season", "Division": "description"})
nsw_sholiday_2026 = nsw_sholiday_2026.rename(columns={"Season": "season", "Division": "description"})

  nsw_sholiday = pd.read_html(nsw_sholiday_resp.text)
  vic_sholiday = pd.read_html(vic_sholiday_resp.text)


In [125]:
date_pat = r"(\d{1,2}\s\w+(?:\s\d{4})?)"

nsw_sholiday_2025[["start_date", "end_date"]] = pd.DataFrame((nsw_sholiday_2025.description.str.findall(pat=date_pat)
                                                            .map(lambda rec: rec[:2])
                                                            .map(lambda rec: [f"{rec[0]} 2025", rec[1]])
                                                            .tolist()))

nsw_sholiday_2026[["start_date", "end_date"]] = pd.DataFrame((nsw_sholiday_2026.description.str.findall(pat=date_pat)
                                                            .map(lambda rec: rec[:2])
                                                            .map(lambda rec: [f"{rec[0]} 2026", rec[1]])
                                                            .tolist()))

In [126]:
def convert_to_dt(df):
    # Retain only start and end dates as datetime format
    out_df = df.copy().drop("description", axis=1)
    out_df["start_date"] = pd.to_datetime(df.start_date)
    out_df["end_date"] = pd.to_datetime(df.end_date)
    # Unpack
    out_df[["start_day","start_month","start_year"]] = out_df["start_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    out_df[["end_day","end_month","end_year"]] = out_df["end_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    return out_df

In [127]:
nsw_sholiday_2025_clean = convert_to_dt(nsw_sholiday_2025)
nsw_sholiday_2026_clean = convert_to_dt(nsw_sholiday_2026)

In [128]:
nsw_sholiday_2025_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,Autumn holidays,2025-04-14,2025-04-24,14,4,2025,24,4,2025
1,Winter holidays,2025-07-07,2025-07-18,7,7,2025,18,7,2025
2,Spring holidays,2025-09-29,2025-10-10,29,9,2025,10,10,2025
3,Summer holidays,2025-12-22,2026-01-26,22,12,2025,26,1,2026


In [129]:
nsw_sholiday_2026_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,Autumn holidays,2026-04-07,2026-04-17,7,4,2026,17,4,2026
1,Winter holidays,2026-07-06,2026-07-17,6,7,2026,17,7,2026
2,Spring holidays,2026-09-28,2026-10-09,28,9,2026,9,10,2026
3,Summer holidays,2026-12-18,2027-01-27,18,12,2026,27,1,2027


In [130]:
vic_sholiday[0]

Unnamed: 0,Term,Start date,Finish date
0,Term 1,Start dateTuesday 28 January (students start W...,Finish dateFriday 4 April
1,Term 2,Start dateTuesday 22 April,Finish dateFriday 4 July
2,Term 3,Start dateMonday 21 July,Finish dateFriday 19 September
3,Term 4,Start dateMonday 6 October,Finish dateFriday 19 December


In [139]:
vic_sholiday_2025, vic_sholiday_2026 = vic_sholiday[0:2]

def term_to_holiday(df, year):
    # Get complement of term dates as holidays
    out_df = df.copy().drop("Term", axis=1)
    out_df.insert(0, "season", pd.Series(["autumn", "winter", "spring", "summer"]))
    out_df = out_df.rename(columns={"Start date": "start_date", "Finish date": "end_date"})
    out_df["start_date"] = out_df.start_date.str.extract(date_pat).map(lambda date: f"{date} {year}")
    out_df["end_date"] = out_df.end_date.str.extract(date_pat).map(lambda date: f"{date} {year}")
    return out_df

In [140]:
term_to_holiday(vic_sholiday_2025, 2025)

Unnamed: 0,season,start_date,end_date
0,autumn,28 January 2025,4 April 2025
1,winter,22 April 2025,4 July 2025
2,spring,21 July 2025,19 September 2025
3,summer,6 October 2025,19 December 2025
