# Open Dataset Parsing

## VIC & NSW School Holiday Dates

In [164]:
import requests
import pandas as pd
from pathlib import Path

In [None]:
nsw_sholiday_url = "https://www.nsw.gov.au/about-nsw/school-holidays"
vic_sholiday_url = "https://www.vic.gov.au/school-term-dates-and-holidays-victoria"

nsw_sholiday_resp = requests.get(nsw_sholiday_url)
vic_sholiday_resp = requests.get(vic_sholiday_url)

nsw_sholiday = pd.read_html(nsw_sholiday_resp.text)
vic_sholiday = pd.read_html(vic_sholiday_resp.text)

nsw_sholiday_2025, nsw_sholiday_2026 = nsw_sholiday[0:2]

nsw_sholiday_2025 = nsw_sholiday_2025.rename(columns={"Season": "season", "Division": "description"})
nsw_sholiday_2026 = nsw_sholiday_2026.rename(columns={"Season": "season", "Division": "description"})

  nsw_sholiday = pd.read_html(nsw_sholiday_resp.text)
  vic_sholiday = pd.read_html(vic_sholiday_resp.text)


### Parsing NSW School Holidays

In [151]:
date_pat = r"(\d{1,2}\s\w+(?:\s\d{4})?)"

# Get start date, end date, and format season
nsw_sholiday_2025[["start_date", "end_date"]] = pd.DataFrame((nsw_sholiday_2025.description.str.findall(pat=date_pat)
                                                            .map(lambda rec: rec[:2])
                                                            .map(lambda rec: [f"{rec[0]} 2025", rec[1]])
                                                            .tolist()))
nsw_sholiday_2025["season"] = (nsw_sholiday_2025.season.str.lower()
                               .str.replace(" holidays", ""))

nsw_sholiday_2026[["start_date", "end_date"]] = pd.DataFrame((nsw_sholiday_2026.description.str.findall(pat=date_pat)
                                                            .map(lambda rec: rec[:2])
                                                            .map(lambda rec: [f"{rec[0]} 2026", rec[1]])
                                                            .tolist()))
nsw_sholiday_2026["season"] = (nsw_sholiday_2026.season.str.lower()
                               .str.replace(" holidays", ""))

In [None]:
def convert_to_dt(df):
    # Retain only start and end dates as datetime format
    out_df = df.copy().drop("description", axis=1)
    out_df["start_date"] = pd.to_datetime(df.start_date)
    out_df["end_date"] = pd.to_datetime(df.end_date)
    
    # Unpack
    out_df[["start_day","start_month","start_year"]] = out_df["start_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    out_df[["end_day","end_month","end_year"]] = out_df["end_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    return out_df

In [153]:
nsw_sholiday_2025_clean = convert_to_dt(nsw_sholiday_2025)
nsw_sholiday_2026_clean = convert_to_dt(nsw_sholiday_2026)

In [154]:
nsw_sholiday_2025_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,autumn,2025-04-14,2025-04-24,14,4,2025,24,4,2025
1,winter,2025-07-07,2025-07-18,7,7,2025,18,7,2025
2,spring,2025-09-29,2025-10-10,29,9,2025,10,10,2025
3,summer,2025-12-22,2026-01-26,22,12,2025,26,1,2026


In [155]:
nsw_sholiday_2026_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,autumn,2026-04-07,2026-04-17,7,4,2026,17,4,2026
1,winter,2026-07-06,2026-07-17,6,7,2026,17,7,2026
2,spring,2026-09-28,2026-10-09,28,9,2026,9,10,2026
3,summer,2026-12-18,2027-01-27,18,12,2026,27,1,2027


### Parsing VIC School Holidays

In [160]:
vic_sholiday_2025, vic_sholiday_2026 = vic_sholiday[0:2]

def term_to_holiday(df, year):
    # Get complement of term dates as holidays
    
    # Formatting
    out_df = df.copy().drop("Term", axis=1)
    out_df.insert(0, "season", pd.Series(["autumn", "winter", "spring", "summer"]))
    out_df = out_df.rename(columns={"Start date": "start_date", "Finish date": "end_date"})
    
    # Converting to dt
    out_df["start_date"] = (out_df.start_date.str.extract(date_pat)
                            .map(lambda date: f"{date} {year}"))
    out_df["start_date"] = pd.to_datetime(out_df.start_date)
    out_df["end_date"] = (out_df.end_date.str.extract(date_pat)
                          .map(lambda date: f"{date} {year}"))
    out_df["end_date"] = pd.to_datetime(out_df.end_date)
    
    # Unpacking dt
    out_df[["start_day","start_month","start_year"]] = out_df["start_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    out_df[["end_day","end_month","end_year"]] = out_df["end_date"].apply(lambda rec: pd.Series([rec.day, rec.month, rec.year]))
    return out_df

In [161]:
vic_sholiday_2025_clean = term_to_holiday(vic_sholiday_2025, 2025)
vic_sholiday_2026_clean = term_to_holiday(vic_sholiday_2026, 2026)

In [162]:
vic_sholiday_2025_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,autumn,2025-01-28,2025-04-04,28,1,2025,4,4,2025
1,winter,2025-04-22,2025-07-04,22,4,2025,4,7,2025
2,spring,2025-07-21,2025-09-19,21,7,2025,19,9,2025
3,summer,2025-10-06,2025-12-19,6,10,2025,19,12,2025


In [163]:
vic_sholiday_2026_clean

Unnamed: 0,season,start_date,end_date,start_day,start_month,start_year,end_day,end_month,end_year
0,autumn,2026-01-27,2026-04-02,27,1,2026,2,4,2026
1,winter,2026-04-20,2026-06-26,20,4,2026,26,6,2026
2,spring,2026-07-13,2026-09-18,13,7,2026,18,9,2026
3,summer,2026-10-05,2026-12-18,5,10,2026,18,12,2026


## Exporting to CSV and Parquet

In [171]:
data_fp = list(Path().rglob("../*/*_holidays"))[0]

def df_to_parquet(df, fname):
    df.to_parquet(data_fp / f"{fname}.parquet", engine="fastparquet")
    
def df_to_csv(df, fname):
    df.to_csv(data_fp / f"{fname}.csv", index=False)

dfs = (nsw_sholiday_2025_clean, nsw_sholiday_2026_clean, vic_sholiday_2025_clean, vic_sholiday_2026_clean)
fnames = ("nsw_sholiday_2025_clean", "nsw_sholiday_2026_clean", "vic_sholiday_2025_clean", "vic_sholiday_2026_clean")

# Run everything
[df_to_csv(df, fname) for df, fname in zip(dfs, fnames)]
[df_to_parquet(df, fname) for df, fname in zip(dfs, fnames)]

[None, None, None, None]