# RBA SOMP Forecasts

## Set-up

In [1]:
# system imports
import io
import textwrap
from pathlib import Path
from typing import NewType, cast, Any
from urllib.error import HTTPError
from datetime import datetime

# analytic imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import readabs as ra

# local imports
from plotting import clear_chart_dir, finalise_plot, set_chart_dir
import common

In [2]:
# plotting constants
# CHART_DIR is the output directory handed to the local plotting helpers.
CHART_DIR = "./CHARTS/SOMP/"
# make sure the directory exists before the plotting helpers are pointed at it
Path(CHART_DIR).mkdir(parents=True, exist_ok=True)
# NOTE(review): presumably removes charts left over from an earlier run,
# and then makes CHART_DIR the default save location - confirm in plotting.py
clear_chart_dir(CHART_DIR)
set_chart_dir(CHART_DIR)
# SHOW and FILE_TYPE are passed to every finalise_plot() call below
# as the show= and file_type= arguments respectively.
SHOW = False
FILE_TYPE = "png"
# Matplotlib marker codes, one character each, cycled through when
# plotting successive SOMP reports (see plot_somp).
# Note: "D" appears twice (third and last), giving 21 entries in total.
MARKERS = list("osDxPHv^<>12348p*h+XD")

## Collect and reorganise SOMP data

### Typing information

In [3]:
# DateSompFrame: one SOMP forecast table, as published in a single report.
# Rows in the frame are subject domains; columns are projection dates.
# These frames are held in a DateSompDict, keyed by report date.
DateSompFrame = NewType("DateSompFrame", pd.DataFrame)


class DateSompDict(dict[str, DateSompFrame]):
    """A dictionary of DateSompFrame DataFrames keyed by str report dates
    (the keys are built as "<year>Q<n>" by collect_somp_data)."""


# DomainSompFrame: every forecast made for a single subject domain.
# Rows in the frame are projection dates; columns are report dates.
# These frames are held in a DomainSompDict, keyed by subject domain.
DomainSompFrame = NewType("DomainSompFrame", pd.DataFrame)


class DomainSompDict(dict[str, DomainSompFrame]):
    """A dictionary of DomainSompFrame DataFrames keyed by str subject domain
    (the keys are the cleaned row labels taken from the SOMP tables)."""

### Get raw SOMP tables by date

In [4]:
# The range of report years searched for SOMP tables.
# Note: both ends are inclusive (the year loop in collect_somp_data
# runs over range(START_YEAR, STOP_YEAR + 1)).
# START_YEAR is also used as the left-hand cut-off for the charts.
START_YEAR = 2018
STOP_YEAR = datetime.now().year  # current year

In [5]:
def data_cleaning(frame: DateSompFrame) -> DateSompFrame:
    """Preliminary data cleaning for the raw SOMP
    data captured from the RBA website.

    Arguments:
    frame -- a raw SOMP table, as parsed from the RBA web page.
        It is not modified: the cleaning is applied to a copy.
    Returns:
    A new DataFrame in which every column is numeric (anything
    that cannot be read as a number becomes NaN), all-NaN rows
    are removed, and the index labels have any trailing footnote
    marker removed and are converted to title case."""

    # single-character substitutions, applied in one pass per column:
    # text fractions become decimals, and the unicode minus sign
    # becomes an ASCII hyphen-minus that pd.to_numeric() understands
    substitutions = str.maketrans({"¼": ".25", "½": ".5", "¾": ".75", "−": "-"})

    # work on a copy so that the caller's frame is left untouched
    cleaned = frame.copy()
    for col in cleaned.columns:
        as_text = cleaned[col].astype(str).str.translate(substitutions)

        # make to NaN all non-numeric data
        cleaned[col] = pd.to_numeric(as_text, errors="coerce")

    # delete any all-NaN rows - Needed for Feb 24 report
    cleaned = cleaned.dropna(how="all", axis="index")

    # remove odd footnotes from index introduced in Feb 2024
    # (a trailing lower-case letter in parentheses, such as "(a)")
    # and standardize the case used in the index
    # (because of case mismatch before and after Feb 2024)
    cleaned.index = cleaned.index.str.replace(
        r"\([a-z]\)$", "", regex=True
    ).str.title()

    return cast(DateSompFrame, cleaned)

In [6]:
def get_table(
    url: str,
    cache: Path | None,
    direct: bool = False,  # default to using a cached copy
) -> DateSompFrame | None:
    """Get the SOMP table from the RBA website and clean it.
    Arguments:
    url -- the URL of the SOMP table, as a string.
    cache -- the directory Path where the table is/will be
        cached or None.
    direct -- a boolean flag for whether to get the table
        directly, or to use a local copy of the table, if it
        has been previously saved to the cache. The table is
        always fetched directly when cache is None.
    Returns:
    A cleaned DataFrame of the SOMP data, or None if the page
    could not be retrieved or no table could be found in it."""

    source: str | io.BytesIO
    if direct or cache is None:
        source = url
    else:
        # use a cached copy where we have it.
        # This approach is less aggressive on the RBA website.
        try:
            source = io.BytesIO(common.get_file(url, cache))
        except common.HttpError:
            return None

    try:
        table_list = pd.read_html(source, index_col=0)
    except HTTPError:
        # the page does not exist (eg. a report not yet published)
        return None
    except ValueError:
        # pandas raises ValueError (rather than returning an empty
        # list) when the page contains no tables: treat as not found
        return None

    if not table_list:
        return None

    # the forecasts are in the first table on the page
    relevant_table = 0
    return data_cleaning(cast(DateSompFrame, table_list[relevant_table]))

In [7]:
def collect_somp_data(cached=True) -> DateSompDict:
    """Gather the raw SOMP forecast tables from the RBA website.

    Every report slot (Feb, May, Aug and Nov) in every year from
    START_YEAR to STOP_YEAR inclusive is tried; slots for which
    get_table() returns None are silently skipped.

    Arguments:
    cached -- when true, pages are retrieved through the local
        ./RBA_SOMP_CACHE directory (created if necessary).
    Returns:
    A DateSompDict with one cleaned table per report found,
    keyed by report date in the form "<year>Q<n>"."""

    cache_path: None | Path = Path("./RBA_SOMP_CACHE") if cached else None
    if cache_path is not None:
        cache_path.mkdir(parents=True, exist_ok=True)

    site = "https://www.rba.gov.au/publications"
    # the position of the report month gives the quarter number (1 to 4)
    report_months = ("feb", "may", "aug", "nov")

    tables = DateSompDict()
    for year in range(START_YEAR, STOP_YEAR + 1):
        # Change of RBA data location in Feb 2024 ...
        page = "forecasts" if year < 2024 else "outlook"
        for quarter, month in enumerate(report_months, start=1):
            table = get_table(f"{site}/smp/{year}/{month}/{page}.html", cache_path)
            if table is not None:
                tables[f"{year}Q{quarter}"] = table

    return tables


# collect one SOMP table per report, keyed by report date
raw_somp = collect_somp_data()
# display the report dates for which a table was found
textwrap.wrap(", ".join(raw_somp.keys()), width=85)

['2018Q4, 2019Q1, 2019Q2, 2019Q3, 2019Q4, 2020Q1, 2020Q2, 2020Q3, 2020Q4, 2021Q1,',
 '2021Q2, 2021Q3, 2021Q4, 2022Q1, 2022Q2, 2022Q3, 2022Q4, 2023Q1, 2023Q2, 2023Q3,',
 '2023Q4, 2024Q1, 2024Q2, 2024Q3, 2024Q4']

### Reorganise SOMP data into a dictionary keyed by domain

In [8]:
def reorganise_somp(inputs: DateSompDict) -> DomainSompDict:
    """Reorganise the SOMP data
    from tables by date to tables by domain.

    Arguments:
    inputs -- the SOMP tables keyed by report date; in each table
        the rows are domains and the columns are projection dates.
    Returns:
    A DomainSompDict keyed by domain. In each table the rows are
    projection dates (a quarterly PeriodIndex, Q-DEC) and the
    columns are report dates (a quarterly PeriodIndex, Q-NOV)."""

    ue = "Unemployment Rate"
    pool = DomainSompDict()
    for index, frame in inputs.items():
        for item in frame.index:
            # skip labels that are missing or empty (a label that held
            # nothing but a footnote marker is empty after cleaning)
            if not isinstance(item, str) or not item:
                continue
            if item.startswith("("):
                continue  # ignore footnotes
            row = frame.loc[item]
            row.name = index  # becomes the column label: the report date
            # any label that mentions the unemployment rate
            # is pooled under the one domain name
            item_adj = ue if ue in item else item
            row_frame = cast(DomainSompFrame, pd.DataFrame(row))
            if item_adj not in pool:
                pool[item_adj] = row_frame
            else:
                pool[item_adj] = cast(
                    DomainSompFrame, pd.concat([pool[item_adj], row_frame], axis=1)
                )

    # sort out column names and index names
    # (replace the text labels on both axes with quarterly periods)
    rpool = DomainSompDict()
    for key, table in pool.items():
        rpool[key] = cast(
            DomainSompFrame,
            pd.DataFrame(
                table.values,
                columns=pd.PeriodIndex(table.columns, freq="Q-NOV"),
                index=pd.PeriodIndex(table.index, freq="Q-DEC"),
            ),
        )

    return rpool


# re-key the SOMP data by subject domain
somp = reorganise_somp(raw_somp)
# display the subject domains found across all of the reports
textwrap.wrap(", ".join(somp.keys()), width=85)

['Gross Domestic Product, Household Consumption, Dwelling Investment, Business',
 'Investment, Public Demand, Gross National Expenditure, Imports, Exports, Real',
 'Household Disposable Income, Terms Of Trade, Major Trading Partner (Export-Weighted)',
 'Gdp, Unemployment Rate, Employment, Wage Price Index, Nominal (Non-Farm) Average',
 'Earnings Per Hour, Trimmed Mean Inflation, Consumer Price Index, Hours-Based',
 'Underutilisation Rate (Quarterly, %), Nominal Average Earnings Per Hour (Non-Farm),',
 'Cash Rate (%), Trade-Weighted Index (Index), Brent Crude Oil Price (Us$/Bbl),',
 'Estimated Resident Population, Labour Productivity, Household Savings Rate (%), Real',
 'Wage Price Index, Real Average Earnings Per Hour (Non-Farm), Household Saving Rate',
 '(%)']

### Get key ABS data

In [9]:
def get_wanted_domains() -> tuple[dict[str, tuple[str, str, str]], str]:
    """Identify the ABS series we want to compare with the SOMP.

    Returns a two-part tuple:
    - a dictionary keyed by ABS series ID, where each value is the
      tuple (category ID, single-excel-only table name, short title).
    - the ABS category ID for the National Accounts ("5206.0")."""

    gdp_cat = "5206.0"

    # Each source is a (category ID, single-excel-only table name) pair.
    cpi = ("6401.0", "640106")
    wpi = ("6345.0", "634501")
    lfs = ("6202.0", "6202001")
    erp = ("3101.0", "310101")
    kags = (gdp_cat, "5206001_Key_Aggregates")
    hhc = (gdp_cat, "5206008_Household_Final_Consumption_Expenditure")
    exp = (gdp_cat, "5206002_Expenditure_Volume_Measures")
    ipd = (gdp_cat, "5206005_Expenditure_Implicit_Price_Deflators")
    hhi = (gdp_cat, "5206020_Household_Income")
    sas = (gdp_cat, "5206024_Selected_Analytical_Series")

    # Do we need anything from the Labour Account?
    # la_cat, la_seo = "6150.0.55.003", "6150055003DO001"
    # la_seo_summ = "Industry summary table"

    selections = [
        # ("Series ID", source, "Short Series Title")
        ("A2325846C", cpi, "CPI Index Orig"),
        ("A3604506F", cpi, "CPI Index SA"),
        ("A3604509L", cpi, "CPI Index TM SA"),
        # the two original (Orig) GDP series are used to derive population
        ("A2302460K", kags, "GDP per capita CVM Orig"),
        ("A2302459A", kags, "GDP CVM Orig"),
        ("A2304404C", kags, "GDP per capita CVM SA"),
        ("A2304402X", kags, "GDP CVM SA"),
        ("A2323382F", kags, "Household savings ratio SA"),
        ("A2303280V", hhc, "Household consumption CVM SA"),
        ("A2304098T", exp, "Dwelling Investment CVM SA"),
        ("A2133251W", erp, "Estimated Resident Population Orig"),
        ("A2304113C", exp, "GNE CVM SA"),
        ("A2304114F", exp, "Exports CVM SA"),
        ("A2304115J", exp, "Imports CVM SA"),
        ("A2713849C", wpi, "WPI Index SA"),
        ("A84423043C", lfs, "Thousand Employed SA"),
        ("A84423050A", lfs, "Unemployment Rate SA"),
        ("A2303940R", ipd, "HHIPD Index CVM SA"),
        ("A2303727C", ipd, "GNEIPD Index CVM SA"),
        ("A2303730T", ipd, "GDPIPD Index CVM SA"),
        ("A2302939L", hhi, "Gross Disposable Income CP SA"),
        ("A124830484V", sas, "Public Final demand (CVM) (SA)"),
        ("A2304200A", kags, "Terms of Trade Index (SA)"),
        ("A129552325C", sas, "Non-farm compensation of employees per hour (CP) (SA)"),
    ]

    wanted = {
        series_id: (category_id, seo, title)
        for series_id, (category_id, seo), title in selections
    }
    return wanted, gdp_cat

In [10]:
def get_data() -> tuple[dict[str, pd.Series], pd.DataFrame, dict[str, str]]:
    """Retrieve the wanted ABS series, keyed by their short titles.

    Returns a three-part tuple:
    - a dictionary of the data series, keyed by short title.
    - a DataFrame of the ABS meta data, one row per short title.
    - a dictionary of source labels ("ABS: <category ID>"),
      keyed by short title, for use in chart footers."""

    wanted, gdp_cat = get_wanted_domains()
    national_accounts = f"ABS: {gdp_cat}"

    data: dict[str, pd.Series] = {}
    meta: dict[str, Any] = {}
    cat: dict[str, str] = {}
    for series_id, (category_id, seo, title) in wanted.items():
        if not all((series_id, category_id, seo, title)):
            continue  # skip any incompletely specified series
        series_frame, series_meta = ra.read_abs_series(
            category_id,
            series_id,
            single_excel_only=seo,
        )
        data[title] = series_frame[series_id]
        meta[title] = series_meta.loc[series_id]
        cat[title] = f"ABS: {category_id}"

    # -- GDP based implicit population
    # (GDP divided by GDP per capita)
    implied = "Million population"
    data[implied] = data["GDP CVM Orig"] / data["GDP per capita CVM Orig"]
    cat[implied] = national_accounts

    # See table H from the RBA Statistical Tables: 'Real household disposable income' is
    # household disposable income after the deduction of interest payments, deflated by
    # the implicit price deflator for household consumption expenditure; includes income
    # from unincorporated enterprises.
    # Not yet calculated - only the source label is recorded for now:
    # data["HHIPD Index rebase"] = (
    #    data["HHIPD Index CVM SA"] / data["HHIPD Index CVM SA"].iloc[-1]
    # )
    # data["Real Household Disposable Income"] = (
    #    data["Gross Disposable Income CP SA"] / data["HHIPD Index rebase"]
    # )
    cat["Real Household Disposable Income"] = national_accounts

    return data, pd.DataFrame(meta).T, cat


# fetch the ABS series once; the plotting functions below
# read abs_data and abs_cat as module-level globals
abs_data, abs_meta, abs_cat = get_data()
# abs_meta

## Plotting

### SOMP/ABS pairs

In [11]:
def plot_somp(
    ax: mpl.axes.Axes,
    df: pd.DataFrame,
    last_n: int = 0,
) -> None:
    """Add the quarterly SOMP forecasts to a plot.

    Arguments:
    ax -- the matplotlib Axes to which the forecasts are added.
    df -- the forecasts for one domain: one column per SOMP
        report, indexed by projection period. It is not modified.
    last_n -- plot only the last n columns of df; the default
        of 0 plots every column.

    Each column is drawn as its own line, labelled both in the
    legend and with text at its final point. The last column is
    always drawn in dark red. Columns with no data, or whose
    first data point is before START_YEAR, are not drawn."""

    if last_n:
        df = df[df.columns[-last_n:]]
    if len(df.columns) == 0:
        return  # nothing to plot

    gradient = np.linspace(0, 1, len(df.columns))
    colors = (
        plt.get_cmap("viridis")(gradient)
        if last_n == 0 or last_n > 2
        else ("cornflowerblue", "darkorange")
    )
    # line styles and markers are both cycled, so that any
    # number of report columns can be plotted
    styles = ("-", "--", "-.", ":")
    labelled_points = set()
    last = df.columns[-1]
    for count, (col, color) in enumerate(zip(df.columns, colors)):
        s = df[col].astype(float).dropna()  # kludge
        if s.empty or s.index[0].year < START_YEAR:
            continue
        color = "darkred" if col == last else color
        mark: dict[str, Any] = {"marker": MARKERS[count % len(MARKERS)], "ms": 2}
        # the label is the column name less its first two characters
        label = str(col)[2:]
        s.plot(
            ax=ax,
            lw=1,
            c=color,
            ls=styles[count % len(styles)],
            **mark,
            label=label,
        )
        x, y = s.index[-1], s.iloc[-1]
        # let's minimise over-plotting of text
        va = "bottom" if (x, y) not in labelled_points else "top"
        ax.text(x=x, y=y, s=label, fontsize=6, va=va)
        labelled_points.add((x, y))

In [12]:
def plot_somp_abs_pairs(start=f"{START_YEAR-1}-01-01", last_n=2) -> None:
    """Chart the SOMP forecasts over the matching ABS outcomes.

    One chart is produced for each (SOMP domain, ABS series) pair.
    Monthly ABS series are first converted to quarterly means.
    Except for the domains already expressed as a rate, the ABS
    series is converted to through-the-year percentage growth.

    Arguments:
    start -- the ABS data is plotted from this date onwards.
    last_n -- the number of the most recent SOMP reports to plot
        (passed to plot_somp(), where 0 means all reports)."""

    # identify the pairs of SOMP and ABS data
    lfs = "Quarterly mean monthly data from Labour Force Survey"
    somp_abs_pairs = [
        # ("SOMP data", "ABS/RBA", "footnote")
        ("Gross Domestic Product", "GDP CVM SA", ""),
        ("Household Consumption", "Household consumption CVM SA", ""),
        ("Gross National Expenditure", "GNE CVM SA", ""),
        ("Exports", "Exports CVM SA", ""),
        ("Imports", "Imports CVM SA", ""),
        ("Wage Price Index", "WPI Index SA", ""),
        ("Employment", "Thousand Employed SA", lfs),
        ("Unemployment Rate", "Unemployment Rate SA", lfs),
        ("Trimmed Mean Inflation", "CPI Index TM SA", ""),
        ("Consumer Price Index", "CPI Index Orig", ""),
        (
            "Estimated Resident Population",
            "Million population",
            "ERP implied from GDP and GDP per capita in ABS National Accounts. ",
        ),
        (
            "Estimated Resident Population",
            "Estimated Resident Population Orig",
            "ERP sourced from the ABS population estimates. ",
        ),
        ("Terms Of Trade", "Terms of Trade Index (SA)", ""),
        # -- Broken - something wrong with these ones - need to work out what
        # (
        #    "Nominal Average Earnings Per Hour (Non-Farm)",
        #    "Non-farm compensation of employees per hour (CP) (SA)",
        #    "",
        # ),
        # ("Labour Productivity", "Labour Productivity", ""),
        # ("Dwelling Investment", "Dwelling Investment CVM SA", ""),
        # ("Household Savings Rate (%)", "Household savings ratio SA", ""),
        # ("Public Demand", "Public Final demand (CVM) (SA)", ""),
        # ("Real Household Disposable Income", "Real Household Disposable Income", ""),
    ]

    # domains that are plotted as published, not as annual growth
    plotted_as_is = ["Unemployment Rate", "Household Savings Rate (%)"]

    # plot the data
    for i, (somp_name, abs_name, footnote) in enumerate(somp_abs_pairs):
        actual = abs_data[abs_name].loc[start:]

        # the SOMP forecasts are quarterly, so monthly data is averaged
        is_monthly = cast(pd.PeriodIndex, actual.index).freqstr[0] == "M"
        if is_monthly:
            actual = ra.monthly_to_qtly(actual, q_ending="DEC", f="mean")

        if somp_name in plotted_as_is:
            title = ""
        else:
            # growth over four quarters, in per cent
            actual = actual.pct_change(periods=4).dropna() * 100.0
            title = "Annual Growth"
        actual.name = somp_name

        ax = actual.plot(lw=2, color="darkorange")
        plot_somp(ax, somp[somp_name], last_n=last_n)
        finalise_plot(
            ax,
            title=f"SOMP: {somp_name} {title}",
            ylabel=f"% {title}",
            legend={"loc": "best", "fontsize": 9, "ncol": 3},
            lfooter=f"Australia. {footnote}",
            rfooter=f"{abs_cat[abs_name]}, RBA: SOMP.",
            tag=f"{i}-{last_n}",  # keeps each chart distinct
            y0=True,
            show=SHOW,
            file_type=FILE_TYPE,
        )


# three sets of charts: with the latest SOMP report only, with the
# latest two reports, and with every report (last_n=0 means all)
for n in 1, 2, 0:
    plot_somp_abs_pairs(last_n=n)

### Cash rate (market-based assumptions going forward)

In [13]:
def plot_cr(somp_data: DomainSompDict) -> None:
    """Chart the official cash rate, together with the cash
    rate assumptions from the two most recent SOMP reports.

    Arguments:
    somp_data -- the SOMP data keyed by domain; only the
        "Cash Rate (%)" table is used."""

    # the official cash rate history, from START_YEAR, as a step function
    history = ra.read_rba_ocr()
    from_start = cast(pd.PeriodIndex, history.index).year >= START_YEAR
    ax = history[from_start].plot(lw=2, color="darkorange", drawstyle="steps-post")

    # the quarterly SOMP assumptions, placed on a monthly index
    # (without interpolation) to match the monthly history
    assumptions = ra.qtly_to_monthly(somp_data["Cash Rate (%)"], interpolate=False)
    plot_somp(ax, assumptions, last_n=2)

    left_footer = (
        "Australia. OCR plotted on an end-of-month basis. "
        "Quarterly forward assumptions plotted against the last month in the quarter. "
    )
    finalise_plot(
        ax,
        title="SOMP: Official Cash Rate",
        ylabel="%",
        legend={"loc": "best", "fontsize": 9, "ncol": 3},
        lfooter=left_footer,
        rfooter="RBA: SOMP.",
        y0=True,
        show=SHOW,
        file_type=FILE_TYPE,
    )


# produce the cash rate chart from the domain-keyed SOMP data
plot_cr(somp)

### Trade Weighted Index (constant assumption going forward)

In [14]:
def plot_twi():
    """Chart the Trade Weighted Index (TWI), together with the
    TWI assumptions from the two most recent SOMP reports.

    The TWI history is series FXRTWI from RBA table F11.1. The
    SOMP assumptions are read from the module-level somp data."""

    # the TWI history, from START_YEAR (a line, not drawstyle="steps-post")
    twi_df, _twi_meta = ra.read_rba_table(table="F11.1")
    twi = twi_df["FXRTWI"]
    from_start = cast(pd.PeriodIndex, twi.index).year >= START_YEAR
    ax = twi[from_start].plot(lw=2, color="darkorange")

    # place the quarterly SOMP assumptions on a daily index
    quarterly = somp["Trade-Weighted Index (Index)"]
    at_quarter_end = quarterly.to_timestamp(how="end", freq="D")
    daily = at_quarter_end.resample("D").agg("last")
    # NOTE(review): presumably the 45 day shift moves each value from the
    # end of its quarter to around the middle of the quarter - confirm
    daily.index = pd.PeriodIndex(daily.index, freq="D") - 45  # days
    plot_somp(ax, daily, last_n=2)

    finalise_plot(
        ax,
        title="SOMP: Trade Weighted Index",
        ylabel="Index",
        legend={"loc": "best", "fontsize": 9, "ncol": 3},
        lfooter="Australia. ",
        rfooter="RBA: SOMP.",
        y0=True,
        show=SHOW,
        file_type=FILE_TYPE,
    )


# produce the Trade Weighted Index chart
plot_twi()

## Finished

In [15]:
# record the Python, machine, conda environment and imported
# package versions used for this run of the notebook
%load_ext watermark
%watermark --python --machine --conda --iversions --watermark

Python implementation: CPython
Python version       : 3.13.1
IPython version      : 8.31.0

conda environment: 313

Compiler    : Clang 18.1.8 
OS          : Darwin
Release     : 24.2.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

matplotlib: 3.10.0
pandas    : 2.2.3
numpy     : 2.2.2
readabs   : 0.0.26

Watermark: 2.5.0



In [16]:
# a visible marker that the notebook ran through to the last cell
print("Finished.")

Finished.
