# RBA SOMP Forecasts

## Set-up

In [1]:
# system imports
import io
import textwrap
from pathlib import Path
from typing import NewType, cast, Any
from urllib.error import HTTPError
from datetime import datetime
from io import StringIO

# analytic imports
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import readabs as ra
from mgplot import clear_chart_dir, finalise_plot, set_chart_dir

# internet imports
import webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
print(f"webdriver_manager version: {webdriver_manager.__version__}")

webdriver_manager version: 4.0.2


In [2]:
# chart output settings
CHART_DIR = "./CHARTS/SOMP/"
set_chart_dir(CHART_DIR)
clear_chart_dir()
SHOW = False  # passed as show= to every finalise_plot() call
FILE_TYPE = "png"  # passed as file_type= to every finalise_plot() call

# matplotlib marker codes, one character each, cycled through by
# plot_somp() - note that "D" appears twice in the cycle
MARKERS = list("osDxPHv^<>12348p*h+XD")

## Data acquisition from (1) SOMP, (2) ABS, (3) RBA

### Typing information for SOMP data

In [3]:
# The SOMP data is held in two layouts, each with its own frame type
# (both are plain pandas DataFrames as far as the runtime is concerned).

# A dictionary that holds DataFrame data keyed by report dates
# rows in df are subject domains, columns are projection dates
DateSompFrame = NewType("DateSompFrame", pd.DataFrame)


class DateSompDict(dict[str, DateSompFrame]):
    """A dictionary of DateSompFrame DataFrames keyed by str report dates.

    This is the layout returned by collect_somp_data(): one table per
    SOMP report, as scraped."""


# A dictionary that holds DataFrame data keyed by subject domain
# rows in df are projection dates, columns are report dates
DomainSompFrame = NewType("DomainSompFrame", pd.DataFrame)


class DomainSompDict(dict[str, DomainSompFrame]):
    """A dictionary of DomainSompFrame DataFrames keyed by str subject domain.

    This is the layout returned by reorganise_somp(): one table per
    subject domain, as used for plotting."""

### Get raw SOMP tables by date

In [4]:
# Note: inclusive search terms:
# NOTE(review): neither constant is referenced anywhere else in the
# visible notebook - collect_somp_data() sets its own start and stop
# years. Presumably left over from an earlier version; confirm before
# relying on (or removing) them.
START_YEAR = 2024
STOP_YEAR = datetime.now().year

In [5]:
def data_cleaning(frame: DateSompFrame) -> DateSompFrame:
    """Preliminary data cleaning for a raw SOMP table
    captured from the RBA website.

    Every column is converted to numeric data: the fraction characters
    ¼, ½ and ¾ become decimal suffixes (so "3¼" reads as 3.25), the
    Unicode minus sign becomes an ASCII hyphen, and anything that is
    still not a number becomes NaN. Rows that are entirely NaN are then
    dropped, a trailing "(a)"-style footnote marker is removed from each
    index label, and the index labels are title-cased.

    Arguments:
    frame -- a raw table with subject domains as the row index.
             It is not modified.
    Returns:
    A new, cleaned DataFrame."""

    replacements = {"¼": ".25", "½": ".5", "¾": ".75", "−": "-"}

    # work on a copy so that the caller's table is left untouched
    cleaned = frame.copy()
    for col in cleaned.columns:
        # convert text fractions to decimals (literal, not regex, matching)
        text = cleaned[col].astype(str)
        for symbol, plain in replacements.items():
            text = text.str.replace(symbol, plain, regex=False)

        # make to NaN all non-numeric data
        cleaned[col] = pd.to_numeric(text, errors="coerce")

    # delete any all-NaN rows - Needed for the Feb 24 report
    cleaned = cleaned.dropna(how="all", axis="index")

    # remove odd footnotes from index introduced in Feb 2024
    # and standardize the case used in the index
    # (because of case mismatch before and after Feb 2024)
    cleaned.index = cleaned.index.str.replace(
        r"\([a-z]\)$", "", regex=True
    ).str.title()

    return cast(DateSompFrame, cleaned)

In [6]:
# Note: This cell was re-written on 23/May/2025. The old way
# of doing this failed following the release of Q2 2025 SOMP.
# This new approach does not cache the data.

def collect_somp_data(
) -> DateSompDict:
    """Collect raw SOMP data, as presented in the RBA SOMP
    reports. Returns one table for each report in a dict,
    keyed by the report date (eg. "2025Q3"). The data is cleaned
    and converted to a DataFrame. The DataFrame has subject
    domains as rows and forecast dates as columns.

    Each report page is loaded in headless Chrome and the last
    table on the page is taken to be the forecast table. Reports
    from 2019 up to the latest report that could have been
    published by today are collected. The browser is always shut
    down, even when a page fails to load or parse.

    Raises:
    ValueError -- (from pd.read_html) if a page has no tables, unless
                  it is the newest candidate report, which is skipped."""

    # the four reports each year, by quarter number
    report_months = {1: "feb", 2: "may", 3: "aug", 4: "nov"}
    first_year = 2019

    # read the clock once so the year and month agree with each other
    now = datetime.now()
    # number of report months reached so far this year:
    # Jan -> 0, Feb-Apr -> 1, May-Jul -> 2, Aug-Oct -> 3, Nov-Dec -> 4
    latest_quarter = (now.month + 1) // 3

    service = ChromeService(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--incognito")
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=service, options=options)

    pool = DateSompDict()
    try:
        driver.implicitly_wait(5)
        for year in range(first_year, now.year + 1):
            final_quarter = latest_quarter if year == now.year else 4
            for quarter in range(1, final_quarter + 1):
                index = f"{year}Q{quarter}"
                qtext = report_months[quarter]

                # the page name differs for reports before 2024
                page = "forecasts" if year < 2024 else "outlook"
                url = (
                    "https://www.rba.gov.au/publications/smp/"
                    f"{year}/{qtext}/{page}.html"
                )

                driver.get(url)
                try:
                    tables = pd.read_html(StringIO(driver.page_source))
                except ValueError:
                    # read_html raises ValueError when it finds no tables.
                    # Tolerate that only for the newest candidate report,
                    # presumably because it has not been published yet.
                    if (year, quarter) != (now.year, latest_quarter):
                        raise
                    print(f"{index}: no tables found at {url} - skipped")
                    continue

                # use the first column as the row index, then drop it
                target = tables[-1]
                target.index = target[target.columns[0]]
                target.index.name = "Index"
                target = target.drop(columns=[target.columns[0]])
                print(len(target))
                pool[index] = data_cleaning(target)
    finally:
        # quit() ends the browser session whether or not the scrape worked
        driver.quit()

    return pool


# scrape every report now: one cleaned table per report, keyed by
# report date (drives headless Chrome against the RBA website)
raw_somp = collect_somp_data()

18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


18


34


34


34


34


34


34


34


### Reorganise SOMP data into a dictionary keyed by domain

In [7]:
def reorganise_somp(inputs: DateSompDict) -> DomainSompDict:
    """Pivot the SOMP tables from one-per-report to one-per-domain.

    Arguments:
    inputs -- a dictionary of tables keyed by report date. Each table
              has subject domains as rows and forecast dates as columns.
    Returns:
    A dictionary of tables keyed by subject domain. In each table the
    row labels are the periods being forecast (a Q-DEC PeriodIndex)
    and the column labels are the SOMP report dates (a Q-NOV
    PeriodIndex). Every row label that contains "Unemployment Rate"
    is filed under that one key."""

    jobless = "Unemployment Rate"
    by_domain = DomainSompDict()
    for report, table in inputs.items():
        for label in table.index:
            if label[0] != "(":  # labels opening with "(" are footnotes
                forecasts = table.loc[label]
                forecasts.name = report  # becomes the column label
                domain = jobless if jobless in label else label
                column = cast(DomainSompFrame, pd.DataFrame(forecasts))
                if domain in by_domain:
                    # append this report's forecasts as one more column
                    by_domain[domain] = cast(
                        DomainSompFrame,
                        pd.concat([by_domain[domain], column], axis=1),
                    )
                else:
                    by_domain[domain] = column

    # rebuild every table with period-typed column and row labels
    return DomainSompDict(
        (
            domain,
            cast(
                DomainSompFrame,
                pd.DataFrame(
                    gathered.values,
                    columns=pd.PeriodIndex(gathered.columns, freq="Q-NOV"),
                    index=pd.PeriodIndex(gathered.index, freq="Q-DEC"),
                ),
            ),
        )
        for domain, gathered in by_domain.items()
    )


# pivot to one table per subject domain, then display the
# names of the domains found (wrapped for readability)
somp = reorganise_somp(raw_somp)
textwrap.wrap(", ".join(somp.keys()), width=85)

['Gross Domestic Product, Household Consumption, Dwelling Investment, Business',
 'Investment, Public Demand, Gross National Expenditure, Imports, Exports, Real',
 'Household Disposable Income, Terms Of Trade, Major Trading Partner (Export-Weighted)',
 'Gdp, Unemployment Rate, Employment, Wage Price Index, Nominal (Non-Farm) Average',
 'Earnings Per Hour, Trimmed Mean Inflation, Consumer Price Index, Hours-Based',
 'Underutilisation Rate (Quarterly, %), Nominal Average Earnings Per Hour (Non-Farm),',
 'Cash Rate (%), Trade-Weighted Index (Index), Brent Crude Oil Price (Us$/Bbl),',
 'Estimated Resident Population, Labour Productivity, Household Savings Rate (%), Real',
 'Wage Price Index, Real Average Earnings Per Hour (Non-Farm), Household Saving Rate',
 '(%)']

### Get key ABS data

In [8]:
def wanted_abs_series() -> dict[str, tuple[str, str, str]]:
    """Produce a dictionary of data items that can be
    downloaded from the ABS website, in the form of:
    -   key: the unique ABS series identifier
    -   value: a tuple containing
        -   the ABS catalogue number,
        -   the excel file/table name (without the .xlsx suffix), and
        -   a unique series handle for referencing the data."""

    gdp_cat = "5206.0"
    gdp_kags = "5206001_Key_Aggregates"
    gdp_hhc = "5206008_Household_Final_Consumption_Expenditure"
    gdp_exp = "5206002_Expenditure_Volume_Measures"
    # gdp_exp_cp = "5206003_Expenditure_Current_Price"
    gdp_tax = "5206022_Taxes"
    gdp_ipd = "5206005_Expenditure_Implicit_Price_Deflators"
    gdp_hhi = "5206020_Household_Income"
    gdp_sas = "5206024_Selected_Analytical_Series"
    gdp_pay = "5206023_Social_Assistance_Benefits"
    cpi_cat, cpi_seo = "6401.0", "640106"
    wpi_cat, wpi_seo = "6345.0", "634501"
    lfs_cat, lfs_seo = "6202.0", "6202001"
    lfs_uu = "6202022"
    erp_cat, erp_seo = "3101.0", "310101"
    bldg_cat, bldg_seo = "8752.0", "8752003"
    # mdb_cat, mdb_seo = "1364.0.15.003", "1364015004"

    wanted = {
        # "Series ID": ["Catalogue ID", "excel table name", "ABS series name"]
        # -- Indexes
        "A2325846C": (cpi_cat, cpi_seo, "CPI Index Orig"),
        "A3604506F": (cpi_cat, cpi_seo, "CPI Index SA"),
        "A3604509L": (cpi_cat, cpi_seo, "CPI Index TM SA"),
        "A2303940R": (gdp_cat, gdp_ipd, "HHIPD Index CVM SA"),
        "A2303727C": (gdp_cat, gdp_ipd, "GNEIPD Index CVM SA"),
        "A2303730T": (gdp_cat, gdp_ipd, "GDPIPD Index CVM SA"),
        "A2713849C": (wpi_cat, wpi_seo, "WPI Index SA"),
        "A2603609J": (wpi_cat, wpi_seo, "WPI Index Orig"),
        # -- Employment
        "A84423043C": (lfs_cat, lfs_seo, "Thousand Employed SA"),
        "A84423050A": (lfs_cat, lfs_seo, "Unemployment Rate SA"),
        "A85255726K": (lfs_cat, lfs_uu, "Underutilisation rate Persons SA"),
        # -- Population
        "A2133251W": (erp_cat, erp_seo, "Estimated Resident Population Orig"),
        "A2302460K": (gdp_cat, gdp_kags, "GDP per capita CVM Orig"),  # for population
        "A2302459A": (gdp_cat, gdp_kags, "GDP CVM Orig"),  # for population
        # -- GDP
        "A2304402X": (gdp_cat, gdp_kags, "GDP CVM SA"),
        "A2323382F": (gdp_cat, gdp_kags, "Household savings ratio SA"),
        "A2303280V": (gdp_cat, gdp_hhc, "Household consumption CVM SA"),
        "A2304113C": (gdp_cat, gdp_exp, "GNE CVM SA"),
        "A2304114F": (gdp_cat, gdp_exp, "Exports CVM SA"),
        "A2304115J": (gdp_cat, gdp_exp, "Imports CVM SA"),
        "A2302939L": (gdp_cat, gdp_hhi, "Gross Disposable Income CP SA"),
        "A2302777J": (gdp_cat, gdp_tax, "Taxes on income CP SA"),
        "A2304200A": (gdp_cat, gdp_kags, "Terms of Trade Index (SA)"),
        "A2302589X": (gdp_cat, gdp_sas, "Non-farm GDP CVM SA"),
        "A2302607T": (gdp_cat, gdp_sas, "Non-farm total compensation employees CP SA"),
        "A2304100T": (gdp_cat, gdp_exp, "Business Investment CVM SA"),
        "A129552325C": (
            gdp_cat,
            gdp_sas,
            "NF hourly pay CP SA",
        ),
        "A124830484V": (gdp_cat, gdp_sas, "Public Final demand CVM SA"),
        "A2301976F": (gdp_cat, gdp_pay, "Social Assistance Benefits CP Orig"),
        # -- Other
        "A83770795V": (bldg_cat, bldg_seo, "Dwelling Investment CVM SA"),
    }

    # check the handles are unique
    handles = [v[2] for v in wanted.values()]
    assert len(handles) == len(set(handles))

    return wanted

In [9]:
def get_abs_data() -> tuple[dict[str, pd.Series], pd.DataFrame, dict[str, str]]:
    """Download every data item listed by wanted_abs_series().

    Returns a tuple comprising:
    -   a dictionary of pandas Series, one for each ABS series,
        keyed by the unique handle from wanted_abs_series().
    -   a DataFrame of the ABS metadata, with one row per handle.
        (Useful for checking what was collected, but it has no
        entries for series that are calculated afterwards.)
    -   a dictionary that maps each handle to the text
        "ABS: <catalogue number>", for use in chart footnotes."""

    series_by_handle, meta_by_handle, source_by_handle = {}, {}, {}
    for abs_id, (cat_no, table, handle) in wanted_abs_series().items():
        if not (abs_id and cat_no and table and handle):
            continue  # an incomplete entry is skipped without comment
        values, info = ra.read_abs_series(cat_no, abs_id, single_excel_only=table)
        series_by_handle[handle] = values[abs_id]
        meta_by_handle[handle] = info.loc[abs_id]
        source_by_handle[handle] = f"ABS: {cat_no}"

    # the metadata is transposed so that each handle is a row
    return series_by_handle, pd.DataFrame(meta_by_handle).T, source_by_handle


# download the ABS data now and report how many items arrived
hist_data, abs_meta, right_footnotes = get_abs_data()
print(f"{len(abs_meta)} data items collected from the ABS.")
# Note: abs_meta is not used after this point - but useful to
# have here for checking that we collected what we wanted.

30 data items collected fromthe ABS.


In [10]:
def additional_calculations(
    data: dict[str, pd.Series],
    footnotes: dict[str, str],
) -> tuple[dict[str, pd.Series], dict[str, str]]:
    """Derive further series from the ABS data, to
    (re-)create the metrics used by the RBA.

    Five entries are added, in place, to both dictionaries:
    "GDP population", "NF labour productivity", "Real WPI",
    "Real hourly pay" and "Real Household Disposable Income".

    Arguments:
    data -- the ABS series, keyed by handle.
    footnotes -- the source footnote for each handle.
    Returns:
    The same two dictionary objects that were passed in."""

    # -- population implied by GDP: total GDP over GDP per capita
    gdp_total = data["GDP CVM Orig"]
    gdp_per_head = data["GDP per capita CVM Orig"]
    data["GDP population"] = gdp_total / gdp_per_head
    footnotes["GDP population"] = footnotes["GDP per capita CVM Orig"]

    # -- non-farm labour productivity: output per hour, where the
    #    hours are total compensation divided by hourly pay
    nf_pay_bill = data["Non-farm total compensation employees CP SA"]
    nf_hours = nf_pay_bill / data["NF hourly pay CP SA"]
    data["NF labour productivity"] = data["Non-farm GDP CVM SA"] / nf_hours
    footnotes["NF labour productivity"] = footnotes["Non-farm GDP CVM SA"]

    # -- real WPI: WPI deflated by CPI. The [5:] slice drops the first
    #    five characters of the CPI footnote before it is appended
    #    (the "ABS: " prefix, when the footnote comes from get_abs_data()).
    data["Real WPI"] = data["WPI Index SA"] / data["CPI Index SA"]
    footnotes["Real WPI"] = ", ".join(
        (footnotes["WPI Index SA"], footnotes["CPI Index SA"][5:])
    )

    # -- real average earnings per hour (non-farm): hourly pay over CPI
    data["Real hourly pay"] = data["NF hourly pay CP SA"] / data["CPI Index SA"]
    footnotes["Real hourly pay"] = ", ".join(
        (footnotes["NF hourly pay CP SA"], footnotes["CPI Index SA"][5:])
    )

    # -- real household disposable income: gross disposable income over
    #    the household deflator (interest payments are not subtracted)
    nominal_income = data["Gross Disposable Income CP SA"]
    data["Real Household Disposable Income"] = (
        nominal_income / data["HHIPD Index CVM SA"]
    )
    footnotes["Real Household Disposable Income"] = footnotes[
        "Gross Disposable Income CP SA"
    ]

    return data, footnotes


# add the derived series, and their footnotes, to the ABS data
hist_data, right_footnotes = additional_calculations(hist_data, right_footnotes)

## Plotting

### SOMP/ABS pairs

In [11]:
def get_somp_abs_pairs() -> list[tuple[str, str, str]]:
    """List the SOMP forecasts that are charted against ABS data.

    Returns a list of tuples, each containing:
    -   the SOMP data series name,
    -   the ABS data series name, and
    -   the left-hand footnote to be used in the chart."""

    # -- footnote fragments: each one ends with a full-stop and a space
    original = "Original series. "
    seas_adj = "Seasonally adjusted. "
    volume = "CVM. " + seas_adj
    labour_force = (
        "Quarterly mean monthly data from Labour Force Survey. " + seas_adj
    )
    deflated = "Calculated using CPI. " + seas_adj

    # -- pairs that chart well:
    #    (SOMP series name, ABS series name, left-side footnote)
    pairs = [
        ("Gross Domestic Product", "GDP CVM SA", volume),
        ("Household Consumption", "Household consumption CVM SA", volume),
        ("Gross National Expenditure", "GNE CVM SA", volume),
        ("Exports", "Exports CVM SA", volume),
        ("Imports", "Imports CVM SA", volume),
        ("Wage Price Index", "WPI Index SA", seas_adj),
        ("Employment", "Thousand Employed SA", labour_force),
        ("Unemployment Rate", "Unemployment Rate SA", labour_force),
        ("Trimmed Mean Inflation", "CPI Index TM SA", seas_adj),
        ("Consumer Price Index", "CPI Index Orig", original),
    ]

    # -- the SOMP population forecast is charted against two ABS measures
    somp_population = "Estimated Resident Population"
    pairs += [
        (
            somp_population,
            "GDP population",
            "Population implied in ABS National Accounts. " + original,
        ),
        (
            somp_population,
            "Estimated Resident Population Orig",
            "ABS population estimates. " + original,
        ),
        ("Terms Of Trade", "Terms of Trade Index (SA)", ""),
        ("Labour Productivity", "NF labour productivity", "Non-farm. " + seas_adj),
    ]

    # -- these look a little off - but not worth worrying about.
    pairs += [
        ("Real Wage Price Index", "Real WPI", deflated),
        ("Real Average Earnings Per Hour (Non-Farm)", "Real hourly pay", deflated),
        ("Public Demand", "Public Final demand CVM SA", volume),
    ]

    # -- Something really wrong with these ones - need to work out what
    # ("Real Household Disposable Income", "Real Household Disposable Income", ""),
    # ("Nominal Average Earnings Per Hour (Non-Farm)", "NF hourly pay CP SA", sa),
    # ("Business Investment", "Business Investment CVM SA", sa),
    # ("Dwelling Investment", "Dwelling Investment CVM SA", ""),
    # ("Household Savings Rate (%)", "Household savings ratio SA", sa),
    # ("Hours-Based Underutilisation Rate (Quarterly, %)",
    #  "Underutilisation rate Persons SA", lfs),
    # --- still to think about
    # Major Trading Partner (Export-Weighted) GDP
    # Brent Crude Oil Price (Us$/Bbl)

    return pairs

In [12]:
def plot_somp(
    ax: mpl.axes.Axes,
    df: DomainSompFrame,
    last_n: int = 0,
    odd_qtr: int = 0,  # 0 = all, 1 = odd, 2 = even
) -> None:
    """Add the quarterly SOMP forecasts to a plot.

    One line is drawn for each column of df that has data, and the
    column name is written at the end of each line. The last column
    is always drawn in dark red. Markers and line styles are cycled,
    so any number of columns can be drawn.

    Arguments:
    ax -- the matplotlib axes to draw on.
    df -- forecasts for one subject domain, one column per SOMP report.
          It is not modified.
    last_n -- only draw the last N columns; 0 draws every column.
    odd_qtr -- 0 draws every report; an odd number draws only the
               reports from quarters 1 and 3; an even (non-zero)
               number draws only those from quarters 2 and 4."""

    df = cast(DomainSompFrame, df.copy())  # non destructive
    df = cast(DomainSompFrame, df[df.columns[-last_n:]] if last_n else df)
    col_count = len(df.columns)
    if col_count == 0:
        return  # no reports to draw

    gradient = np.linspace(0, 1, col_count)
    colors = (
        plt.get_cmap("viridis")(gradient)
        if col_count > 2
        else ("royalblue", "indianred")
    )
    # the report quarters to keep when odd_qtr is in use
    keep = (2, 4) if odd_qtr % 2 == 0 else (1, 3)
    styles = ("-", "--", "-.", ":")
    last = df.columns[-1]
    xy = set()  # the end-points that have already been labelled
    for count, (col, color) in enumerate(zip(df.columns, colors)):
        rba_series = df[col].astype(float).dropna()  # kludge
        if rba_series.empty:
            continue
        if odd_qtr and pd.Period(col, freq="Q").quarter not in keep:
            continue
        color = "darkred" if col == last else color
        mark: dict[str, Any] = {"marker": MARKERS[count % len(MARKERS)], "ms": 2}
        # the label drops the first two characters of the column name
        # (the century, when the name looks like 2025Q3)
        label = str(col)[2:]
        rba_series.plot(
            ax=ax,
            lw=1,
            c=color,
            ls=styles[count % len(styles)],  # wrap around, as for markers
            **mark,
            label=label,
        )
        x, y = rba_series.index[-1], rba_series.iloc[-1]
        # let's minimise over-plotting of text
        va = "bottom" if (x, y) not in xy else "top"
        ax.text(x=x, y=y, s=label, fontsize=6, va=va)
        xy.add((x, y))

In [13]:
def plot_somp_abs_pairs(
    previous_years=6,
    last_n=2,
    odd_qtr=False,
) -> None:
    """Chart each SOMP forecast against the matching ABS history.

    One chart is produced for every pair from get_somp_abs_pairs().
    The ABS series is plotted as annual percentage growth, except
    for the rate series, which are plotted as they are. The data
    comes from the notebook-level hist_data, somp and
    right_footnotes objects.

    Arguments:
    previous_years -- the years of ABS history to show (four
                      observations per year, plus one).
    last_n -- passed to plot_somp(): the number of the most recent
              SOMP reports to draw (0 draws them all).
    odd_qtr -- passed to plot_somp(): limits which reports are drawn."""

    # these SOMP series are already rates, so no growth calculation
    plotted_as_level = (
        "Unemployment Rate",
        "Household Savings Rate (%)",
        "Hours-Based Underutilisation Rate (Quarterly, %)",
    )
    first_obs = -(4 * previous_years) - 1
    legend_size = "xx-small" if last_n > 5 else "x-small"

    for chart_no, (somp_name, abs_name, note) in enumerate(get_somp_abs_pairs()):
        history = hist_data[abs_name]
        if cast(pd.PeriodIndex, history.index).freqstr[0] == "M":
            # monthly data is averaged to quarters
            history = ra.monthly_to_qtly(history, q_ending="DEC", f="mean")

        if somp_name in plotted_as_level:
            measure, kind = history, ""
        else:
            annual = history.pct_change(
                periods=4, fill_method=None  # type: ignore[arg-type]
            )
            measure, kind = annual.dropna() * 100.0, "Annual Growth"
        measure.name = somp_name

        ax = measure.iloc[first_obs:].plot(lw=2, color="darkorange")
        plot_somp(ax, somp[somp_name], last_n=last_n, odd_qtr=odd_qtr)
        finalise_plot(
            ax,
            title=f"SOMP: {somp_name} {kind}",
            ylabel=f"% {kind}",
            xlabel=None,
            legend={"loc": "best", "fontsize": legend_size, "ncol": 3},
            lfooter=f"Australia. {note}",
            rfooter=f"{right_footnotes[abs_name]}, RBA: SOMP.",
            tag=f"{chart_no}-{last_n}",
            y0=True,
            show=SHOW,
            file_type=FILE_TYPE,
        )


# three sets of charts: the latest SOMP report only (1), the
# latest two reports (2), and every report collected (0)
for n in 1, 2, 0:
    plot_somp_abs_pairs(last_n=n)
# plot_somp_abs_pairs(last_n=16, odd_qtr=2) # less dense long run

### Cash rate (market based assumptions going forward)

In [14]:
def plot_cr(somp_data: DomainSompDict) -> None:
    """Produce a cash rate chart.

    The official cash rate history from the RBA is drawn as a step
    line, and the cash rate assumptions from the two most recent
    SOMP reports are overlaid on it.

    Arguments:
    somp_data -- the SOMP tables keyed by subject domain; only the
                 "Cash Rate (%)" table is used."""

    last_n = 2  # the number of most recent SOMP reports to overlay
    # presumably monthly data, making this five years plus one month
    history_obs = 12 * 5 + 1

    # plot the official cash rate history
    ocr_rba = ra.read_rba_ocr()
    ax = ocr_rba.iloc[-history_obs:].plot(
        lw=2, color="darkorange", drawstyle="steps-post"
    )

    # put the quarterly assumptions on a monthly index, without
    # filling in the months in between
    m_somp = cast(
        DomainSompFrame,
        ra.qtly_to_monthly(somp_data["Cash Rate (%)"], interpolate=False),
    )
    plot_somp(ax, m_somp, last_n=last_n)

    # size the legend text by the number of reports actually drawn,
    # which is the rule plot_somp_abs_pairs() uses
    fs = "xx-small" if last_n > 5 else "x-small"
    finalise_plot(
        ax,
        title="SOMP: Official Cash Rate",
        ylabel="%",
        legend={"loc": "best", "fontsize": fs, "ncol": 3},
        lfooter="Australia. OCR plotted on an end-of-month basis. "
        + "Quarterly forward assumptions plotted against the last month in the quarter. ",
        rfooter="RBA: SOMP.",
        y0=True,
        show=SHOW,
        file_type=FILE_TYPE,
    )


# chart the cash rate history against the latest SOMP assumptions
plot_cr(somp)

## Finished

In [15]:
%load_ext watermark
%watermark -u -t -d --iversions --watermark --machine --python --conda

Last updated: 2025-09-04 08:30:23

Python implementation: CPython
Python version       : 3.13.6
IPython version      : 9.4.0

conda environment: n/a

Compiler    : Clang 20.1.4 
OS          : Darwin
Release     : 24.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

matplotlib       : 3.10.5
numpy            : 2.3.2
pandas           : 2.3.1
pathlib          : 1.0.1
typing           : 3.10.0.0
webdriver_manager: 4.0.2
readabs          : 0.1.4
selenium         : 4.34.2
mgplot           : 0.2.12

Watermark: 2.5.0



In [16]:
print("Finished.")

Finished.
