## Python set-up

In [1]:
# system imports
from math import isfinite
from functools import cache

# analytic imports
import matplotlib.pyplot as plt
import pandas as pd
import readabs as ra
from IPython.display import display
import statsmodels.api as sm  # type: ignore
from mgplot import (
    line_plot_finalise,
    finalise_plot,
    set_chart_dir,
    clear_chart_dir
)


In [2]:
# local imports
from abs_helper import QUARTERLY_CPI_RANGE
from henderson import hma

In [3]:
# pandas display settings
pd.options.display.max_rows = 999999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 100

# save charts in this notebook
plt.style.use("fivethirtyeight")
CHART_DIR = "./CHARTS/Inflation-Model/"
set_chart_dir(CHART_DIR)
clear_chart_dir()

# display charts in this notebook
SHOW = False

## Data capture

 ### Identify the data we want to use

In [4]:
@cache
def get_data() -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get a dictionary of data items from the ABS."""

    wanted = {
        # "Series ID": ["Category ID", "single-excel-only table name", "Short Series Title"],
        "A3604510W": ["6401.0", "640106", "CPI_TM"],  # change from previous quarter
        "A2314867K": ["6427.0", "642701", "PPI"],  # change from previous year
        "A83895396W": ["6345.0", "634501", "WPI"],  # change from previous year
        "A84423050A": ["6202.0", "6202001", "UER"],  # percent of labour force
    }

    data, meta = {}, {}
    for series_id, (category_id, seo, title) in wanted.items():
        d, m = ra.read_abs_series(category_id, series_id, single_excel_only=seo)
        data[title] = d[series_id]
        meta[title] = m.loc[series_id]
    return data, pd.DataFrame(meta).T


_, META = get_data()
# check we have the correct variables
# META

### Collect and marshall that data
Assume we are only doind a one period prediction

In [5]:
def collect_data(subs: dict | None = None) -> tuple[pd.DataFrame, pd.Index, pd.Series]:
    """Collect the data for the inflation model.
    Arguments:
        subs: Dictionary of substitutions for forward values
    Returns:
        data: DataFrame containing the data
        exogenous: Index of exogenous variable names
        endogenous: Series of endogenous variables (ie CPI_TM)"""

    data_dict, _meta = get_data()
    data_dict["UER"] = ra.monthly_to_qtly(data_dict["UER"])
    data = pd.concat(data_dict, axis=1)

    # forward for one period prediction
    last = data.loc[data.index[-1], "CPI_TM"]
    if isfinite(last):
        next_period = data.index[-1] + 1
        new_index = data.index.append(pd.PeriodIndex([next_period]))
        data = data.reindex(new_index)

    # populate forward exogenous guesses into the model
    last = data.index[-1]
    if subs is not None:
        for k, v in subs.items():
            if subs == "CPI_TM" or isfinite(data.loc[last, k]):
                continue
            data.loc[last, k] = v

    # Adjust missing values in PPI and WPI because they are published after CPI
    # so we may need rolling average approximations to plug in
    for s in ("PPI", "WPI"):
        _ = "Do noting if next lines commented out ..."
        data[f"{s}_ADJ"] = data[s].where(
            data[s].notna(), other=data[s].rolling(4).mean().ffill()
        )
        # data = data.drop(columns=s)

    # smooth WPI/PPI
    h = 7
    for s in ("WPI", "PPI"):
        _ = "Do noting if next lines commented ..."
        data[f"{s}_HMA[{h}]"] = hma(data[f"{s}_ADJ"].dropna(), h)

    # provide differences
    for col in ("PPI_ADJ", "WPI_ADJ", "UER"):
        data[f"Δ{col}"] = data[f"{s}_ADJ"].diff(1)

    # add autoregression
    for ar in (1, 2):
        data[f"CPI_TM-{ar}"] = data["CPI_TM"].shift(ar)

    # add the constant
    data["const"] = 1.0

    # Add a COVID dummy
    data["covid"] = 0
    mask = (data.index >= "2020Q1") & (data.index <= "2021Q1")
    data.loc[mask, "covid"] = 1

    # remove early nans
    remember = data.iloc[-1]
    data = pd.concat([data.dropna(), remember.to_frame().T])

    # exogenous and out-of-sample endogenous where known variables
    exogenous = data.columns.difference(["CPI_TM"])
    endogenous = data.index[-1]
    return data, exogenous, endogenous


DATA, EXOG, ENDOG = collect_data(subs={"UER": 4.1, "WPI": 4.0, "PPI": 4.3})
# DATA.tail()

In [6]:
EXOG

Index(['CPI_TM-1', 'CPI_TM-2', 'PPI', 'PPI_ADJ', 'PPI_HMA[7]', 'UER', 'WPI',
       'WPI_ADJ', 'WPI_HMA[7]', 'const', 'covid', 'ΔPPI_ADJ', 'ΔUER',
       'ΔWPI_ADJ'],
      dtype='object')

In [7]:
ENDOG

Period('2025Q3', 'Q-DEC')

## Build a simple multiple regression model

In [8]:
forecasts = {}


def run_model(data, exog: list[str], title: str, endog: pd.Series | None):
    """Run the model for the given data and exogenous variables."""

    # Fit the model
    y = data["CPI_TM"].dropna()
    X = data.loc[y.index, exog]
    Xnew = data.loc[data.index.difference(X.index), exog]
    print(X.tail())
    model = sm.OLS(y, X)
    fit = model.fit()
    print(fit.summary())

    # QQ plot
    fig = sm.qqplot(fit.resid, line="s")
    name = title.split(" vs ")[1].rsplit(" ", 1)[0]
    finalise_plot(
        fig.axes[0],
        title=f"QQ Plot: {name}",
        xlabel="Theoretical Quantiles",
        ylabel="Sample Quantiles",
        show=SHOW,
    )

    # can we forecast?
    can_forecast = endog is not None and len(Xnew) > 0
    if can_forecast:
        print("Forecasting using:")
        display(Xnew)
        ynewpred = fit.predict(Xnew)  # predict out of sample
        display(ynewpred)
        forecasts[title] = ynewpred.iloc[0]
        ynewpred[y.index[-1]] = y.iloc[-1]  # start from actual
        quarterly = ynewpred.iloc[1]
        annual = (
            (((pd.concat([y.iloc[-3:-1], ynewpred]) / 100) + 1).cumprod() - 1) * 100
        ).iloc[-1]
        projection = (
            f"Projection {ynewpred.index[-1]}: A:{annual:.2f}% Q:{quarterly:.2f}%"
        )
    projection = projection if can_forecast else ""

    # plot avtual vs predicted
    mlist = [f"({x}*{round(y, 3)})" for x, y in fit.params.items()]
    m = "ypred = " + (" + ").join(mlist)
    print("Model: ", m)
    ypred = fit.predict(X)
    model_frame = pd.DataFrame({"Actual": y, "Within sample predicted": ypred})
    if can_forecast:
        model_frame = model_frame.reindex(model_frame.index.union(ynewpred.index))
        model_frame["Out of sample forecast"] = ynewpred
    line_plot_finalise(
        model_frame,
        title=title,
        ylabel="Inflation % per quarter",
        color=("cornflowerblue", "darkorange", "darkred"),
        width=(1.5, 2, 3),
        axhspan=QUARTERLY_CPI_RANGE,
        lfooter=m if len(m) < 80 else "",
        y0=True,
        legend=True,
        show=SHOW,
    )

In [9]:
# Simple everything model
title_ = "Trimmed Mean Inflation vs All Exogenous Model"
run_model(DATA, EXOG.to_list(), title_, ENDOG)

        CPI_TM-1  CPI_TM-2  PPI  PPI_ADJ  PPI_HMA[7]       UER  WPI  WPI_ADJ  \
2024Q2       1.0       0.9  4.8      4.8    4.406434  4.042098  4.1      4.1   
2024Q3       0.8       1.0  3.9      3.9    4.134965  4.130623  3.5      3.5   
2024Q4       0.8       0.8  3.7      3.7    3.735245  4.004144  3.2      3.2   
2025Q1       0.5       0.8  3.7      3.7    3.601089  4.080584  3.4      3.4   
2025Q2       0.7       0.5  3.4      3.4    3.737976  4.163652  3.4      3.4   

        WPI_HMA[7]  const  covid  ΔPPI_ADJ  ΔUER  ΔWPI_ADJ  
2024Q2    3.894406    1.0    0.0       0.5   0.5       0.5  
2024Q3    3.576364    1.0    0.0      -0.9  -0.9      -0.9  
2024Q4    3.317483    1.0    0.0      -0.2  -0.2      -0.2  
2025Q1    3.332884    1.0    0.0       0.0   0.0       0.0  
2025Q2    3.552066    1.0    0.0      -0.3  -0.3      -0.3  
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0

Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI,PPI_ADJ,PPI_HMA[7],UER,WPI,WPI_ADJ,WPI_HMA[7],const,covid,ΔPPI_ADJ,ΔUER,ΔWPI_ADJ
2025Q3,0.6,0.7,4.3,4.3,3.969041,4.1,4.0,4.0,3.828788,1.0,0.0,0.9,0.9,0.9


2025Q3    0.794373
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.405) + (CPI_TM-2*0.239) + (PPI*-0.05) + (PPI_ADJ*-0.05) + (PPI_HMA[7]*0.132) + (UER*-0.035) + (WPI*0.117) + (WPI_ADJ*0.117) + (WPI_HMA[7]*-0.238) + (const*0.364) + (covid*-0.03) + (ΔPPI_ADJ*0.018) + (ΔUER*0.018) + (ΔWPI_ADJ*0.018)


In [10]:
# REMOVE some cooefficients that are not significantly different from zero
exog_1 = EXOG.difference(
    [
        "WPI",
        "WPI_ADJ",
        "WPI_HMA[7]",
        "PPI",
        "PPI_ADJ",
        "UER",
        "ΔPPI_ADJ",
        "ΔUER",
        "ΔWPI_ADJ",
    ]
).to_list()
title_ = "Trimmed Mean Inflation vs AR2 Model Predicted"
run_model(DATA, exog_1, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  PPI_HMA[7]  const  covid
2024Q2       1.0       0.9    4.406434    1.0    0.0
2024Q3       0.8       1.0    4.134965    1.0    0.0
2024Q4       0.8       0.8    3.735245    1.0    0.0
2025Q1       0.5       0.8    3.601089    1.0    0.0
2025Q2       0.7       0.5    3.737976    1.0    0.0
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.621
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     40.16
Date:                Thu, 04 Sep 2025   Prob (F-statistic):           6.98e-20
Time:                        08:19:03   Log-Likelihood:                 26.975
No. Observations:                 103   AIC:                            -43.95
Df Residuals:                      98   BIC:                            -30.78
Df Model:                           4             

Forecasting using:


Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI_HMA[7],const,covid
2025Q3,0.6,0.7,3.969041,1.0,0.0


2025Q3    0.720533
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.436) + (CPI_TM-2*0.191) + (PPI_HMA[7]*0.048) + (const*0.136) + (covid*-0.04)


In [11]:
exog_ = ["CPI_TM-1", "UER", "covid", "const"]
title_ = "Trimmed Mean Inflation vs AR1 Model Predicted"
run_model(DATA, exog_, title_, ENDOG)

        CPI_TM-1       UER  covid  const
2024Q2       1.0  4.042098    0.0    1.0
2024Q3       0.8  4.130623    0.0    1.0
2024Q4       0.8  4.004144    0.0    1.0
2025Q1       0.5  4.080584    0.0    1.0
2025Q2       0.7  4.163652    0.0    1.0
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.594
Model:                            OLS   Adj. R-squared:                  0.581
Method:                 Least Squares   F-statistic:                     48.20
Date:                Thu, 04 Sep 2025   Prob (F-statistic):           2.74e-19
Time:                        08:19:03   Log-Likelihood:                 23.361
No. Observations:                 103   AIC:                            -38.72
Df Residuals:                      99   BIC:                            -28.18
Df Model:                           3                                         
Covariance Type:            nonrobust      

Forecasting using:


Unnamed: 0,CPI_TM-1,UER,covid,const
2025Q3,0.6,4.1,0.0,1.0


2025Q3    0.722406
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.614) + (UER*-0.074) + (covid*-0.083) + (const*0.656)


In [12]:
# Multi-model
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "UER", "WPI_HMA[7]", "PPI_HMA[7]",  "const"]
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "PPI_HMA[7]"]  # <-- the best model (no employment)
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "WPI_HMA[7]"]
exog_ = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title_ = "Trimmed Mean Inflation vs Multi Model Predicted"
run_model(DATA, exog_, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  ΔUER  PPI_HMA[7]  WPI_HMA[7]  covid
2024Q2       1.0       0.9   0.5    4.406434    3.894406    0.0
2024Q3       0.8       1.0  -0.9    4.134965    3.576364    0.0
2024Q4       0.8       0.8  -0.2    3.735245    3.317483    0.0
2025Q1       0.5       0.8   0.0    3.601089    3.332884    0.0
2025Q2       0.7       0.5  -0.3    3.737976    3.552066    0.0
                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.938
Model:                            OLS   Adj. R-squared (uncentered):              0.934
Method:                 Least Squares   F-statistic:                              242.6
Date:                Thu, 04 Sep 2025   Prob (F-statistic):                    4.34e-56
Time:                        08:19:03   Log-Likelihood:                          27.119
No. Observations:                 103   AIC:                                     -42.24


Forecasting using:


Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2025Q3,0.6,0.7,0.9,3.969041,3.828788,0.0


2025Q3    0.757313
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.453) + (CPI_TM-2*0.236) + (ΔUER*0.045) + (PPI_HMA[7]*0.036) + (WPI_HMA[7]*0.036) + (covid*0.03)


In [13]:
# mean of all models
pd.Series(forecasts).mean()

np.float64(0.7486565158872244)

## Backtest

In [14]:
BACK_DATA = DATA.iloc[:-1]
BACK_INDEX = BACK_DATA.index[-1]
ACTUAL = BACK_DATA.iloc[-1, 0]
BACK_DATA.iloc[-1, 0] = None
BACK_DATA.tail(), BACK_INDEX

(        CPI_TM  PPI  WPI       UER  PPI_ADJ  WPI_ADJ  WPI_HMA[7]  PPI_HMA[7]  \
 2024Q2     0.8  4.8  4.1  4.042098      4.8      4.1    3.894406    4.406434   
 2024Q3     0.8  3.9  3.5  4.130623      3.9      3.5    3.576364    4.134965   
 2024Q4     0.5  3.7  3.2  4.004144      3.7      3.2    3.317483    3.735245   
 2025Q1     0.7  3.7  3.4  4.080584      3.7      3.4    3.332884    3.601089   
 2025Q2     NaN  3.4  3.4  4.163652      3.4      3.4    3.552066    3.737976   
 
         ΔPPI_ADJ  ΔWPI_ADJ  ΔUER  CPI_TM-1  CPI_TM-2  const  covid  
 2024Q2       0.5       0.5   0.5       1.0       0.9    1.0    0.0  
 2024Q3      -0.9      -0.9  -0.9       0.8       1.0    1.0    0.0  
 2024Q4      -0.2      -0.2  -0.2       0.8       0.8    1.0    0.0  
 2025Q1       0.0       0.0   0.0       0.5       0.8    1.0    0.0  
 2025Q2      -0.3      -0.3  -0.3       0.7       0.5    1.0    0.0  ,
 Period('2025Q2', 'Q-DEC'))

In [15]:
exog_ = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title_ = "BT: Trimmed Mean Inflation vs Multi Model Predicted"
run_model(BACK_DATA, exog_, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  ΔUER  PPI_HMA[7]  WPI_HMA[7]  covid
2024Q1       0.9       1.2   0.2    4.393986    4.123357    0.0
2024Q2       1.0       0.9   0.5    4.406434    3.894406    0.0
2024Q3       0.8       1.0  -0.9    4.134965    3.576364    0.0
2024Q4       0.8       0.8  -0.2    3.735245    3.317483    0.0
2025Q1       0.5       0.8   0.0    3.601089    3.332884    0.0
                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.937
Model:                            OLS   Adj. R-squared (uncentered):              0.933
Method:                 Least Squares   F-statistic:                              239.0
Date:                Thu, 04 Sep 2025   Prob (F-statistic):                    2.09e-55
Time:                        08:19:04   Log-Likelihood:                          26.462
No. Observations:                 102   AIC:                                     -40.92


Forecasting using:


Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2025Q2,0.7,0.5,-0.3,3.737976,3.552066,0.0


2025Q2    0.687047
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.454) + (CPI_TM-2*0.23) + (ΔUER*0.044) + (PPI_HMA[7]*0.037) + (WPI_HMA[7]*0.037) + (covid*0.03)


## Finished

In [16]:
# watermark
%load_ext watermark
%watermark -u -t -d --iversions --watermark --machine --python --conda

Last updated: 2025-09-04 08:19:04

Python implementation: CPython
Python version       : 3.13.6
IPython version      : 9.4.0

conda environment: n/a

Compiler    : Clang 20.1.4 
OS          : Darwin
Release     : 24.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

mgplot     : 0.2.12
statsmodels: 0.14.5
pandas     : 2.3.1
matplotlib : 3.10.5
IPython    : 9.4.0
readabs    : 0.1.4

Watermark: 2.5.0



In [17]:
print("Finished")

Finished
