# ABS Inflation model

Aim: to forecasr the next trimmed mean (TM) print based on ...
- upstream inflation PPI
- prevaiing cost pressures WPI
- the Phillips curve (inflation and unemployment have an inverse relationship) UER)
- previous quarter TM, as the series is autocorrelated. 

## Python set-up

In [1]:
# system imports
from math import isfinite

# analytic imports
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import readabs as ra

In [2]:
# local imports
from plotting import line_plot, finalise_plot, set_chart_dir, clear_chart_dir
from henderson import hma

In [3]:
# pandas display settings
pd.options.display.max_rows = 999999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 100

# save charts in this notebook
plt.style.use("fivethirtyeight")
CHART_DIR = "./CHARTS/Inflation-Model/"
set_chart_dir(CHART_DIR)
clear_chart_dir(CHART_DIR)

# Uaeful markers
QUARTERLY_RANGE = {
    "axhspan": {
        "ymin": (pow(1.02, 0.25) - 1) * 100,
        "ymax": (pow(1.03, 0.25) - 1) * 100,
        "color": "#ffdddd",
        "label": "Quarterly growth consistent with 2-3% annual inflation target",
        "zorder": -1,
    }
}

# display charts in this notebook
SHOW = False

## Data capture

 ### Identify the data we want to use

In [4]:
def get_data() -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get a dictionary of data items from the ABS."""

    wanted = {
        # "Series ID": ["Category ID", "single-excel-only table name", "Short Series Title"],
        "A3604510W": ["6401.0", "640106", "CPI_TM"],  # change from previous quarter
        "A2314867K": ["6427.0", "642701", "PPI"],  # change from previous year
        "A83895396W": ["6345.0", "634501", "WPI"],  # change from previous year
        "A84423050A": ["6202.0", "6202001", "UER"],  # percent of labour force
    }

    data, meta = {}, {}
    for series_id, (category_id, seo, title) in wanted.items():
        d, m = ra.read_abs_series(category_id, series_id, single_excel_only=seo)
        data[title] = d[series_id]
        meta[title] = m.loc[series_id]
    return data, pd.DataFrame(meta).T


_, META = get_data()
# check we have the correct variables
META

Unnamed: 0,index,Data Item Description,Series Type,Series ID,Series Start,Series End,No. Obs.,Unit,Data Type,Freq.,Collection Month,Table,Table Description,Catalogue number
CPI_TM,30,Percentage Change from Previous Period ; Trimmed Mean ; Australia ;,Seasonally Adjusted,A3604510W,1982-06-01 00:00:00,2024-03-01 00:00:00,168.0,Percent,PERCENT,Quarter,3.0,640106,"CPI: Analytical Series, Weighted Average of Eight Capital Cities",6401.0
PPI,3,Percentage change from corresponding quarter of previous year ; Final ; Total (Source) ;,Original,A2314867K,1999-09-01 00:00:00,2024-03-01 00:00:00,99.0,Percent,PERCENT,Quarter,3.0,642701,"Final demand, Index Numbers and Percentage Changes.",6427.0
WPI,24,Percentage Change From Corresponding Quarter of Previous Year ; Australia ; Total hourly rates...,Seasonally Adjusted,A83895396W,1997-09-01 00:00:00,2024-03-01 00:00:00,107.0,Percent,PERCENT,Quarter,3.0,634501,"Total Hourly Rates of Pay Excluding Bonuses: Sector, Original, Seasonally Adjusted and Trend",6345.0
UER,65,Unemployment rate ; Persons ;,Seasonally Adjusted,A84423050A,1978-02-01 00:00:00,2024-05-01 00:00:00,556.0,Percent,PERCENT,Month,1.0,6202001,"Labour force status by Sex, Australia - Trend, Seasonally adjusted and Original",6202.0


### Collect and marshall that data
Assume we are only doind a one period prediction

In [5]:
def collect_data(subs: dict | None = None) -> tuple[pd.DataFrame, pd.Index, pd.Series]:
    """Collect the data for the inflation model.
    Arguments:
        subs: Dictionary of substitutions for forward values
    Returns:
        data: DataFrame containing the data
        exogenous: Index of exogenous variable names
        endogenous: Series of endogenous variables (ie CPI_TM)"""

    data_dict, meta = get_data()
    data_dict["UER"] = ra.monthly_to_qtly(data_dict["UER"])
    data = pd.concat(data_dict, axis=1)

    # forward for one period prediction
    last = data.loc[data.index[-1], "CPI_TM"]
    if isfinite(last):
        next = data.index[-1] + 1
        new_index = data.index.append(pd.PeriodIndex([next]))
        data = data.reindex(new_index)

    # populate forward exogenous guesses into the model
    last = data.index[-1]
    if subs is not None:
        for k, v in subs.items():
            if subs == "CPI_TM" or isfinite(data.loc[last, k]):
                continue
            data.loc[last, k] = v

    # Adjust missing values in PPI and WPI because they are published after CPI
    # so we may need rolling average approximations to plug in
    for s in ("PPI", "WPI"):
        "Do noting if next lines commented out ..."
        data[f"{s}_ADJ"] = data[s].where(
            data[s].notna(), other=data[s].rolling(4).mean().ffill()
        )
        # data = data.drop(columns=s)

    # smooth WPI/PPI
    h = 7
    for s in ("WPI", "PPI"):
        "Do noting if next lines commented ..."
        data[f"{s}_HMA[{h}]"] = hma(data[f"{s}_ADJ"].dropna(), h)

    # provide differences
    for col in ("PPI_ADJ", "WPI_ADJ", "UER"):
        data[f"Δ{col}"] = data[f"{s}_ADJ"].diff(1)

    # add autoregression
    for ar in (1, 2):
        data[f"CPI_TM-{ar}"] = data["CPI_TM"].shift(ar)

    # add the constant
    data["const"] = 1.0

    # Add a COVID dummy
    data["covid"] = 0
    data.loc["2022Q1":"2023Q1", "covid"] = 1

    # remove early nans
    remember = data.iloc[-1]
    data = pd.concat([data.dropna(), remember.to_frame().T])

    # exogenous and out-of-sample endogenous where known variables
    exogenous = data.columns.difference(["CPI_TM"])
    endogenous = data.index[-1]
    return data, exogenous, endogenous


DATA, EXOG, ENDOG = collect_data(subs={"UER": 4.1, "WPI": 4.0, "PPI": 4.3})
DATA.tail()

Unnamed: 0,CPI_TM,PPI,WPI,UER,PPI_ADJ,WPI_ADJ,WPI_HMA[7],PPI_HMA[7],ΔPPI_ADJ,ΔWPI_ADJ,ΔUER,CPI_TM-1,CPI_TM-2,const,covid
2023Q2,0.9,3.9,3.7,3.601709,3.9,3.7,3.77049,4.117343,-1.0,-1.0,-1.0,1.2,1.7,1.0,0.0
2023Q3,1.2,3.8,4.0,3.684667,3.8,4.0,3.994126,3.864615,-0.1,-0.1,-0.1,0.9,1.2,1.0,0.0
2023Q4,0.8,4.1,4.2,3.884824,4.1,4.2,4.102564,4.025216,0.3,0.3,0.3,1.2,0.9,1.0,0.0
2024Q1,1.0,4.3,4.1,3.888157,4.3,4.1,4.11381,4.228304,0.2,0.2,0.2,0.8,1.2,1.0,0.0
2024Q2,,4.3,4.0,4.1,4.3,4.0,4.057309,4.365473,0.0,0.0,0.0,1.0,0.8,1.0,0.0


In [6]:
EXOG

Index(['CPI_TM-1', 'CPI_TM-2', 'PPI', 'PPI_ADJ', 'PPI_HMA[7]', 'UER', 'WPI',
       'WPI_ADJ', 'WPI_HMA[7]', 'const', 'covid', 'ΔPPI_ADJ', 'ΔUER',
       'ΔWPI_ADJ'],
      dtype='object')

In [7]:
ENDOG

Period('2024Q2', 'Q-DEC')

## Build a simple multiple regression model

In [8]:
forecasts = {}


def run_model(data, exog: list[str], title: str, endog: pd.Series | None):

    # Fit the model
    y = data["CPI_TM"].dropna()
    X = data.loc[y.index, exog]
    Xnew = data.loc[data.index.difference(X.index), exog]
    model = sm.OLS(y, X)
    fit = model.fit()
    print(fit.summary())

    # QQ plot
    fig = sm.qqplot(fit.resid, line="s")
    name = title.split(" vs ")[1].rsplit(" ", 1)[0]
    finalise_plot(
        fig.axes[0],
        title=f"QQ Plot: {name}",
        xlabel="Theoretical Quantiles",
        ylabel="Sample Quantiles",
        show=SHOW,
    )

    # can we forecast?
    can_forecast = endog is not None and len(Xnew) > 0
    if can_forecast:
        print("Forecasting using:")
        display(Xnew)
        ynewpred = fit.predict(Xnew)  # predict out of sample
        display(ynewpred)
        forecasts[title] = ynewpred.iloc[0]
        ynewpred[y.index[-1]] = y.iloc[-1]  # start from actual
        quarterly = ynewpred.iloc[1]
        annual = (
            (((pd.concat([y.iloc[-3:-1], ynewpred]) / 100) + 1).cumprod() - 1) * 100
        ).iloc[-1]
        projection = (
            f"Projection {ynewpred.index[-1]}: A:{annual:.2f}% Q:{quarterly:.2f}%"
        )
    projection = projection if can_forecast else ""

    # plot avtual vs predicted
    m = [f"({x}*{round(y, 3)})" for x, y in fit.params.items()]
    m = "ypred = " + " + ".join(m)
    ypred = fit.predict(X)
    model_frame = pd.DataFrame({"Actual": y, "Within sample predicted": ypred})
    if can_forecast:
        model_frame = model_frame.reindex(model_frame.index.union(ynewpred.index))
        model_frame["Out of sample forecast"] = ynewpred
    line_plot(
        model_frame,
        title=title,
        ylabel="Inflation % per quarter",
        color=("cornflowerblue", "darkorange", "darkred"),
        width=(1.5, 2, 3),
        **QUARTERLY_RANGE,
        lfooter=m if len(m) < 80 else "",
        y0=True,
        show=SHOW,
    )

In [9]:
# Simple everything model
title = "Trimmed Mean Inflation vs All Exogenous Model"
run_model(DATA, EXOG.to_list(), title, ENDOG)

                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.726
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     25.94
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           3.42e-21
Time:                        21:16:31   Log-Likelihood:                 40.882
No. Observations:                  98   AIC:                            -61.76
Df Residuals:                      88   BIC:                            -35.91
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
CPI_TM-1       0.1735      0.109      1.598      0.1

Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI,PPI_ADJ,PPI_HMA[7],UER,WPI,WPI_ADJ,WPI_HMA[7],const,covid,ΔPPI_ADJ,ΔUER,ΔWPI_ADJ
2024Q2,1.0,0.8,4.3,4.3,4.365473,4.1,4.0,4.0,4.057309,1.0,0.0,0.0,0.0,0.0


2024Q2    0.871881
Freq: Q-DEC, dtype: float64

In [10]:
# REMOVE some cooefficients that are not significantly different from zero
exog = EXOG.difference(
    [
        "WPI",
        "WPI_ADJ",
        "WPI_HMA[7]",
        "PPI",
        "PPI_ADJ",
        "UER",
        "ΔPPI_ADJ",
        "ΔUER",
        "ΔWPI_ADJ",
    ]
)
title = "Trimmed Mean Inflation vs AR2 Model Predicted"
run_model(DATA, exog, title, ENDOG)

                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.691
Model:                            OLS   Adj. R-squared:                  0.678
Method:                 Least Squares   F-statistic:                     52.11
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           5.94e-23
Time:                        21:16:32   Log-Likelihood:                 35.026
No. Observations:                  98   AIC:                            -60.05
Df Residuals:                      93   BIC:                            -47.13
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
CPI_TM-1       0.2796      0.105      2.650      0.0

Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI_HMA[7],const,covid
2024Q2,1.0,0.8,4.365473,1.0,0.0


2024Q2    0.865317
Freq: Q-DEC, dtype: float64

In [11]:
exog = ["CPI_TM-1", "UER", "covid", "const"]
title = "Trimmed Mean Inflation vs AR1 Model Predicted"
run_model(DATA, exog, title, ENDOG)

                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.661
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     61.21
Date:                Tue, 02 Jul 2024   Prob (F-statistic):           5.00e-22
Time:                        21:16:32   Log-Likelihood:                 30.471
No. Observations:                  98   AIC:                            -52.94
Df Residuals:                      94   BIC:                            -42.60
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
CPI_TM-1       0.4777      0.085      5.593      0.0

Unnamed: 0,CPI_TM-1,UER,covid,const
2024Q2,1.0,4.1,0.0,1.0


2024Q2    0.905355
Freq: Q-DEC, dtype: float64

In [12]:
# Multi-model
# exog = ['CPI_TM-1', 'CPI_TM-2', "UER", "WPI_HMA[7]", "PPI_HMA[7]",  "const"]
# exog = ['CPI_TM-1', 'CPI_TM-2', "PPI_HMA[7]"]  # <-- the best model (no employment)
# exog = ['CPI_TM-1', 'CPI_TM-2', "WPI_HMA[7]"]
exog = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title = "Trimmed Mean Inflation vs Multi Model Predicted"
run_model(DATA, exog, title, ENDOG)

                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.951
Model:                            OLS   Adj. R-squared (uncentered):              0.948
Method:                 Least Squares   F-statistic:                              299.8
Date:                Tue, 02 Jul 2024   Prob (F-statistic):                    4.17e-58
Time:                        21:16:32   Log-Likelihood:                          37.944
No. Observations:                  98   AIC:                                     -63.89
Df Residuals:                      92   BIC:                                     -48.38
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2024Q2,1.0,0.8,0.0,4.365473,4.057309,0.0


2024Q2    0.872678
Freq: Q-DEC, dtype: float64

In [13]:
# mean of all models
pd.Series(forecasts).mean()

0.8788079036917654

## Backtest

In [14]:
BACK_DATA = DATA.iloc[:-1]
BACK_INDEX = BACK_DATA.index[-1]
ACTUAL = BACK_DATA.iloc[-1, 0]
BACK_DATA.iloc[-1, 0] = None
BACK_DATA.tail(), BACK_INDEX

(        CPI_TM  PPI  WPI       UER  PPI_ADJ  WPI_ADJ  WPI_HMA[7]  PPI_HMA[7]  \
 2023Q1     1.2  4.9  3.6  3.605752      4.9      3.6    3.558881    4.900000   
 2023Q2     0.9  3.9  3.7  3.601709      3.9      3.7    3.770490    4.117343   
 2023Q3     1.2  3.8  4.0  3.684667      3.8      4.0    3.994126    3.864615   
 2023Q4     0.8  4.1  4.2  3.884824      4.1      4.2    4.102564    4.025216   
 2024Q1     NaN  4.3  4.1  3.888157      4.3      4.1    4.113810    4.228304   
 
         ΔPPI_ADJ  ΔWPI_ADJ  ΔUER  CPI_TM-1  CPI_TM-2  const  covid  
 2023Q1      -0.9      -0.9  -0.9       1.7       1.8    1.0    1.0  
 2023Q2      -1.0      -1.0  -1.0       1.2       1.7    1.0    0.0  
 2023Q3      -0.1      -0.1  -0.1       0.9       1.2    1.0    0.0  
 2023Q4       0.3       0.3   0.3       1.2       0.9    1.0    0.0  
 2024Q1       0.2       0.2   0.2       0.8       1.2    1.0    0.0  ,
 Period('2024Q1', 'Q-DEC'))

In [15]:
exog = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title = "BT: Trimmed Mean Inflation vs Multi Model Predicted"
run_model(BACK_DATA, exog, title, ENDOG)

                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.951
Model:                            OLS   Adj. R-squared (uncentered):              0.947
Method:                 Least Squares   F-statistic:                              291.9
Date:                Tue, 02 Jul 2024   Prob (F-statistic):                    3.64e-57
Time:                        21:16:33   Log-Likelihood:                          37.235
No. Observations:                  97   AIC:                                     -62.47
Df Residuals:                      91   BIC:                                     -47.02
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2024Q1,0.8,1.2,0.2,4.228304,4.11381,0.0


2024Q1    0.898719
Freq: Q-DEC, dtype: float64

## Finished

In [16]:
# watermark
%load_ext watermark
%watermark -u -n -t -v -iv -w

Last updated: Tue Jul 02 2024 21:16:33

Python implementation: CPython
Python version       : 3.12.4
IPython version      : 8.26.0

readabs    : 0.0.5
pandas     : 2.2.2
matplotlib : 3.9.0
statsmodels: 0.14.2

Watermark: 2.4.3



In [17]:
print("Finished")

Finished
