## Python set-up

In [1]:
# system imports
from math import isfinite
from functools import cache

# analytic imports
import matplotlib.pyplot as plt
import pandas as pd
import readabs as ra
from IPython.display import display
import statsmodels.api as sm  # type: ignore

In [2]:
# local imports
from abs_helper import QUARTERLY_CPI_RANGE
from plotting import line_plot, finalise_plot, set_chart_dir, clear_chart_dir
from henderson import hma

In [3]:
# pandas display settings
pd.options.display.max_rows = 999999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 100

# save charts in this notebook
plt.style.use("fivethirtyeight")
CHART_DIR = "./CHARTS/Inflation-Model/"
set_chart_dir(CHART_DIR)
clear_chart_dir(CHART_DIR)

# display charts in this notebook
SHOW = False

## Data capture

 ### Identify the data we want to use

In [4]:
@cache
def get_data() -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get a dictionary of data items from the ABS."""

    wanted = {
        # "Series ID": ["Category ID", "single-excel-only table name", "Short Series Title"],
        "A3604510W": ["6401.0", "640106", "CPI_TM"],  # change from previous quarter
        "A2314867K": ["6427.0", "642701", "PPI"],  # change from previous year
        "A83895396W": ["6345.0", "634501", "WPI"],  # change from previous year
        "A84423050A": ["6202.0", "6202001", "UER"],  # percent of labour force
    }

    data, meta = {}, {}
    for series_id, (category_id, seo, title) in wanted.items():
        d, m = ra.read_abs_series(category_id, series_id, single_excel_only=seo)
        data[title] = d[series_id]
        meta[title] = m.loc[series_id]
    return data, pd.DataFrame(meta).T


_, META = get_data()
# check we have the correct variables
# META

### Collect and marshall that data
Assume we are only doind a one period prediction

In [5]:
def collect_data(subs: dict | None = None) -> tuple[pd.DataFrame, pd.Index, pd.Series]:
    """Collect the data for the inflation model.
    Arguments:
        subs: Dictionary of substitutions for forward values
    Returns:
        data: DataFrame containing the data
        exogenous: Index of exogenous variable names
        endogenous: Series of endogenous variables (ie CPI_TM)"""

    data_dict, _meta = get_data()
    data_dict["UER"] = ra.monthly_to_qtly(data_dict["UER"])
    data = pd.concat(data_dict, axis=1)

    # forward for one period prediction
    last = data.loc[data.index[-1], "CPI_TM"]
    if isfinite(last):
        next_period = data.index[-1] + 1
        new_index = data.index.append(pd.PeriodIndex([next_period]))
        data = data.reindex(new_index)

    # populate forward exogenous guesses into the model
    last = data.index[-1]
    if subs is not None:
        for k, v in subs.items():
            if subs == "CPI_TM" or isfinite(data.loc[last, k]):
                continue
            data.loc[last, k] = v

    # Adjust missing values in PPI and WPI because they are published after CPI
    # so we may need rolling average approximations to plug in
    for s in ("PPI", "WPI"):
        _ = "Do noting if next lines commented out ..."
        data[f"{s}_ADJ"] = data[s].where(
            data[s].notna(), other=data[s].rolling(4).mean().ffill()
        )
        # data = data.drop(columns=s)

    # smooth WPI/PPI
    h = 7
    for s in ("WPI", "PPI"):
        _ = "Do noting if next lines commented ..."
        data[f"{s}_HMA[{h}]"] = hma(data[f"{s}_ADJ"].dropna(), h)

    # provide differences
    for col in ("PPI_ADJ", "WPI_ADJ", "UER"):
        data[f"Δ{col}"] = data[f"{s}_ADJ"].diff(1)

    # add autoregression
    for ar in (1, 2):
        data[f"CPI_TM-{ar}"] = data["CPI_TM"].shift(ar)

    # add the constant
    data["const"] = 1.0

    # Add a COVID dummy
    data["covid"] = 0
    mask = (data.index >= "2020Q1") & (data.index <= "2021Q1")
    data.loc[mask, "covid"] = 1

    # remove early nans
    remember = data.iloc[-1]
    data = pd.concat([data.dropna(), remember.to_frame().T])

    # exogenous and out-of-sample endogenous where known variables
    exogenous = data.columns.difference(["CPI_TM"])
    endogenous = data.index[-1]
    return data, exogenous, endogenous


DATA, EXOG, ENDOG = collect_data(subs={"UER": 4.1, "WPI": 4.0, "PPI": 4.3})
# DATA.tail()

In [6]:
EXOG

Index(['CPI_TM-1', 'CPI_TM-2', 'PPI', 'PPI_ADJ', 'PPI_HMA[7]', 'UER', 'WPI',
       'WPI_ADJ', 'WPI_HMA[7]', 'const', 'covid', 'ΔPPI_ADJ', 'ΔUER',
       'ΔWPI_ADJ'],
      dtype='object')

In [7]:
ENDOG

Period('2024Q4', 'Q-DEC')

## Build a simple multiple regression model

In [8]:
forecasts = {}


def run_model(data, exog: list[str], title: str, endog: pd.Series | None):
    """Run the model for the given data and exogenous variables."""

    # Fit the model
    y = data["CPI_TM"].dropna()
    X = data.loc[y.index, exog]
    Xnew = data.loc[data.index.difference(X.index), exog]
    print(X.tail())
    model = sm.OLS(y, X)
    fit = model.fit()
    print(fit.summary())

    # QQ plot
    fig = sm.qqplot(fit.resid, line="s")
    name = title.split(" vs ")[1].rsplit(" ", 1)[0]
    finalise_plot(
        fig.axes[0],
        title=f"QQ Plot: {name}",
        xlabel="Theoretical Quantiles",
        ylabel="Sample Quantiles",
        show=SHOW,
    )

    # can we forecast?
    can_forecast = endog is not None and len(Xnew) > 0
    if can_forecast:
        print("Forecasting using:")
        display(Xnew)
        ynewpred = fit.predict(Xnew)  # predict out of sample
        display(ynewpred)
        forecasts[title] = ynewpred.iloc[0]
        ynewpred[y.index[-1]] = y.iloc[-1]  # start from actual
        quarterly = ynewpred.iloc[1]
        annual = (
            (((pd.concat([y.iloc[-3:-1], ynewpred]) / 100) + 1).cumprod() - 1) * 100
        ).iloc[-1]
        projection = (
            f"Projection {ynewpred.index[-1]}: A:{annual:.2f}% Q:{quarterly:.2f}%"
        )
    projection = projection if can_forecast else ""

    # plot avtual vs predicted
    mlist = [f"({x}*{round(y, 3)})" for x, y in fit.params.items()]
    m = "ypred = " + (" + ").join(mlist)
    print("Model: ", m)
    ypred = fit.predict(X)
    model_frame = pd.DataFrame({"Actual": y, "Within sample predicted": ypred})
    if can_forecast:
        model_frame = model_frame.reindex(model_frame.index.union(ynewpred.index))
        model_frame["Out of sample forecast"] = ynewpred
    line_plot(
        model_frame,
        title=title,
        ylabel="Inflation % per quarter",
        color=("cornflowerblue", "darkorange", "darkred"),
        width=(1.5, 2, 3),
        axhspan=QUARTERLY_CPI_RANGE,
        lfooter=m if len(m) < 80 else "",
        y0=True,
        show=SHOW,
    )

In [9]:
# Simple everything model
title_ = "Trimmed Mean Inflation vs All Exogenous Model"
run_model(DATA, EXOG.to_list(), title_, ENDOG)

        CPI_TM-1  CPI_TM-2  PPI  PPI_ADJ  PPI_HMA[7]       UER  WPI  WPI_ADJ  \
2023Q3       0.9       1.2  3.8      3.8    3.835245  3.669866  4.0      4.0   
2023Q4       1.2       0.9  4.1      4.1    4.064755  3.888203  4.3      4.3   
2024Q1       0.8       1.2  4.3      4.3    4.358741  3.896687  4.1      4.1   
2024Q2       1.0       0.8  4.8      4.8    4.398815  4.052892  4.1      4.1   
2024Q3       0.9       1.0  3.9      3.9    4.293566  4.139522  3.5      3.5   

        WPI_HMA[7]  const  covid  ΔPPI_ADJ  ΔUER  ΔWPI_ADJ  
2023Q3    3.994126    1.0    0.0      -0.1  -0.1      -0.1  
2023Q4    4.182517    1.0    0.0       0.3   0.3       0.3  
2024Q1    4.152867    1.0    0.0       0.2   0.2       0.2  
2024Q2    3.954891    1.0    0.0       0.5   0.5       0.5  
2024Q3    3.817864    1.0    0.0      -0.9  -0.9      -0.9  
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0

Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI,PPI_ADJ,PPI_HMA[7],UER,WPI,WPI_ADJ,WPI_HMA[7],const,covid,ΔPPI_ADJ,ΔUER,ΔWPI_ADJ
2024Q4,0.8,0.9,4.3,4.3,4.166687,4.1,4.0,4.0,3.778575,1.0,0.0,0.4,0.4,0.4


2024Q4    0.922074
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.403) + (CPI_TM-2*0.229) + (PPI*-0.047) + (PPI_ADJ*-0.047) + (PPI_HMA[7]*0.129) + (UER*-0.045) + (WPI*0.056) + (WPI_ADJ*0.056) + (WPI_HMA[7]*-0.119) + (const*0.432) + (covid*-0.021) + (ΔPPI_ADJ*0.016) + (ΔUER*0.016) + (ΔWPI_ADJ*0.016)


In [10]:
# REMOVE some cooefficients that are not significantly different from zero
exog_1 = EXOG.difference(
    [
        "WPI",
        "WPI_ADJ",
        "WPI_HMA[7]",
        "PPI",
        "PPI_ADJ",
        "UER",
        "ΔPPI_ADJ",
        "ΔUER",
        "ΔWPI_ADJ",
    ]
).to_list()
title_ = "Trimmed Mean Inflation vs AR2 Model Predicted"
run_model(DATA, exog_1, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  PPI_HMA[7]  const  covid
2023Q3       0.9       1.2    3.835245    1.0    0.0
2023Q4       1.2       0.9    4.064755    1.0    0.0
2024Q1       0.8       1.2    4.358741    1.0    0.0
2024Q2       1.0       0.8    4.398815    1.0    0.0
2024Q3       0.9       1.0    4.293566    1.0    0.0
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.626
Method:                 Least Squares   F-statistic:                     42.41
Date:                Sat, 28 Dec 2024   Prob (F-statistic):           2.31e-20
Time:                        06:40:46   Log-Likelihood:                 28.732
No. Observations:                 100   AIC:                            -47.46
Df Residuals:                      95   BIC:                            -34.44
Df Model:                           4             

Unnamed: 0,CPI_TM-1,CPI_TM-2,PPI_HMA[7],const,covid
2024Q4,0.8,0.9,4.166687,1.0,0.0


2024Q4    0.862637
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.442) + (CPI_TM-2*0.189) + (PPI_HMA[7]*0.049) + (const*0.133) + (covid*-0.032)


In [11]:
exog_ = ["CPI_TM-1", "UER", "covid", "const"]
title_ = "Trimmed Mean Inflation vs AR1 Model Predicted"
run_model(DATA, exog_, title_, ENDOG)

        CPI_TM-1       UER  covid  const
2023Q3       0.9  3.669866    0.0    1.0
2023Q4       1.2  3.888203    0.0    1.0
2024Q1       0.8  3.896687    0.0    1.0
2024Q2       1.0  4.052892    0.0    1.0
2024Q3       0.9  4.139522    0.0    1.0
                            OLS Regression Results                            
Dep. Variable:                 CPI_TM   R-squared:                       0.618
Model:                            OLS   Adj. R-squared:                  0.606
Method:                 Least Squares   F-statistic:                     51.82
Date:                Sat, 28 Dec 2024   Prob (F-statistic):           5.25e-20
Time:                        06:40:46   Log-Likelihood:                 25.657
No. Observations:                 100   AIC:                            -43.31
Df Residuals:                      96   BIC:                            -32.89
Df Model:                           3                                         
Covariance Type:            nonrobust      

Unnamed: 0,CPI_TM-1,UER,covid,const
2024Q4,0.8,4.1,0.0,1.0


2024Q4    0.860389
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.615) + (UER*-0.082) + (covid*-0.071) + (const*0.705)


In [12]:
# Multi-model
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "UER", "WPI_HMA[7]", "PPI_HMA[7]",  "const"]
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "PPI_HMA[7]"]  # <-- the best model (no employment)
# exog_ = ['CPI_TM-1', 'CPI_TM-2', "WPI_HMA[7]"]
exog_ = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title_ = "Trimmed Mean Inflation vs Multi Model Predicted"
run_model(DATA, exog_, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  ΔUER  PPI_HMA[7]  WPI_HMA[7]  covid
2023Q3       0.9       1.2  -0.1    3.835245    3.994126    0.0
2023Q4       1.2       0.9   0.3    4.064755    4.182517    0.0
2024Q1       0.8       1.2   0.2    4.358741    4.152867    0.0
2024Q2       1.0       0.8   0.5    4.398815    3.954891    0.0
2024Q3       0.9       1.0  -0.9    4.293566    3.817864    0.0
                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.941
Model:                            OLS   Adj. R-squared (uncentered):              0.937
Method:                 Least Squares   F-statistic:                              248.6
Date:                Sat, 28 Dec 2024   Prob (F-statistic):                    2.22e-55
Time:                        06:40:46   Log-Likelihood:                          28.521
No. Observations:                 100   AIC:                                     -45.04


Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2024Q4,0.8,0.9,0.4,4.166687,3.778575,0.0


2024Q4    0.882563
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.462) + (CPI_TM-2*0.234) + (ΔUER*0.04) + (PPI_HMA[7]*0.038) + (WPI_HMA[7]*0.034) + (covid*0.036)


In [13]:
# mean of all models
pd.Series(forecasts).mean()

0.8819156613547314

## Backtest

In [14]:
BACK_DATA = DATA.iloc[:-1]
BACK_INDEX = BACK_DATA.index[-1]
ACTUAL = BACK_DATA.iloc[-1, 0]
BACK_DATA.iloc[-1, 0] = None
BACK_DATA.tail(), BACK_INDEX

(        CPI_TM  PPI  WPI       UER  PPI_ADJ  WPI_ADJ  WPI_HMA[7]  PPI_HMA[7]  \
 2023Q3     1.2  3.8  4.0  3.669866      3.8      4.0    3.994126    3.835245   
 2023Q4     0.8  4.1  4.3  3.888203      4.1      4.3    4.182517    4.064755   
 2024Q1     1.0  4.3  4.1  3.896687      4.3      4.1    4.152867    4.358741   
 2024Q2     0.9  4.8  4.1  4.052892      4.8      4.1    3.954891    4.398815   
 2024Q3     NaN  3.9  3.5  4.139522      3.9      3.5    3.817864    4.293566   
 
         ΔPPI_ADJ  ΔWPI_ADJ  ΔUER  CPI_TM-1  CPI_TM-2  const  covid  
 2023Q3      -0.1      -0.1  -0.1       0.9       1.2    1.0    0.0  
 2023Q4       0.3       0.3   0.3       1.2       0.9    1.0    0.0  
 2024Q1       0.2       0.2   0.2       0.8       1.2    1.0    0.0  
 2024Q2       0.5       0.5   0.5       1.0       0.8    1.0    0.0  
 2024Q3      -0.9      -0.9  -0.9       0.9       1.0    1.0    0.0  ,
 Period('2024Q3', 'Q-DEC'))

In [15]:
exog_ = ["CPI_TM-1", "CPI_TM-2", "ΔUER", "PPI_HMA[7]", "WPI_HMA[7]", "covid"]
title_ = "BT: Trimmed Mean Inflation vs Multi Model Predicted"
run_model(BACK_DATA, exog_, title_, ENDOG)

        CPI_TM-1  CPI_TM-2  ΔUER  PPI_HMA[7]  WPI_HMA[7]  covid
2023Q2       1.2       1.7  -1.0    4.117343    3.770350    0.0
2023Q3       0.9       1.2  -0.1    3.835245    3.994126    0.0
2023Q4       1.2       0.9   0.3    4.064755    4.182517    0.0
2024Q1       0.8       1.2   0.2    4.358741    4.152867    0.0
2024Q2       1.0       0.8   0.5    4.398815    3.954891    0.0
                                 OLS Regression Results                                
Dep. Variable:                 CPI_TM   R-squared (uncentered):                   0.940
Model:                            OLS   Adj. R-squared (uncentered):              0.936
Method:                 Least Squares   F-statistic:                              243.9
Date:                Sat, 28 Dec 2024   Prob (F-statistic):                    1.29e-54
Time:                        06:40:47   Log-Likelihood:                          27.914
No. Observations:                  99   AIC:                                     -43.83


Unnamed: 0,CPI_TM-1,CPI_TM-2,ΔUER,PPI_HMA[7],WPI_HMA[7],covid
2024Q3,0.9,1.0,-0.9,4.293566,3.817864,0.0


2024Q3    0.910566
Freq: Q-DEC, dtype: float64

Model:  ypred = (CPI_TM-1*0.459) + (CPI_TM-2*0.236) + (ΔUER*0.039) + (PPI_HMA[7]*0.039) + (WPI_HMA[7]*0.034) + (covid*0.036)


## Finished

In [16]:
# watermark
%load_ext watermark
%watermark -u -n -t -v -iv -w

Last updated: Sat Dec 28 2024 06:40:47

Python implementation: CPython
Python version       : 3.12.8
IPython version      : 8.31.0

IPython    : 8.31.0
pandas     : 2.2.3
statsmodels: 0.14.4
readabs    : 0.0.17
matplotlib : 3.10.0

Watermark: 2.5.0



In [17]:
print("Finished")

Finished
