# Bayesian pooled polling charts (Using Stan)

## Set-up

In [1]:
# system imports
import itertools
from typing import Any
from pathlib import Path
import json

# analytic imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# stan
import cmdstanpy
from cmdstanpy import CmdStanModel
# Report which cmdstanpy is in use, then run the CmdStan installer
# (the captured output shows it reporting an existing install).
print("cmdstanpy version: " + cmdstanpy.__version__)
cmdstanpy.install_cmdstan()

# Working directory that receives the generated Stan program and data files.
STAN_DIR = Path("./stan_work/")
STAN_DIR.mkdir(exist_ok=True, parents=True)

cmdstanpy version: 1.2.5
CmdStan install directory: /Users/bryanpalmer/.cmdstan
CmdStan version 2.36.0 already installed
Test model compilation


In [3]:
# local import
import bayes_tools
import plotting
from common import MIDDLE_DATE, VOTING_INTENTION, ensure
from data_capture import retrieve

In [4]:
# Plotting configuration
MODEL_DIR = "../model-images/"  # directory is created below if it is missing
Path(MODEL_DIR).mkdir(exist_ok=True, parents=True)

SHOW_MODEL_MAPS = False  # show model maps in notebook
SHOW = True  # show charts in the notebook

## Get data

In [5]:
# Load the captured polling tables.
# NOTE(review): `retrieve` and `ensure` come from local modules not shown here;
# `ensure` presumably raises with the given message when `data` is falsy - confirm.
data = retrieve()
ensure(data, "You must run the data capture notebook every day.")

# Select "Mean Date" by name rather than by position (previously `.iloc[:, -2]`),
# so this report stays correct if columns are ever added or reordered.
print(f"Latest poll (mean date): {data['voting-intention']['Mean Date'].max()}")
print(f"Data columns:\n{data['voting-intention'].columns}")
print(f"Poll count by pollster:\n{data['voting-intention']['Brand'].value_counts().sort_index()}")

Latest poll (mean date): 2025-01-12
Data columns:
Index(['Date', 'Brand', 'Interview mode', 'Sample size', 'Primary vote L/NP',
       'Primary vote ALP', 'Primary vote GRN', 'Primary vote ONP',
       'Primary vote UAP', 'Primary vote OTH', '2pp vote ALP', '2pp vote L/NP',
       'First Date', 'Mean Date', 'Last Date'],
      dtype='object')
Poll count by pollster:
Brand
ANU                     1
DemosAU                 1
Dynata                  1
Essential              21
Essential 2            28
Freshwater Strategy    16
Newspoll               30
Newspoll-YouGov         3
RedBridge Group         9
Resolve Strategic      17
Resolve Strategic 2    11
Roy Morgan             66
Wolf & Smith            1
YouGov                 16
Name: count, dtype: int64


## Do the Bayesian pooling ...

### The model

In [6]:
# The model: a latent daily random walk for voting intention, observed through
# polls that each carry an additive house effect (house effects sum to zero).
model_text = """
data {
    // data size
    int<lower=1> n_polls;
    int<lower=1> n_span;
    int<lower=1> n_houses;

    // initialisation data
    real<lower=-3,upper=3> y_start;

    // polling data
    array[n_polls] real<lower=-5, upper=5> y;
    array[n_polls] real<lower=0,upper=1> sampleSigma;
    array[n_polls] int<lower=1,upper=n_houses> house;
    array[n_polls] int<lower=1,upper=n_span> day;
}
parameters {
    array[n_span] real<lower=-5,upper=5> hidden_voting_intention;
    vector[n_houses-1] rawHouseEffects;
    // The declared bounds must match the support of the uniform prior below.
    // With the previous upper bound of 0.5, every default initial value fell
    // outside (0, 0.01), where the prior density is zero, so the sampler
    // could not initialise. Change both bounds together if a wider range
    // is wanted.
    real<lower=0,upper=0.01> daily_sigma;
}
transformed parameters {
    // the last house effect is minus the sum of the others (sum-to-zero)
    vector[n_houses] houseEffect ;
    houseEffect = append_row(rawHouseEffects, -sum(rawHouseEffects));
}
model{
    // -- house effects model
    rawHouseEffects ~ normal(0, 1); // weakly informative

    // -- temporal model
    daily_sigma ~ uniform(0, 0.01);
    hidden_voting_intention[1] ~ normal(y_start, 1);
    for(i in 2:n_span) {
        hidden_voting_intention[i] ~ normal(hidden_voting_intention[i-1], daily_sigma);
    }

    // -- observational model
    for(poll in 1:n_polls) {
        y[poll] ~ normal(houseEffect[house[poll]] + hidden_voting_intention[day[poll]], sampleSigma[poll]);
    }
}
"""

# Write the Stan program to disk, then compile it.
filepath = STAN_DIR / "model.stan"
with filepath.open("w") as f:
    f.write(model_text)
model = CmdStanModel(
    stan_file=filepath,
)
print(model)
print(model.exe_info())
print("Model compiled successfully.")

12:50:41 - cmdstanpy - INFO - compiling stan file /Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model.stan to exe file /Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model
12:50:45 - cmdstanpy - INFO - compiled model executable: /Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model


CmdStanModel: name=model
	 stan_file=/Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model.stan
	 exe_file=/Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model
	 compiler_options=stanc_options={}, cpp_options={}
{'stan_version_major': '2', 'stan_version_minor': '36', 'stan_version_patch': '0', 'STAN_THREADS': 'false', 'STAN_MPI': 'false', 'STAN_OPENCL': 'false', 'STAN_NO_RANGE_CHECKS': 'false', 'STAN_CPP_OPTIMS': 'false'}
Model compiled successfully.


### Fit the data to the model

In [7]:
def get_data_inputs(field='2pp vote ALP') -> tuple[dict[str, Any], int, float]:
    """Compile the Stan data inputs for one polling series.

    Reads the notebook-level ``data['voting-intention']`` table, centres the
    chosen column on its mean, divides by a fixed scale, and packages the
    result together with 1-based house and day indices.

    Args:
        field: name of the column in the voting-intention table to model.

    Returns:
        A tuple ``(model_inputs, scale, mean_offset)``: the JSON-serialisable
        dict of model inputs, plus the scale and mean offset that were applied
        to the column.

    Raises:
        ValueError: if the chosen column contains missing values, which would
            otherwise be written into the model data.
    """

    polls = data['voting-intention']

    mean_offset = float(polls[field].mean())
    scale = 5  # pick a scale to make the data within the range -2 to 2
    y = (polls[field] - mean_offset) / scale  # scale to -2 to 2
    if y.isna().any():
        raise ValueError(
            f"Column {field!r} has {int(y.isna().sum())} missing value(s); "
            "drop or fill them before building the model inputs."
        )

    # Initial guess for the hidden variable: mean of the first 10 rows
    # (assumes the table is in date order - TODO confirm).
    y_start = float(y.iloc[:10].mean())
    n_polls = len(y)

    # Integer code per pollster; the count of distinct codes is the house count.
    house = polls['Brand'].astype("category").cat.codes
    n_houses = int(house.nunique())

    # Day offset of each poll from the earliest mean date. `.n` reads the count
    # from each difference (assumes "Mean Date" is a Period column - TODO confirm).
    first_date = polls["Mean Date"].min()
    day = (polls["Mean Date"] - first_date).apply(lambda x: x.n)
    n_span = int(day.max() + 1)  # plain int: JSON cannot serialise numpy ints

    # print some stats
    print(f"{n_polls=}, {n_houses=}, {n_span=}, {mean_offset=}, {scale=},\n" +
    f"{y_start=:0.3f} {y.max()=:0.3f}, {y.min()=:0.3f}, {y.mean()=:0.3f}, {y.std()=:0.3f}")

    model_inputs = {
        # data size
        "n_polls": n_polls,
        "n_span": n_span,
        "n_houses": n_houses,

        # poll data
        "y_start": y_start,
        "y": [float(x) for x in y],
        "sampleSigma": [2 / scale] * n_polls,  # 2% error for testing purposes
        "house": [int(x + 1) for x in house],  # shift to 1-based indices
        "day": [int(x + 1) for x in day],  # shift to 1-based indices
    }

    return model_inputs, scale, mean_offset


field = "2pp vote ALP"
model_inputs, scale, mean_offset = get_data_inputs(field)

# Write the inputs to a JSON file and pass its path to the sampler.
datapath = STAN_DIR / f"data-{field}.json"
with datapath.open("w") as f:
    json.dump(model_inputs, f)

# Fixed seed so that repeated runs of this cell draw the same samples.
SAMPLER_SEED = 42

fit = model.sample(
    data=datapath,
    seed=SAMPLER_SEED,
    show_console=True,
    iter_sampling=1000,
    iter_warmup=500,
    chains=1,
)

12:50:45 - cmdstanpy - INFO - Chain [1] start processing
12:50:45 - cmdstanpy - INFO - Chain [1] done processing
12:50:45 - cmdstanpy - ERROR - Chain [1] error: error during processing Operation not permitted


n_polls=221, n_houses=14, n_span=943, mean_offset=51.98564200690766, scale=5,
y_start=0.832 y.max()=1.903, y.min()=-1.035, y.mean()=-0.000, y.std()=0.581
Chain [1] method = sample (Default)
Chain [1] sample
Chain [1] num_samples = 1000 (Default)
Chain [1] num_warmup = 500
Chain [1] save_warmup = false (Default)
Chain [1] thin = 1 (Default)
Chain [1] adapt
Chain [1] engaged = true (Default)
Chain [1] gamma = 0.05 (Default)
Chain [1] delta = 0.8 (Default)
Chain [1] kappa = 0.75 (Default)
Chain [1] t0 = 10 (Default)
Chain [1] init_buffer = 75 (Default)
Chain [1] term_buffer = 50 (Default)
Chain [1] window = 25 (Default)
Chain [1] save_metric = false (Default)
Chain [1] algorithm = hmc (Default)
Chain [1] hmc
Chain [1] engine = nuts (Default)
Chain [1] nuts
Chain [1] max_depth = 10 (Default)
Chain [1] metric = diag_e (Default)
Chain [1] metric_file =  (Default)
Chain [1] stepsize = 1 (Default)
Chain [1] stepsize_jitter = 0 (Default)
Chain [1] num_chains = 1 (Default)
Chain [1] id = 1 (Defa

RuntimeError: Error during sampling:

Command and output files:
RunSet: chains=1, chain_ids=[1], num_processes=1
 cmd (chain 1):
	['/Users/bryanpalmer/Australian-Federal-Election-2025/notebooks/stan_work/model', 'id=1', 'random', 'seed=92061', 'data', 'file=stan_work/data-2pp vote ALP.json', 'output', 'file=/var/folders/96/8bhlz_x975z93glbxq_2_yqr0000gn/T/tmpn0nt88_c/modelrkmua7k_/model-20250119125045.csv', 'method=sample', 'num_samples=1000', 'num_warmup=500', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[1]
 per-chain output files (showing chain 1 only):
 csv_file:
	/var/folders/96/8bhlz_x975z93glbxq_2_yqr0000gn/T/tmpn0nt88_c/modelrkmua7k_/model-20250119125045.csv
 console_msgs (if any):
	/var/folders/96/8bhlz_x975z93glbxq_2_yqr0000gn/T/tmpn0nt88_c/modelrkmua7k_/model-20250119125045_0-stdout.txt

## Still to do

In [None]:
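# TODO: nothing is implemented here yet. `scale` and `mean_offset` are returned
# by get_data_inputs() above but are not yet used to convert the fit back to
# percentage points.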

## Finished

In [None]:
# Record the Python version, machine details and imported-package versions
# for this run (needs the `watermark` IPython extension to be installed).
%load_ext watermark
%watermark --python --machine --iversions --watermark

In [None]:
# Completion marker for a top-to-bottom run of the notebook.
print("Finished")