# Polling data capture from Wikipedia

**Purpose:**
 * Capture data from the Wiki Page on Opinion Polling
 * Adjust that data for anomalies (for example, ensureing voting intention sums to 100%)
 * Save the data as a bsis for further analysis

**Make sure to:**
 * run before doing any analysis; and
 * check the data validation before moving on to the analysis.

## Python setup

In [1]:
# system imports
from functools import cache

# analytic imports
import pandas as pd
from IPython.display import display

In [2]:
# local imports
import data_capture as dc
from common import ATTITUDINAL, VOTING_INTENTION, NSW, VIC, QLD, SA, WA, TAS, NT

In [3]:
STATES = [NSW, VIC, QLD, WA, SA, TAS, NT]

## Get raw polling data from Wikipedia

### Get all tables from the Wikipedia web page

In [4]:
@cache
def tables_from_wiki() -> list[pd.DataFrame]:
    """Get Opinion Polling Tables for the 2025 Australian
    Federal Election from the Wikipedia page. Use the @cache
    decorator to avoid repeated web-scraping (when testing).

    Note: web-scraping is fragile, and depends on the page maintaining the
    same structure.  This function will need to be updated if the Wikipedia
    page is reorganised."""

    url = (
        "https://en.wikipedia.org/wiki/"
        + "Opinion_polling_for_the_2025_Australian_federal_election"
    )
    df_list = dc.get_table_list(url)
    print(f"Total number of tables on page: {len(df_list)}")
    return df_list

In [5]:
def get_tables(verbose: bool = False) -> dict[str, pd.DataFrame]:
    """Scan through the captureed tables to find the ones we want.
    The tables are identified by the column names they contain
    and the order they appear on the Wikipedia page."""

    # capture the table-list from the Wikipedia page
    df_list = tables_from_wiki()
    print_tables = False
    if print_tables:
        for i, df in enumerate(df_list):
            print(f"Table {i}")
            display(df.head(3))

    MAX_ANNUAL = 4
    expected_tables = (
        # (label: str, max_tables: int, must_have_flattened_column_name: str)
        (VOTING_INTENTION, MAX_ANNUAL, "Primary vote ALP"),
        # Ignore the three AGE based tables
        (ATTITUDINAL, MAX_ANNUAL, "Preferred prime minister Albanese"),
        # Note: states must be in the same order as they appear on the Wiki page
        (NSW, 1, "Primary vote ALP"),
        (VIC, 1, "Primary vote ALP"),
        (QLD, 1, "Primary vote ALP"),
        (WA, 1, "Primary vote ALP"),
        (SA, 1, "Primary vote ALP"),
        (TAS, 1, "Primary vote ALP"),
        (NT, 1, "Primary vote ALP"),
    )

    def flatten(i: pd.MultiIndex) -> list[str]:
        return [(" ").join(a) if a[0] != a[1] else a[0] for a in i.to_flat_index()]

    def get_cols(n: int) -> list[str]:
        cols = df_list[n].columns
        if isinstance(cols, pd.MultiIndex):
            return flatten(cols)
        return df_list[n].columns.tolist()

    return_box = {}
    table_number = 0
    for label, multiplicity, column_name in expected_tables:
        print(f"Looking for {label} tables - {multiplicity=} {column_name=}")
        while table_number < len(df_list):
            found = []
            flat_cols = get_cols(table_number)
            table_number += 1
            if column_name in flat_cols:
                found.append(table_number - 1)
                if multiplicity > 1:
                    capture_count = 1
                    while table_number < len(df_list) and capture_count < multiplicity:
                        flat_cols = get_cols(table_number)
                        if column_name in flat_cols:
                            found.append(table_number)
                            table_number += 1
                            capture_count += 1
                        else:
                            break
                print(f"About to extract table number(s): {found}")
                combined = dc.get_combined_table(df_list, found, verbose=verbose)
                if combined is not None:
                    print(f"Found {label} table using wiki table(s): {found}")
                    return_box[label] = dc.clean(combined.copy())
                break  # from outer while loop, so we can search for next table(s)
    return return_box


data = get_tables(verbose=False)

Total number of tables on page: 43
Looking for voting-intention tables - multiplicity=4 column_name='Primary vote ALP'
About to extract table number(s): [1, 2, 3, 4]
Found voting-intention table using wiki table(s): [1, 2, 3, 4]
CHECK there may be a problem with these dates in get_dates(): 2025-12-30 00:00:00 2025-01-05 00:00:00 with these tokens ['30', 'Dec', '5', 'Jan', '2025']
--> assuming first day is 2024-12-30 00:00:00
Looking for attitudinal tables - multiplicity=4 column_name='Preferred prime minister Albanese'
About to extract table number(s): [5, 6, 7, 8]
Found attitudinal table using wiki table(s): [5, 6, 7, 8]
Looking for NSW tables - multiplicity=1 column_name='Primary vote ALP'
About to extract table number(s): [12]
Found NSW table using wiki table(s): [12]
Looking for VIC tables - multiplicity=1 column_name='Primary vote ALP'
About to extract table number(s): [15]
Found VIC table using wiki table(s): [15]
Looking for QLD tables - multiplicity=1 column_name='Primary vote 

  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines
  .replace("", np.nan)  # NaN empty lines


### Quick look at most recent N polls

In [6]:
def quick_look(n=5):
    """
    Display the last n rows of each table.
    """

    for label_, table_ in data.items():
        print(f"{label_}:")
        display(table_.tail(n))
        print()


quick_look()

voting-intention:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote TOP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,Primary vote UAP,First Date,Mean Date,Last Date
258,24–30 Mar 2025,Roy Morgan,Online,1377.0,35.0,32.0,13.0,5.5,,14.5,,53.0,47.0,,2025-03-24,2025-03-27,2025-03-30
259,27–29 Mar 2025,Newspoll,Online,1249.0,37.0,33.0,12.0,6.0,,12.0,,51.0,49.0,,2025-03-27,2025-03-28,2025-03-29
260,26–30 Mar 2025,Essential,Online,1100.0,34.0,30.0,12.0,9.0,2.0,8.0,5.0,48.0,47.0,,2025-03-26,2025-03-28,2025-03-30
261,26–30 Mar 2025,Resolve Strategic,Online,3237.0,37.0,29.0,13.0,7.0,,14.0,,50.0,50.0,,2025-03-26,2025-03-28,2025-03-30
262,28–30 Mar 2025,Freshwater Strategy,Online,1059.0,39.0,32.0,12.0,,,17.0,,49.0,51.0,,2025-03-28,2025-03-29,2025-03-30



attitudinal:


Unnamed: 0,Date,Firm,Interview mode,Sample,Preferred prime minister Albanese,Preferred prime minister Dutton,Preferred prime minister Don't Know,Preferred prime minister Net,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,First Date,Mean Date,Last Date
146,17–21 Mar 2025,DemosAU,Online,,40.0,37.0,23.0,3.0,,,,,,,,,2025-03-17,2025-03-19,2025-03-21
147,27–29 Mar 2025,Newspoll,Online,,49.0,38.0,13.0,11.0,43.0,52.0,5.0,-9.0,37.0,55.0,8.0,-18.0,2025-03-27,2025-03-28,2025-03-29
148,26–30 Mar 2025,Essential,Online,,,,,,41.0,47.0,12.0,-6.0,44.0,46.0,10.0,-2.0,2025-03-26,2025-03-28,2025-03-30
149,26–30 Mar 2025,Resolve Strategic,Online,,42.0,33.0,25.0,9.0,39.0,49.0,12.0,-10.0,37.0,47.0,16.0,-10.0,2025-03-26,2025-03-28,2025-03-30
150,28–30 Mar 2025,Freshwater Strategy,Online,,46.0,45.0,9.0,1.0,37.0,49.0,14.0,-12.0,37.0,47.0,16.0,-10.0,2025-03-28,2025-03-29,2025-03-30



NSW:


Unnamed: 0,Date,Firm,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote IND,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
42,21–27 Feb 2025,YouGov,506.0,35.0,26.0,15.0,12.0,,,12.0,48.5,51.5,2025-02-21,2025-02-24,2025-02-27
43,28 Feb – 6 Mar 2025,YouGov,475.0,35.0,29.0,15.0,7.0,,,14.0,50.0,50.0,2025-02-28,2025-03-03,2025-03-06
44,7–13 Mar 2025,YouGov,482.0,36.0,28.0,15.5,7.0,,,13.5,49.5,50.5,2025-03-07,2025-03-10,2025-03-13
45,14–19 Mar 2025,YouGov,773.0,36.0,28.0,16.0,7.0,,,13.0,49.5,50.5,2025-03-14,2025-03-16,2025-03-19
46,26–30 Mar 2025,Resolve Strategic,1027.0,40.0,30.0,11.0,6.0,,10.0,4.0,47.0,53.0,2025-03-26,2025-03-28,2025-03-30



VIC:


Unnamed: 0,Date,Firm,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote UAP,Primary vote ONP,Primary vote IND,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
44,28 Feb – 6 Mar 2025,YouGov,379.0,39.0,30.0,9.0,,6.0,,16.0,48.0,52.0,2025-02-28,2025-03-03,2025-03-06
45,7–13 Mar 2025,YouGov,384.0,41.5,28.5,13.0,,5.0,,12.0,47.5,52.5,2025-03-07,2025-03-10,2025-03-13
46,14–19 Mar 2025,YouGov,474.0,42.0,29.0,13.0,,5.0,,11.0,47.5,52.5,2025-03-14,2025-03-16,2025-03-19
47,17–21 Mar 2025,DemosAU,1006.0,34.0,29.0,15.0,,8.0,,14.0,51.0,49.0,2025-03-17,2025-03-19,2025-03-21
48,26–30 Mar 2025,Resolve Strategic,817.0,35.0,27.0,14.0,,6.0,10.0,7.0,49.5,50.5,2025-03-26,2025-03-28,2025-03-30



QLD:


Unnamed: 0,Date,Firm,Sample size,Primary vote LNP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote IND,Primary vote OTH,2pp vote LNP,2pp vote ALP,First Date,Mean Date,Last Date
43,18–23 Feb 2025,Resolve Strategic,306.0,41.0,25.0,12.0,8.0,,9.0,4.0,56.5,43.5,2025-02-18,2025-02-20,2025-02-23
44,21–27 Feb 2025,YouGov,302.0,47.0,23.0,8.0,10.0,,,12.0,62.0,38.0,2025-02-21,2025-02-24,2025-02-27
45,7–13 Mar 2025,YouGov,312.0,34.0,29.5,15.5,10.5,,,10.5,50.5,49.5,2025-03-07,2025-03-10,2025-03-13
46,14–19 Mar 2025,YouGov,378.0,34.0,29.0,15.0,10.0,,,12.0,51.0,49.0,2025-03-14,2025-03-16,2025-03-19
47,26–30 Mar 2025,Resolve Strategic,658.0,39.0,25.0,13.0,12.0,,6.0,6.0,56.5,43.5,2025-03-26,2025-03-28,2025-03-30



WA:


Unnamed: 0,Date,Firm,Sample size,Primary vote ALP,Primary vote L/NP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
13,20 Jan – 7 Mar 2025,Newspoll,374.0,37.0,34.0,11.0,9.0,,9.0,,54.0,46.0,2025-01-20,2025-02-12,2025-03-07
14,17 Jan – 15 Mar 2025,Freshwater Strategy,318.0,33.0,35.0,15.0,,,17.0,,52.0,48.0,2025-01-17,2025-02-14,2025-03-15
15,4–5 March 2025,DemosAU,1126.0,36.0,38.0,11.0,6.0,,9.0,,52.0,48.0,2025-03-04,2025-03-04,2025-03-05
16,7–13 Mar 2025,YouGov,155.0,42.5,34.5,8.0,9.0,,6.0,,54.0,46.0,2025-03-07,2025-03-10,2025-03-13
17,14–19 Mar 2025,YouGov,111.0,43.0,34.0,8.0,9.0,,6.0,,54.0,46.0,2025-03-14,2025-03-16,2025-03-19



SA:


Unnamed: 0,Date,Firm,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
9,7 Oct – 6 Dec 2024,Newspoll,280.0,37.0,35.0,9.0,7.0,,12.0,,53.0,47.0,2024-10-07,2024-11-06,2024-12-06
10,20 Jan – 7 Mar 2025,Newspoll,271.0,37.0,32.0,11.0,11.0,,9.0,,50.0,50.0,2025-01-20,2025-02-12,2025-03-07
11,18–23 Feb 2025,DemosAU,440.0,35.0,34.0,11.0,6.0,,14.0,,53.0,47.0,2025-02-18,2025-02-20,2025-02-23
12,7–13 Mar 2025,YouGov,113.0,33.5,33.0,9.5,8.0,,16.0,,51.0,49.0,2025-03-07,2025-03-10,2025-03-13
13,14–19 Mar 2025,YouGov,307.0,33.5,32.5,10.0,8.0,,16.0,,50.5,49.5,2025-03-14,2025-03-16,2025-03-19



TAS:


Unnamed: 0,Date,Firm,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote JLN,Primary vote UAP,Primary vote IND,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
0,21 Jun 2022,Roy Morgan,,,,,,,,,63.0,37.0,2022-06-21,2022-06-21,2022-06-21
1,28 Aug – 12 Oct 2023,Newspoll,366.0,25.0,30.0,13.0,,,,27.0,57.0,43.0,2023-08-28,2023-09-19,2023-10-12
2,1 Feb – 26 May 2024,Accent/RedBridge,107.0,30.0,29.0,15.0,,,,26.0,54.0,46.0,2024-02-01,2024-03-29,2024-05-26
3,10 Jul – 27 Aug 2024,Accent/RedBridge,107.0,35.0,25.0,12.0,,,,28.0,48.0,52.0,2024-07-10,2024-08-03,2024-08-27



NT:


Unnamed: 0,Date,Firm,Sample size,Primary vote CLP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote IND,Primary vote OTH,2pp vote ALP,2pp vote CLP,First Date,Mean Date,Last Date
0,16–18 Nov 2023,Redbridge Group,601.0,40.4,22.2,11.1,11.7,7.2,7.4,43.9,56.1,2023-11-16,2023-11-17,2023-11-18





### Standardise column names

In [7]:
def fix_col_names(input_: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    Standardise the column names in the tables. For some reason,
    the voting intention and attitudinal tables at Wikipedia have
    different column names for the same information.
    """

    fix = {
        # from : to
        "Firm": "Brand",
        "Sample": "Sample size",
    }

    output = {}
    for label_, table_ in input_.items():
        for old_col, new_col in fix.items():
            fix_me_list = table_.columns[
                table_.columns.str.contains(old_col, case=False)
            ]
            if len(fix_me_list) == 1:
                fix_me_string = fix_me_list[0]
                table_ = table_.rename(columns={fix_me_string: new_col})
                print(f"{label_} fixed col from {fix_me_string} to {new_col}")
                output[label_] = table_

    return output


data = fix_col_names(data)
data.keys()

voting-intention fixed col from Sample size to Sample size
attitudinal fixed col from Firm to Brand
attitudinal fixed col from Sample to Sample size
NSW fixed col from Firm to Brand
NSW fixed col from Sample size to Sample size
VIC fixed col from Firm to Brand
VIC fixed col from Sample size to Sample size
QLD fixed col from Firm to Brand
QLD fixed col from Sample size to Sample size
WA fixed col from Firm to Brand
WA fixed col from Sample size to Sample size
SA fixed col from Firm to Brand
SA fixed col from Sample size to Sample size
TAS fixed col from Firm to Brand
TAS fixed col from Sample size to Sample size
NT fixed col from Firm to Brand
NT fixed col from Sample size to Sample size


dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])

### Remove MRP polls
MRP = multi-regression post-stratification polls

In [8]:
def remove_mrp_polls(input_: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    Remove aggregated 'polls' conducted by Multilevel
    Regression and Post-stratification (MRP).
    """

    output = {}

    for label_, table_ in input_.items():
        if label_ in STATES:
            # Happy to keep the MRP polls for the states
            output[label_] = table_
            continue

        # drop by Brand Name
        drop_bool = table_["Brand"].str.contains("Accent", na=False) & table_[
            "Brand"
        ].str.contains("RedBridge", na=False)
        drop_index = drop_bool[drop_bool].index

        # drop bt MRP label
        drop_bool = table_["Brand"].str.contains("MRP", na=False)
        drop_index = drop_index.union(drop_bool[drop_bool].index)

        if len(drop_index) > 0:
            # adjust the table
            print(f"{label_} MRP about to drop:")
            display(table_.loc[drop_index])
            table_ = table_.drop(drop_index)

        # In future we might want to drop by other criteria

        # save the table
        output[label_] = table_

    return output


data = remove_mrp_polls(data)
data.keys()

voting-intention MRP about to drop:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote TOP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,Primary vote UAP,First Date,Mean Date,Last Date
165,10 Jul – 27 Aug 2024,Accent Research/ RedBridge Group,Online,5976.0,38.0,32.0,12.0,,,18.0,,50.0,50.0,,2024-07-10,2024-08-03,2024-08-27
205,29 Oct – 20 Nov 2024,Accent Research/ RedBridge Group,Online,4909.0,39.0,31.0,11.0,,,19.0,,49.0,51.0,,2024-10-29,2024-11-09,2024-11-20
232,22 Jan – 12 Feb 2025,YouGov MRP,Online,8732.0,37.4,29.1,12.7,9.1,,11.7,,48.9,51.1,,2025-01-22,2025-02-01,2025-02-12
235,4–11 Feb 2025,RedBridge/Accent,Online,1002.0,43.0,33.0,12.0,,,12.0,,48.0,52.0,,2025-02-04,2025-02-07,2025-02-11
242,20–25 Feb 2025,RedBridge/Accent,Online,1002.0,41.0,34.0,12.0,,,13.0,,49.5,50.5,,2025-02-20,2025-02-22,2025-02-25
251,27 Feb – 26 Mar 2025,YouGov MRP,Online,10217.0,35.5,29.8,13.2,9.3,,12.2,,50.2,49.8,,2025-02-27,2025-03-12,2025-03-26


attitudinal MRP about to drop:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Preferred prime minister Albanese,Preferred prime minister Dutton,Preferred prime minister Don't Know,Preferred prime minister Net,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,First Date,Mean Date,Last Date
130,4–11 Feb 2025,RedBridge/Accent,Online,,,,,,29.0,45.0,26.0,-16.0,31.0,42.0,27.0,-11.0,2025-02-04,2025-02-07,2025-02-11
134,20–25 Feb 2025,RedBridge/Accent,Online,,,,,,30.0,43.0,27.0,-13.0,39.0,43.0,18.0,-4.0,2025-02-20,2025-02-22,2025-02-25


dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])

In [9]:
def check_for_large_samples(
    data_dict: dict[str, pd.DataFrame],
    threshold: int = 3000,
) -> None:
    """
    Check for unusually large sample sizes - may be MRP polling.
    """

    for name, t in data_dict.items():
        sample_col = t.columns[t.columns.str.contains("sample", case=False)][0]
        odd = t.index[t[sample_col].notna() & (t[sample_col] >= threshold)]
        print(odd)
        if len(odd):
            print(
                f"{name}: --CHECK-- Based on sample size, these rows might be MRP data:"
            )
            display(t.loc[odd])
            print("=" * 40)


check_for_large_samples(data)

Index([62, 172, 197, 261], dtype='int64')
voting-intention: --CHECK-- Based on sample size, these rows might be MRP data:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote TOP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,Primary vote UAP,First Date,Mean Date,Last Date
62,22 Sep – 4 Oct 2023,Resolve Strategic,Online,4728.0,31.0,37.0,12.0,7.0,,11.0,,57.0,43.0,2.0,2023-09-22,2023-09-28,2023-10-04
172,6–29 Aug 2024,Wolf & Smith,Online,10239.0,36.0,29.0,13.0,6.0,,15.0,,51.0,49.0,,2024-08-06,2024-08-17,2024-08-29
197,14–25 Oct 2024,ANU,Online,3622.0,38.2,31.8,11.8,,,,9.5,50.0,50.0,,2024-10-14,2024-10-19,2024-10-25
261,26–30 Mar 2025,Resolve Strategic,Online,3237.0,37.0,29.0,13.0,7.0,,14.0,,50.0,50.0,,2025-03-26,2025-03-28,2025-03-30


Index([0, 38], dtype='int64')
attitudinal: --CHECK-- Based on sample size, these rows might be MRP data:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Preferred prime minister Albanese,Preferred prime minister Dutton,Preferred prime minister Don't Know,Preferred prime minister Net,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,First Date,Mean Date,Last Date
0,23–31 May 2022,Morning Consult,Online,3770.0,,,,,51.0,24.0,25.0,27.0,,,,,2022-05-23,2022-05-27,2022-05-31
38,29 May – 12 Jun 2023,CT Group,Online,3000.0,,,,,42.0,36.0,22.0,6.0,,,,,2023-05-29,2023-06-05,2023-06-12


Index([], dtype='int64')
Index([], dtype='int64')
Index([], dtype='int64')
Index([], dtype='int64')
Index([], dtype='int64')
Index([], dtype='int64')
Index([], dtype='int64')


## Preliminary data validation

### Distribute undecideds if the pollster has not

Mostly affects the Essential poll.

In [10]:
def distribute_undecided(data_dict: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    Distribute undecided voters to the primary and 2pp vote.

    Note: Essential often does not distribute undecideds to
    the 2pp Vote share.
    """

    if dc.UNDECIDED_COLUMN in data_dict[VOTING_INTENTION]:
        revised = dc.distribute_undecideds(
            table=data_dict[VOTING_INTENTION].copy(),
            col_pattern_list=["Primary vote", "2pp vote"],
        )
        revised = revised.drop(columns=dc.UNDECIDED_COLUMN)
        data_dict[VOTING_INTENTION] = revised
    else:
        print("CHECK: this step was not applied")
        print("Most likely because it has already been applied.")

    return data_dict


data = distribute_undecided(data)
data.keys()

For Primary vote distributed undecideds over 22.96% of rows.
For 2pp vote distributed undecideds over 22.96% of rows.


dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])

### Add in Primary Other if the pollster has not

In [11]:
def fix_primary_other(data_dict: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    Ensure that the 'Primary vote OTH' column is data populated.
    """

    other = "Primary vote OTH"
    majors = ["L/NP", "ALP", "GRN"]
    minor_p = [
        x
        for x in data[VOTING_INTENTION].columns
        if "Primary" in x and not any(z in x for z in majors)
    ]
    major_p = [
        x
        for x in data[VOTING_INTENTION].columns
        if "Primary" in x and any(z in x for z in majors)
    ]

    rows = data[VOTING_INTENTION][minor_p].isna().sum(axis=1) == len(minor_p)
    if rows.sum() > 0:
        print("Changed from ...")
        display(data[VOTING_INTENTION].loc[rows])
        data[VOTING_INTENTION].loc[rows, other] = 100 - data[VOTING_INTENTION].loc[
            rows, major_p
        ].sum(axis=1)

        print("Changed to ...")
        display(data[VOTING_INTENTION].loc[rows])

    return data_dict


data = fix_primary_other(data)
print(data.keys())

Changed from ...


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote TOP,Primary vote OTH,2pp vote ALP,2pp vote L/NP,Primary vote UAP,First Date,Mean Date,Last Date
197,14–25 Oct 2024,ANU,Online,3622.0,42.63643,35.493154,13.170416,,,,54.75,54.75,,2024-10-14,2024-10-19,2024-10-25


Changed to ...


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote TOP,Primary vote OTH,2pp vote ALP,2pp vote L/NP,Primary vote UAP,First Date,Mean Date,Last Date
197,14–25 Oct 2024,ANU,Online,3622.0,42.63643,35.493154,13.170416,,,8.7,54.75,54.75,,2024-10-14,2024-10-19,2024-10-25


dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])


## Forced data normalisation

Force columns that should sum to 100 to sum to 100. 
(But we only normalise if the sum is <99 or >101.)

This is a very aggressive treatment, and the rows being forced into
submission need to be considered and reflected upon from time to time.

In [12]:
# All the columns that match the CHECKABLE_100 patterns
# are checked to ensure that they sum to 100. If not, they
# are upweighted/downweighted to sum to 100.

CHECKABLE_100: dict[str, list[str]] = {
    # label: [list of regex-patterns],
    VOTING_INTENTION: [
        r"Primary",
        r"2pp",
    ],
    ATTITUDINAL: [
        r"^Dutton (Satisfied|Dissatisfied|Don't Know)",
        r"^Albanese (Satisfied|Dissatisfied|Don't Know)",
        r"Preferred [Pp]rime [Mm]inister (Dutton|Albanese|Don't Know)",
    ],
}
for state in STATES:
    CHECKABLE_100[state] = CHECKABLE_100[VOTING_INTENTION].copy()

data = dc.normalise(data, CHECKABLE_100, verbose=False)

print(data.keys())

17.90% of rows need normalisation - for voting-intention, Primary.
7.39% of rows need normalisation - for voting-intention, 2pp.
9.40% of rows need normalisation - for attitudinal, ^Dutton (Satisfied|Dissatisfied|Don't Know).
14.09% of rows need normalisation - for attitudinal, ^Albanese (Satisfied|Dissatisfied|Don't Know).
1.34% of rows need normalisation - for attitudinal, Preferred [Pp]rime [Mm]inister (Dutton|Albanese|Don't Know).
31.91% of rows need normalisation - for NSW, Primary.
0.00% of rows need normalisation - for NSW, 2pp.
30.61% of rows need normalisation - for VIC, Primary.
0.00% of rows need normalisation - for VIC, 2pp.
31.25% of rows need normalisation - for QLD, Primary.
0.00% of rows need normalisation - for QLD, 2pp.
5.56% of rows need normalisation - for WA, Primary.
0.00% of rows need normalisation - for WA, 2pp.
0.00% of rows need normalisation - for SA, Primary.
0.00% of rows need normalisation - for SA, 2pp.
25.00% of rows need normalisation - for TAS, Primary

### Recalculate Net attitudinal

In [13]:
# still to write this function

## Manage methodology changes

In [14]:
def methodology_change(data_dict: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    If a pollster firm substantially change the way in which they
    collect data we need to reflect this in the branding for the poll.
    """

    # Essential added education into its weighting
    # from the last poll in October 2023.
    effective_date = "2023-10-24"
    change_from = "Essential"
    change_to = "Essential 2"
    data_dict = dc.methodology(data_dict, effective_date, change_from, change_to)

    # Resolve Strategic appears to have changed in 2024
    effective_date = "2024-01-01"
    change_from = "Resolve Strategic"
    change_to = "Resolve Strategic 2"
    data_dict = dc.methodology(data_dict, effective_date, change_from, change_to)

    # Newspoll changed its methodology in 2025
    effective_date = "2025-02-01"
    change_from = "Newspoll"
    change_to = "Newspoll 2"
    data_dict = dc.methodology(data_dict, effective_date, change_from, change_to)

    return data_dict


data = methodology_change(data)
print(data.keys())

dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])


## Final data validation

Please check any rows identified as a result of this step.

In [15]:
def final_check(data_dict: dict[str, pd.DataFrame]) -> None:
    """Check the final data for any anomalies."""

    for label, check_list in CHECKABLE_100.items():
        # check for primary vote columns where:
        # - major party voting is too high
        # - green vote is too low or too high

        too_much_major_party_vote = 80
        too_little_major_party_vote = 55
        green_too_low = 6
        green_too_high = 17

        if "Primary" in check_list:

            # check major primary vote share
            table = data_dict[label]
            m = table.columns[table.columns.str.contains("Primary")]
            m = m[m.str.contains("ALP|L/NP|CLP|LNP")]
            row_sums = table[m].sum(axis=1, skipna=True)  # ignore NaNs
            t = table.loc[
                (row_sums > 0)  # ignore how NaN rows are row-summed
                & (
                    (row_sums < too_little_major_party_vote)
                    | (row_sums > too_much_major_party_vote)
                )
            ]
            if not t.empty:
                print(f"{label} {check_list}: Major Primary Vote looks PROBLEMATIC!")
                display(t)

            # check Green vote is too low or too high
            g = table["Primary vote GRN"]
            t = table.loc[(g < green_too_low) | (g > green_too_high)]
            if not t.empty:
                print(f"{label} {check_list}: Green Primary Vote looks PROBLEMATIC!")
                display(t)

        # check row sums to 100
        row_check: pd.DataFrame | None = dc.row_sum_check(
            data_dict[label], check_list, tolerance=1.01
        )
        if row_check is None or row_check.empty:
            print(f"Row-sum check: {label} {check_list} looks good.\n")
            continue
        print(label, check_list)
        display(row_check)
        print("\n")


final_check(data)

Row-sum check: voting-intention ['Primary', '2pp'] looks good.

Row-sum check: attitudinal ["^Dutton (Satisfied|Dissatisfied|Don't Know)", "^Albanese (Satisfied|Dissatisfied|Don't Know)", "Preferred [Pp]rime [Mm]inister (Dutton|Albanese|Don't Know)"] looks good.

Row-sum check: NSW ['Primary', '2pp'] looks good.

VIC ['Primary', '2pp']: Major Primary Vote looks PROBLEMATIC!


Unnamed: 0,Date,Brand,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote UAP,Primary vote ONP,Primary vote IND,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
38,4–11 Feb 2025,RedBridge/Accent,261.0,51.0,31.0,8.0,,,,10.0,42.0,58.0,2025-02-04,2025-02-07,2025-02-11


Row-sum check: VIC ['Primary', '2pp'] looks good.

Row-sum check: QLD ['Primary', '2pp'] looks good.

WA ['Primary', '2pp']: Major Primary Vote looks PROBLEMATIC!


Unnamed: 0,Date,Brand,Sample size,Primary vote ALP,Primary vote L/NP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
9,1–10 Oct 2024,Redbridge,1514.0,50.724638,49.275362,,,,,,54.5,45.5,2024-10-01,2024-10-05,2024-10-10


Row-sum check: WA ['Primary', '2pp'] looks good.

Row-sum check: SA ['Primary', '2pp'] looks good.

Row-sum check: TAS ['Primary', '2pp'] looks good.

Row-sum check: NT ['Primary', '2pp'] looks good.



In [16]:
print(data.keys())

dict_keys(['voting-intention', 'attitudinal', 'NSW', 'VIC', 'QLD', 'WA', 'SA', 'TAS', 'NT'])


## Save the checked data

In [17]:
dc.store(data)

## All done

In [18]:
%load_ext watermark
%watermark --python --machine --conda --iversions --watermark

Python implementation: CPython
Python version       : 3.13.2
IPython version      : 9.0.2

conda environment: 313

Compiler    : Clang 18.1.8 
OS          : Darwin
Release     : 24.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

pandas : 2.2.3
IPython: 9.0.2

Watermark: 2.5.0



In [19]:
print("Finished")

Finished
