# Polling data capture from Wikipedia

**Make sure to:**
 * run before doing any analysis; and
 * check the data validation before moving on to the analysis.

## Python setup

In [1]:
# analytic imports
import pandas as pd

In [2]:
# local imports
import data_capture as dc
from common import ATTITUDINAL, MIDDLE_DATE, VOTING_INTENTION

## Get raw polling data from Wikipedia

Note: web-scraping of data is fragile. 
This code will need to be checked from time to time.

### Get all tables from the Wikipedia web page

In [3]:
# Wikipedia page carrying the opinion-polling tables for this cycle.
# Earlier page name, kept for reference (changed November 2025):
#   "Opinion_polling_for_the_next_Australian_federal_election"
URL = (
    "https://en.wikipedia.org/wiki/"
    "Opinion_polling_for_the_2025_Australian_federal_election"
)

# Every table scraped from the page, in page order; the table numbers
# used later in this notebook are positions in this list.
df_list = dc.get_table_list(URL)
print(f"Total number of tables on page: {len(df_list)}")

Total number of tables on page: 42


In [4]:
# KEEP CELL - lists the columns of every table scraped from the Wiki page,
# which helps when the table numbers need to be re-identified.
if False:  # flip to True to print
    for position, frame in enumerate(df_list):
        print(f"{position}: {frame.columns}\n")

### Data select, merge and clean

Note: For this election cycle, Wikipedia has separate tables for each
calendar year. The table numbers below will need to be updated each year.

In [5]:
# Positions (within df_list) of the tables to combine for each data set.
# Revisit these each year, and whenever the Wikipedia page is reorganised
# in a way that renumbers the tables.
VOTING_TABLES = (3, 4, 5, 6)
ATTITUDINAL_TABLES = (7, 8, 9)

prep = {VOTING_INTENTION: VOTING_TABLES, ATTITUDINAL: ATTITUDINAL_TABLES}
data = {}
for label in prep:
    table_list = prep[label]
    print("Collating:", label, table_list)
    # Combine the selected tables, then clean a copy of the result.
    combined = dc.get_combined_table(df_list, table_list, verbose=False)
    table = dc.clean(combined.copy())
    data[label] = table
    print(f"{label}: {len(table)} rows {table.index}")

Collating: voting-intention (3, 4, 5, 6)


AttributeError: 'DataFrame' object has no attribute 'str'

### Quick look at most recent N polls

In [None]:
# Show the last N rows of each data set
# (swap tail for head to inspect the other end of the table).
N = 3
for name, frame in data.items():
    print(f"{name}:")
    display(frame.tail(N))
    print()

### Standardise column names

In [None]:
# Column-name fragments to look for, mapped to the standard name to use.
fix = {
    # from : to
    "Firm": "Brand",
    "Sample": "Sample size",
}

for label, table in data.items():
    for old_col, new_col in fix.items():
        # Case-insensitive match of the fragment against the column names.
        fix_me_list = table.columns[table.columns.str.contains(old_col, case=False)]
        if len(fix_me_list) == 1:
            fix_me_string = fix_me_list[0]
            table = table.rename(columns={fix_me_string: new_col})
            print(f"{label} fixed col from {fix_me_string} to {new_col}")
            data[label] = table
        elif len(fix_me_list) > 1:
            # Ambiguous: renaming is skipped, but say so rather than fail
            # later with an unexplained KeyError on the standard name.
            print(
                f"{label}: --CHECK-- {len(fix_me_list)} columns match "
                f"'{old_col}' ({list(fix_me_list)}); nothing renamed to '{new_col}'"
            )
        elif new_col not in table.columns:
            # Nothing matched and the standard name is not already present.
            print(
                f"{label}: --CHECK-- no column matching '{old_col}' "
                f"and no '{new_col}' column found"
            )

### Remove MRP polls
MRP = multilevel regression with post-stratification polls

In [None]:
# Drop the rows treated as MRP polls: those whose 'Brand' mentions BOTH
# Accent Research and RedBridge (missing brands never match).
# NOTE(review): the earlier comment described this as dropping by MRP
# interview mode; the test is actually on 'Brand' - confirm this is intended.
for label, table in data.items():
    brand = table['Brand']
    names_accent = brand.str.contains('Accent Research', na=False)
    names_redbridge = brand.str.contains('RedBridge', na=False)
    drop_bool = names_accent & names_redbridge
    drop_index = drop_bool[drop_bool].index

    if len(drop_index) == 0:
        continue

    # Show what is about to go, then store the reduced table.
    print(f"{label} MRP about to drop:")
    display(table.loc[drop_index])
    table = table.drop(drop_index)
    data[label] = table

In [None]:
# Check for unusually large sample sizes - may be MRP polling
SAMPLE_CHECK = 3000

for label, table in data.items():
    # First column whose name contains "sample" (case-insensitive).
    sample_cols = table.columns[table.columns.str.contains("sample", case=False)]
    if len(sample_cols) == 0:
        # Without a sample column the check cannot run; report and move on
        # instead of failing with an IndexError.
        print(f"{label}: --CHECK-- no sample size column found, MRP check skipped")
        continue
    sample_col = sample_cols[0]

    # Rows with a known sample size at or above the threshold.
    odd = table.index[table[sample_col].notna() & (table[sample_col] >= SAMPLE_CHECK)]
    if len(odd):
        print(f"{label}: --CHECK-- Based on sample size, these rows might be MRP data:")
        display(table.loc[odd])
        print('=' * 40)


## Preliminary data validation

Note: Essential often does not distribute undecideds to the 2pp vote share.

In [10]:
# Identify the groups of columns that should add across to 100
# We use this mechanism a few times below.
#
# Keys are the data-set labels used in `data`; each value is a list of
# regex patterns, one pattern per group of columns. The patterns are handed
# to dc.row_sum_check and dc.normalise later in this notebook.

checkable_100: dict[str, list[str]] = {
    # label: [list of regex-patterns],
    VOTING_INTENTION: [
        r"Primary",
        r"2pp",
    ],
    ATTITUDINAL: [
        # leader satisfaction ratings, then preferred prime minister
        r"^Dutton (Satisfied|Dissatisfied|Don't Know)",
        r"^Albanese (Satisfied|Dissatisfied|Don't Know)",
        r"Preferred Prime Minister (Dutton|Albanese|Don't Know)",
    ],
}

In [11]:
# Early look at whether each column group actually adds across to 100

if False:  # not always useful at this point
    for label in checkable_100:
        display(dc.row_sum_check(data[label], checkable_100[label]))

## Distribute undecideds if the pollster has not

Mostly affects the Essential poll.

In [None]:
# The step only applies while the undecided column is still in the table.
if dc.UNDECIDED_COLUMN not in data[VOTING_INTENTION]:
    print("CHECK: this step was not applied")
    print("Most likely because it has already been applied.")
else:
    # Work on a copy, then discard the undecided column from the result.
    distributed = dc.distribute_undecideds(
        table=data[VOTING_INTENTION].copy(),
        col_pattern_list=["Primary vote", "2pp vote"],
    )
    revised = distributed.drop(columns=dc.UNDECIDED_COLUMN)
    data[VOTING_INTENTION] = revised

## Forced data normalisation

Force columns that should sum to 100 to sum to 100.

This is an aggressive treatment, and the rows being forced into
submission need to be considered and reflected upon from time to time.

In [None]:
# Only the voting-intention column groups are passed on for forced
# normalisation; the attitudinal groups are left out.
forced_checkable = {VOTING_INTENTION: checkable_100[VOTING_INTENTION]}

data = dc.normalise(data, forced_checkable)

## Final data validation

Please check any rows identified as a result of this step.

In [None]:
# Re-run the row-sum check on every data set and show any rows it returns.
for label, check_list in checkable_100.items():
    row_check = dc.row_sum_check(data[label], check_list, tolerance=1.01)
    has_flagged_rows = row_check is not None and len(row_check) > 0
    if has_flagged_rows:
        print(label, check_list)
        display(row_check)
        print("\n")
    else:
        print(f"{label} {check_list} looks good.\n")

## Manage methodology changes

If a polling firm substantially changes the way in which it collects data, we need to reflect this in the branding for the poll.

In [15]:
# From the last poll in October 2023, Essential added education
# into its weighting.
# NOTE(review): presumably dc.methodology relabels the brand from
# effective_date onwards - confirm in data_capture.

change_from, change_to = "Essential", "Essential 2"
effective_date = pd.Timestamp("2023-10-24")
data = dc.methodology(data, effective_date, change_from, change_to)

In [16]:
# Resolve Strategic's approach appears to have changed in 2024, so the
# change is dated from the start of that year.

change_from, change_to = "Resolve Strategic", "Resolve Strategic 2"
effective_date = pd.Timestamp("2024-01-01")
data = dc.methodology(data, effective_date, change_from, change_to)

## Save the checked data

In [17]:
# Hand the checked tables to data_capture for storage
# (where and how they are stored is defined in data_capture, not here).
dc.store(data)

## All done

In [None]:
# IPython magics (not plain Python): load the watermark extension and
# report the Python, machine, conda and imported-package versions.
%load_ext watermark
%watermark --python --machine --conda --iversions --watermark

In [None]:
# Printed only if every cell above ran through without raising.
print("Finished")