# Polling data capture from Wikipedia

**Make sure to:**
 * run before doing any analysis; and
 * check the data validation before moving on to the analysis.

## Python setup

In [1]:
# analytic imports
import pandas as pd

In [2]:
# local imports
import data_capture as dc
from common import ATTITUDINAL, MIDDLE_DATE, VOTING_INTENTION

## Get raw polling data from Wikipedia

Note: web-scraping of data is fragile. 
This code will need to be checked from time to time.

### Get all tables from the Wikipedia web page

In [3]:
# Wikipedia page that hosts the polling tables for this election cycle.
URL = (
    "https://en.wikipedia.org/wiki/"
    + "Opinion_polling_for_the_next_Australian_federal_election"
)

# Fetch the page's tables via the local data_capture helper, then report
# how many came back as a quick sanity check on the scrape.
df_list = dc.get_table_list(URL)
print(f"Total number of tables on page: {len(df_list)}")

Total number of tables on page: 26


In [4]:
# KEEP CELL - quick overview of all the tables at Wiki ...
# Switch the condition to True to list every table's position in df_list
# together with its column labels.
if False:  # True to print
    for position, frame in enumerate(df_list):
        print(f"{position}: {frame.columns}\n")

### Data select, merge and clean

Note: For this election cycle, Wikipedia has separate tables for each
calendar year. These tables will need to be updated below each year.

In [5]:
# Table numbers handed (with df_list) to dc.get_combined_table below.
# Wikipedia keeps one table per calendar year, so these numbers
# will need updating each year ...
VOTING_TABLES: tuple[int, ...] = (0, 1)
ATTITUDINAL_TABLES: tuple[int, ...] = (2, 3)

In [6]:
# Dataset label -> the Wikipedia table numbers that feed that dataset.
prep = {VOTING_INTENTION: VOTING_TABLES, ATTITUDINAL: ATTITUDINAL_TABLES}

# For each dataset: combine the selected tables, clean a copy of the
# result, keep it under its label and report its size and index.
data = {}
for label, table_numbers in prep.items():
    table = dc.clean(dc.get_combined_table(df_list, table_numbers).copy())
    data[label] = table
    print(f"{label}: {len(table)} rows {table.index}")

voting-intention: 74 rows RangeIndex(start=0, stop=74, step=1)
attitudinal: 56 rows RangeIndex(start=0, stop=56, step=1)


### Quick look at most recent N polls

In [7]:
# How many of the most recent polls to show for each dataset.
N = 3
for label in data:
    table = data[label]
    print(f"{label}:")
    display(table.tail(N))
    print()

voting-intention:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date
71,8–12 November 2023,Essential,Online,1150.0,34.0,32.0,12.0,7.0,2.0,8.0,5.0,49.0,47.0,2023-11-08,2023-11-10,2023-11-12
72,10–14 November 2023,YouGov,Online,1582.0,36.0,31.0,13.0,7.0,,13.0,,51.0,49.0,2023-11-10,2023-11-12,2023-11-14
73,13–19 November 2023,Roy Morgan,—,1401.0,37.5,29.5,13.5,6.5,,13.0,,49.5,50.5,2023-11-13,2023-11-16,2023-11-19



attitudinal:


Unnamed: 0,Date,Firm,Interview mode,Sample,Preferred Prime Minister Albanese,Preferred Prime Minister Dutton,Preferred Prime Minister Don't Know,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,First Date,Mean Date,Last Date
53,11–14 October 2023,Essential,Online,1125.0,,,,46.0,43.0,11.0,3.0,36.0,43.0,21.0,-7.0,2023-10-11,2023-10-12,2023-10-14
54,30 October–3 November 2023,Newspoll,Online,1220.0,46.0,36.0,18.0,42.0,52.0,6.0,-10.0,37.0,50.0,13.0,-13.0,2023-10-30,2023-11-01,2023-11-03
55,1–5 November 2023,Resolve Strategic,Online,1602.0,40.0,27.0,33.0,39.0,46.0,15.0,-7.0,36.0,40.0,25.0,-4.0,2023-11-01,2023-11-03,2023-11-05





## Preliminary data validation

Note: Essential often does not distribute undecideds to the 2pp vote share.

In [8]:
# Groups of columns whose values should add across a row to 100.
# Every dataset label maps to a list of regex patterns; the same mapping
# is reused by the validation and normalisation steps further down.

voting_patterns: list[str] = [
    r"Primary",
    r"2pp",
]
attitudinal_patterns: list[str] = [
    r"^Dutton (Satisfied|Dissatisfied|Don't Know)",
    r"^Albanese (Satisfied|Dissatisfied|Don't Know)",
    r"Preferred Prime Minister (Dutton|Albanese|Don't Know)",
]

checkable_100: dict[str, list[str]] = {
    # label: [list of regex-patterns],
    VOTING_INTENTION: voting_patterns,
    ATTITUDINAL: attitudinal_patterns,
}

In [9]:
# Optional early look at the column groups that should add across to 100.
# Left switched off because it is not always useful at this point;
# change the condition to True to display the check for each dataset.

if False:
    for label in checkable_100:
        display(dc.row_sum_check(data[label], checkable_100[label]))

## Distribute undecideds if the pollster has not

Mostly affects the Essential poll.

In [10]:
# The undecided column is dropped once the step has run, so its absence
# means there is nothing left to distribute (typically a re-run).
if dc.UNDECIDED_COLUMN not in data[VOTING_INTENTION]:
    print("CHECK: this step was not applied")
    print("Most likely because it has already been applied.")
else:
    # Work on a copy, then remove the undecided column from the result.
    revised = dc.distribute_undecideds(
        table=data[VOTING_INTENTION].copy(),
        col_pattern_list=["Primary vote", "2pp vote"],
    ).drop(columns=dc.UNDECIDED_COLUMN)
    data[VOTING_INTENTION] = revised

For Primary vote distributed undecideds over 33.78% of rows.
For 2pp vote distributed undecideds over 33.78% of rows.


## Forced data normalisation

Force columns that should sum to 100 to sum to 100.

This is an aggressive treatment, and the rows being forced into
submission need to be considered and reflected upon from time to time.

In [11]:
# Only the voting-intention column groups are forced to sum to 100;
# every other entry of checkable_100 is filtered out here.
forced_checkable = {
    label: patterns
    for label, patterns in checkable_100.items()
    if label == VOTING_INTENTION
}

data = dc.normalise(data, forced_checkable)

For voting-intention; Pattern: Primary -> Selected columns: ['Primary vote L/NP', 'Primary vote ALP', 'Primary vote GRN', 'Primary vote ONP', 'Primary vote UAP', 'Primary vote OTH']
28.38% of rows need normalisation.


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date,Normalisation totals Primary
0,14–17 June 2022,Dynata,Online,1001.0,34.032609,37.326087,13.173913,4.391304,4.391304,7.684783,56.898,52.102,2022-06-14,2022-06-15,2022-06-17,101.0
4,31 August–3 September 2022,Newspoll-YouGov,Online,1505.0,31.0,37.0,13.5,7.0,2.0,10.0,57.0,43.0,2022-08-31,2022-09-01,2022-09-03,100.5
10,30 November–3 December 2022,Newspoll,Online,1508.0,35.0,39.0,11.0,6.0,1.0,9.0,55.0,45.0,2022-11-30,2022-12-01,2022-12-03,101.0
11,30 November–4 December 2022,Resolve Strategic,Online,1611.0,30.0,42.0,11.0,4.0,2.0,8.0,60.0,40.0,2022-11-30,2022-12-02,2022-12-04,97.0
12,7–11 December 2022,Essential,Online,1042.0,31.442308,36.682692,13.625,6.288462,3.144231,17.817308,53.684211,46.315789,2022-12-07,2022-12-09,2022-12-11,109.0
14,17–22 January 2023,Resolve Strategic,Online,1606.0,29.0,42.0,11.0,6.0,2.0,11.0,60.0,40.0,2023-01-17,2023-01-19,2023-01-22,101.0
18,1–6 February 2023,Essential,Online,1000.0,31.470588,34.617647,17.833333,6.294118,1.04902,15.735294,57.894737,42.105263,2023-02-01,2023-02-03,2023-02-06,107.0
20,15–19 February 2023,Essential,Online,1044.0,32.553191,35.808511,15.191489,6.510638,3.255319,8.680851,55.387097,45.612903,2023-02-15,2023-02-17,2023-02-19,102.0
21,15–19 February 2023,Resolve Strategic,Online,1604.0,31.0,40.0,10.0,5.0,1.0,11.0,57.9,42.1,2023-02-15,2023-02-17,2023-02-19,98.0
25,1–5 March 2023,Essential,Online,1141.0,34.357895,34.357895,12.884211,7.515789,2.147368,10.736842,52.688172,47.311828,2023-03-01,2023-03-03,2023-03-05,102.0


For voting-intention; Pattern: 2pp -> Selected columns: ['2pp vote ALP', '2pp vote L/NP']
10.81% of rows need normalisation.


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,2pp vote ALP,2pp vote L/NP,First Date,Mean Date,Last Date,Normalisation totals 2pp
0,14–17 June 2022,Dynata,Online,1001.0,33.695652,36.956522,13.043478,4.347826,4.347826,7.608696,56.898,52.102,2022-06-14,2022-06-15,2022-06-17,109.0
20,15–19 February 2023,Essential,Online,1044.0,31.914894,35.106383,14.893617,6.382979,3.191489,8.510638,55.387097,45.612903,2023-02-15,2023-02-17,2023-02-19,101.0
31,12–16 April 2023,Essential,Online,1136.0,31.958763,35.051546,14.43299,6.185567,3.092784,9.278351,54.189474,44.810526,2023-04-12,2023-04-14,2023-04-16,99.0
33,26–30 April 2023,Essential,Online,1130.0,34.042553,35.106383,14.893617,5.319149,2.12766,8.510638,55.819149,43.180851,2023-04-26,2023-04-28,2023-04-30,99.0
34,10–13 May 2023,Resolve Strategic,Online,1610.0,30.30303,42.424242,12.121212,5.050505,2.020202,8.080808,62.22,39.78,2023-05-10,2023-05-11,2023-05-13,102.0
41,7–11 June 2023,Essential,Online,1123.0,33.684211,33.684211,16.842105,5.263158,1.052632,9.473684,54.765957,44.234043,2023-06-07,2023-06-09,2023-06-11,99.0
47,19–23 July 2023,Essential,Online,1150.0,34.042553,32.978723,14.893617,7.446809,1.06383,9.574468,53.157895,47.842105,2023-07-19,2023-07-21,2023-07-23,101.0
71,8–12 November 2023,Essential,Online,1150.0,35.789474,33.684211,12.631579,7.368421,2.105263,8.421053,51.552083,49.447917,2023-11-08,2023-11-10,2023-11-12,101.0


## Final data validation

Please check any rows identified as a result of this step.

In [12]:
# Re-run the row-sum check for every dataset and show any rows it returns;
# a None or empty result is reported as passing.
for label, check_list in checkable_100.items():
    row_check = dc.row_sum_check(data[label], check_list, tolerance=1.01)
    if row_check is not None and len(row_check):
        print(label, check_list)
        display(row_check)
        print("\n")
    else:
        print(f"{label} {check_list} looks good.\n")

voting-intention ['Primary', '2pp'] looks good.

attitudinal ["^Dutton (Satisfied|Dissatisfied|Don't Know)", "^Albanese (Satisfied|Dissatisfied|Don't Know)", "Preferred Prime Minister (Dutton|Albanese|Don't Know)"]


Unnamed: 0,Date,Firm,Interview mode,Sample,Preferred Prime Minister Albanese,Preferred Prime Minister Dutton,Preferred Prime Minister Don't Know,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,...,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,First Date,Mean Date,Last Date,^Dutton (Satisfied|Dissatisfied|Don't Know),^Albanese (Satisfied|Dissatisfied|Don't Know),Preferred Prime Minister (Dutton|Albanese|Don't Know)
15,16–22 November 2022,Morning Consult,Online,,,,,56.0,31.0,25.0,...,,,,,2022-11-16,2022-11-19,2022-11-22,0.0,112.0,0.0
28,29 March – 1 April 2023,Newspoll,Online,1500.0,58.0,26.0,16.0,56.0,35.0,9.0,...,35.0,48.0,21.0,-13.0,2023-03-29,2023-03-30,2023-04-01,104.0,,






## Manage methodology changes

If a polling firm substantially changes the way in which it collects data, we need to reflect this in the branding for the poll.

In [13]:
# From its last poll in October 2023, Essential added education to its
# weighting. The brand names and the effective date are handed to
# dc.methodology, which presumably re-labels the affected polls
# (confirm against data_capture).

change_from, change_to = "Essential", "Essential2"
effective_date = pd.Timestamp("2023-10-24")
data = dc.methodology(data, effective_date, change_from, change_to)

## Save the checked data

In [14]:
# Hand the checked tables to the local data_capture module for saving,
# ready for the analysis that follows this notebook. No path is passed
# here, so the storage location and format are decided inside dc.store.
dc.store(data)

## All done

In [15]:
# Record when the notebook was last run, plus the Python, IPython and
# imported-package versions (see the output below), so the captured
# data can be traced back to a known environment.
%load_ext watermark
%watermark -u -n -t -v -iv -w

Last updated: Wed Nov 22 2023 02:20:40

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.17.2

pandas: 2.1.3

Watermark: 2.4.3



In [16]:
# End-of-run marker: when the notebook is run top to bottom, seeing this
# output means every preceding cell completed.
print("Finished")

Finished
