# Polling data capture from Wikipedia

**Make sure to:**
 * run before doing any analysis; and
 * check the data validation before moving on to the analysis.

## Python setup

In [1]:
# analytic imports
import pandas as pd

In [2]:
# local imports
from common import VOTING_INTENTION, ATTITUDINAL
import data_capture

## Get raw polling data from Wikipedia

Note: web-scraping of data is fragile. 
This code will need to be checked from time to time.

### Get all tables from the Wikipedia web page

In [3]:
# Address of the Wikipedia article that holds the polling tables.
URL = "/".join(
    (
        "https://en.wikipedia.org/wiki",
        "Opinion_polling_for_the_next_Australian_federal_election",
    )
)

# Fetch every table found on that page; later cells pick tables out of
# this list by position, so report the count as a quick sanity check.
df_list = data_capture.get_table_list(URL)
print("Total number of tables on page:", len(df_list))

Total number of tables on page: 26


In [4]:
# KEEP CELL - diagnostic listing of the column index of every scraped table.
# Handy for working out which positions in df_list hold the tables of
# interest; switch the guard below to True to produce the listing.
if False:  # True to print
    for position, frame in enumerate(df_list):
        print(f"{position}: {frame.columns}\n")

### Data select, merge and clean

Note: For this election cycle, Wikipedia has separate tables for each
calendar year. The table numbers below will need to be updated each year.

In [5]:
# Wikipedia table numbers for each kind of poll (one table per calendar
# year). These numbers need revisiting every year as new tables are added.
VOTING_TABLES: tuple[int, ...] = (0, 1)
ATTITUDINAL_TABLES: tuple[int, ...] = (2, 3)

In [6]:
# Map each data-set label to the table numbers that make it up.
prep = {
    VOTING_INTENTION: VOTING_TABLES,
    ATTITUDINAL: ATTITUDINAL_TABLES,
}

# For every label: combine its tables, clean a copy of the combined
# result, keep it in `data`, and report the row count and index.
data = {}
for label, table_list in prep.items():
    combined = data_capture.get_combined_table(df_list, table_list)
    data[label] = data_capture.clean(combined.copy())
    print(f"{label}: {len(data[label])} rows {data[label].index}")

voting-intention: 66 rows RangeIndex(start=0, stop=66, step=1)
attitudinal: 54 rows RangeIndex(start=0, stop=54, step=1)


In [7]:
# Eyeball the N rows at the bottom of each captured table.
N = 3
for name in data:
    print(f"{name}:")
    display(data[name].tail(N))
    print()

voting-intention:


Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,Primary vote UND,2pp vote ALP,2pp vote L/NP,Mean Date
63,6–10 October 2023,YouGov,Online,1519.0,36.0,33.0,14.0,6.0,,11.0,6.0,53.0,47.0,2023-10-08
64,4–12 October 2023,Newspoll,Online,2638.0,35.0,36.0,12.0,6.0,,11.0,,54.0,46.0,2023-10-08
65,16-22 October 2023,Roy Morgan,—,1383.0,36.0,32.0,14.0,4.5,,13.5,,49.5,50.5,2023-10-19



attitudinal:


Unnamed: 0,Date,Firm,Interview mode,Sample,Preferred Prime Minister Albanese,Preferred Prime Minister Dutton,Preferred Prime Minister Don't Know,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,Mean Date
51,3–6 October 2023,Newspoll,Online,1225.0,50.0,33.0,17.0,45.0,46.0,9.0,-1.0,37.0,50.0,13.0,-13.0,2023-10-04
52,4–12 October 2023,Newspoll,Online,2638.0,51.0,31.0,18.0,46.0,46.0,8.0,0.0,35.0,53.0,12.0,-18.0,2023-10-08
53,11–14 October 2023,Essential,Online,1125.0,,,,46.0,43.0,11.0,3.0,36.0,43.0,21.0,-7.0,2023-10-12





### Distribute undecideds if the pollster has not

Mostly affects the Essential poll.

In [8]:
# Name of the undecided-voters column in the voting-intention table.
UND_COLUMN = "Primary vote UND"

if UND_COLUMN not in data[VOTING_INTENTION].columns:
    # No undecided column present, so there is nothing to redistribute.
    print("CHECK: this step was not applied")
    print("Most likely because it has already been applied.")
else:
    # Hand a copy to the helper, then discard the undecided column from
    # whatever it returns and store that as the voting-intention table.
    data[VOTING_INTENTION] = data_capture.distribute_undecideds(
        table=data[VOTING_INTENTION].copy(),
        undec_col=UND_COLUMN,
        col_pattern_list=["Primary", "2pp"],
    ).drop(columns=UND_COLUMN)

### Data validation

Please check any rows identified as a result of this step.

In [9]:
# Column-name patterns, per data set, for groups of columns that are
# expected to add across to 100 in every row.
checkables = {
    VOTING_INTENTION: ["Primary", "2pp"],
    ATTITUDINAL: [
        r"^Dutton (Satisfied|Dissatisfied|Don't Know)",
        r"^Albanese (Satisfied|Dissatisfied|Don't Know)",
        r"Preferred Prime Minister (Dutton|Albanese|Don't Know)",
    ],
}

# Show the result of the addition check for each data set; any rows
# displayed should be reviewed by hand before the data is saved.
for label in checkables:
    flagged = data_capture.row_addition_check(data[label], checkables[label])
    display(flagged)

Unnamed: 0,Date,Brand,Interview mode,Sample size,Primary vote L/NP,Primary vote ALP,Primary vote GRN,Primary vote ONP,Primary vote UAP,Primary vote OTH,2pp vote ALP,2pp vote L/NP,Mean Date,Primary
11,30 November–4 December 2022,Resolve Strategic,Online,1611.0,30.0,42.0,11.0,4.0,2.0,8.0,60.0,40.0,2022-12-02,97.0
12,7–11 December 2022,Essential,Online,1042.0,31.442308,36.682692,13.625,6.288462,3.144231,17.817308,53.684211,46.315789,2022-12-09,109.0
18,1–6 February 2023,Essential,Online,1000.0,31.470588,34.617647,17.833333,6.294118,1.04902,15.735294,57.894737,42.105263,2023-02-03,107.0
20,15–19 February 2023,Resolve Strategic,Online,1604.0,31.0,40.0,10.0,5.0,1.0,11.0,57.9,42.1,2023-02-17,98.0
21,15–19 February 2023,Essential,Online,1044.0,32.553191,35.808511,15.191489,6.510638,3.255319,8.680851,55.387097,45.612903,2023-02-17,102.0
25,1–5 March 2023,Essential,Online,1141.0,34.357895,34.357895,12.884211,7.515789,2.147368,10.736842,52.688172,47.311828,2023-03-03,102.0
43,21–25 June 2023,Essential,Online,1148.0,31.875,34.0,14.875,7.4375,2.125,11.6875,55.319149,44.680851,2023-06-23,102.0
46,12–15 July 2023,Resolve Strategic,Online,1610.0,30.0,39.0,11.0,6.0,1.0,11.0,59.0,41.0,2023-07-13,98.0


Unnamed: 0,Date,Firm,Interview mode,Sample,Preferred Prime Minister Albanese,Preferred Prime Minister Dutton,Preferred Prime Minister Don't Know,Albanese Satisfied,Albanese Dissatisfied,Albanese Don't Know,Albanese Net,Dutton Satisfied,Dutton Dissatisfied,Dutton Don't Know,Dutton Net,Mean Date,^Dutton (Satisfied|Dissatisfied|Don't Know),^Albanese (Satisfied|Dissatisfied|Don't Know)
15,16–22 November 2022,Morning Consult,Online,,,,,56.0,31.0,25.0,25.0,,,,,2022-11-19,,112.0
28,29 March – 1 April 2023,Newspoll,Online,1500.0,58.0,26.0,16.0,56.0,35.0,9.0,21.0,35.0,48.0,21.0,-13.0,2023-03-30,104.0,


## Save the checked data

In [10]:
# Hand the dict of captured tables (keyed by the VOTING_INTENTION and
# ATTITUDINAL labels) to the project's store helper. The destination and
# file format are decided inside data_capture.store and are not visible here.
data_capture.store(data)

## All done

In [11]:
# Load the watermark extension, then stamp the notebook with the time of
# this run plus the Python, IPython, imported-package and watermark
# versions, so the environment used for the capture is on record.
%load_ext watermark
%watermark -u -n -t -v -iv -w

Last updated: Fri Oct 27 2023 10:25:08

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.16.1

pandas: 2.1.1

Watermark: 2.4.3



In [12]:
# Emit a completion marker as the final cell's output.
print("Finished")

Finished
