# Exploratory: from an aggregated 2pp polling estimate to a 2pp election result.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Python-setup" data-toc-modified-id="Python-setup-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Python setup</a></span></li><li><span><a href="#Election-2pp-results-data" data-toc-modified-id="Election-2pp-results-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Election 2pp results data</a></span></li><li><span><a href="#Historic-polling-data" data-toc-modified-id="Historic-polling-data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Historic polling data</a></span></li></ul></div>

## Python setup

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO

## Election 2pp results data

In [2]:
# Source: https://www.aec.gov.au/elections/federal_elections/tpp-results.htm
# Whitespace-separated table of two-party-preferred (2pp) results: one row per
# election, one column per state/territory plus the national figure (AUST).
# - Election dates are written day.month.two-digit-year (e.g. 10.12.49).
# - 'NA' marks cells with no figure (ACT/NT in the earliest rows).
# NOTE(review): presumably these are the Coalition's 2pp percentages - confirm
# against the AEC page before interpreting the sign of any polling error.
e_results = """
Election 	NSW 	VIC 	QLD 	WA  	SA  	TAS 	ACT 	NT  	AUST
10.12.49 	49.2 	50.3 	57.3 	53.0 	49.4 	53.5 	NA  	NA  	51.0
28.4.51 	49.5 	49.3 	56.5 	54.4 	48.2 	52.6 	NA  	NA  	50.7
29.5.54 	47.0 	49.3 	55.2 	51.9 	47.3 	50.0 	NA  	NA  	49.3
10.12.55 	50.5 	59.2 	56.1 	56.5 	51.0 	52.5 	NA  	NA  	54.2
22.11.58 	50.2 	57.7 	58.6 	59.2 	50.2 	50.2 	NA  	NA  	54.1
9.12.61 	45.2 	55.5 	49.3 	56.5 	45.8 	44.0 	NA  	NA  	49.5
30.11.63 	50.7 	57.5 	51.9 	56.2 	46.7 	45.5 	NA  	NA  	52.6
26.11.66 	56.1 	60.7 	55.7 	54.3 	57.6 	46.1 	44.2 	NA  	56.9
25.10.69 	48.4 	55.1 	50.1 	47.4 	45.8 	43.9 	28.4 	59.2 	49.8
2.12.72  	44.6 	49.6 	50.5 	51.7 	47.3 	39.5 	32.0 	54.5 	47.3
18.5.74  	45.1 	49.4 	54.6 	51.5 	47.5 	44.6 	40.3 	52.7 	48.3
13.12.75 	53.2 	56.2 	60.2 	58.8 	55.1 	55.9 	50.7 	54.9 	55.7
10.12.77 	52.4 	55.5 	58.0 	60.1 	51.3 	56.2 	45.7 	52.5 	54.6
18.10.80 	49.6 	49.3 	53.1 	53.3 	50.4 	52.9 	41.4 	51.2 	50.4
5.3.83  	45.91 	45.50 	50.54 	45.00 	47.70 	56.42 	34.51 	48.13 	46.77
1.12.84  	47.09 	46.91 	52.29 	49.14 	48.33 	53.37 	38.04 	51.39 	48.23
11.7.87  	49.71 	47.70 	50.68 	49.09 	49.76 	53.10 	36.75 	47.78 	49.17
24.3.90  	47.87 	52.54 	49.81 	52.87 	50.50 	52.10 	41.46 	44.98 	50.10
13.3.93  	45.62 	48.20 	51.57 	53.98 	52.67 	45.35 	38.81 	44.69 	48.56
2.3.96  	52.56 	49.70 	60.22 	56.00 	57.26 	48.42 	44.54 	50.37 	53.63
3.10.98  	48.46 	46.47 	53.05 	50.54 	53.11 	42.68 	37.56 	49.43 	49.02
10.11.01 	51.66 	47.86 	54.86 	51.62 	54.08 	42.27 	38.92 	47.51 	50.95
09.10.04 	51.93 	51.00 	57.09 	55.40 	54.36 	45.81 	38.46 	47.85 	52.74
24.11.07  	46.32 	45.73 	49.56 	53.26 	47.60 	43.79 	36.60 	44.59 	47.30
21.08.10 	51.16 	44.69 	55.14 	56.41 	46.82 	39.38 	38.33 	49.26 	49.88
7.9.13  	54.35 	49.80 	56.98 	58.28 	52.36 	48.77 	40.09 	50.35 	53.49
02.07.16 	50.53 	48.17 	54.10 	54.66 	47.73 	42.64 	38.87 	42.94 	50.36
18.05.19 	51.78 	46.86 	58.44 	55.55 	49.29 	44.04 	38.39 	45.80 	51.53
"""

In [22]:
# Get the above text data into a pandas DataFrame.
# The separator is a regular expression, so it is written as a raw string:
# in a plain string '\s' is an invalid escape sequence and triggers a
# warning on recent Python versions.
# 'NA' cells are converted to NaN by read_csv's default missing-value markers.
e_results_df = pd.read_csv(StringIO(e_results), sep=r'\s+',
                           parse_dates=['Election'], dayfirst=True)

In [23]:
# Two-digit years are ambiguous, so the parser places the early elections
# (e.g. '49') in the 21st century. An election result can never be dated in
# the future, so every such date is moved back exactly one century.
#
# pd.to_datetime() keeps this cell safe to re-run: after the first run the
# column holds datetime.date objects, which do not support the .dt accessor.
election_dates = pd.to_datetime(e_results_df['Election'], dayfirst=True)
in_future = election_dates > pd.Timestamp.now()
e_results_df['Election'] = election_dates.mask(
    in_future,
    # DateOffset(years=...) is calendar-aware, unlike a Timedelta of
    # 100 * 365.25 days, which only approximates a century.
    election_dates - pd.DateOffset(years=100),
).dt.date

e_results_df

Unnamed: 0,Election,NSW,VIC,QLD,WA,SA,TAS,ACT,NT,AUST
0,1949-12-10,49.2,50.3,57.3,53.0,49.4,53.5,,,51.0
1,1951-04-28,49.5,49.3,56.5,54.4,48.2,52.6,,,50.7
2,1954-05-29,47.0,49.3,55.2,51.9,47.3,50.0,,,49.3
3,1955-12-10,50.5,59.2,56.1,56.5,51.0,52.5,,,54.2
4,1958-11-22,50.2,57.7,58.6,59.2,50.2,50.2,,,54.1
5,1961-12-09,45.2,55.5,49.3,56.5,45.8,44.0,,,49.5
6,1963-11-30,50.7,57.5,51.9,56.2,46.7,45.5,,,52.6
7,1966-11-26,56.1,60.7,55.7,54.3,57.6,46.1,44.2,,56.9
8,1969-10-25,48.4,55.1,50.1,47.4,45.8,43.9,28.4,59.2,49.8
9,1972-12-02,44.6,49.6,50.5,51.7,47.3,39.5,32.0,54.5,47.3


## Historic polling data

In [24]:
# Aggregated Australian polling data, provided by @EthanOfHouseK on twitter.
# The workbook is read from a path relative to the notebook's working directory.
polling_data = pd.read_excel(
    io='../data/Australian Federal Polling Database.xlsx',
)

In [33]:
# List every column in the polling workbook to see which 2pp measures it offers
polling_data.columns

Index(['StartDate', 'EndDate', 'ElectionYear', 'FieldDays', 'DaysToElection',
       'DaysFromLastElection', 'Incumbent', 'Pollster', 'SamplingMethod',
       'SampleSize', 'ALP', 'LNC', 'Liberals', 'Nationals', 'DLP', 'Democrats',
       'Greens', 'PHON', 'Others', 'Undecided', 'Published2pp', 'RespAlloc2pp',
       'Raw2pp', 'Est2pp', '2pp', 'RoundedEst2pp', 'PrimMargin', 'Result2pp',
       'DevALP', 'DevLNC', 'DevLib', 'DevNat', 'DevDLP', 'DevDem', 'DevGrn',
       'DevPHON', 'DevOth', 'DevPub2pp', 'DevEst2pp', 'Dev2pp',
       'DevPastInc2pp', 'DevInc2pp', 'AbsDevALP', 'AbsDevLNC', 'AbsDevLib',
       'AbsDevNat', 'AbsDevDLP', 'AbsDevDem', 'AbsDevGrn', 'AbsDevPHON',
       'AbsDevOth', 'AbsDevPub2pp', 'AbsDevEst2pp', 'AbsDev2pp'],
      dtype='object')

In [38]:
# Locate each poll in the time series at the mid-point of its collection
# period: start date plus half of the (end - start) span, keeping only the
# calendar date.
field_span = polling_data['EndDate'] - polling_data['StartDate']
collection_midpoint = polling_data['StartDate'] + field_span / 2
polling_data['MeanDate'] = collection_midpoint.dt.date

In [41]:
# Let's look at the 2pp population estimates in this data.
# There seem to be a lot of them ...
tpp = ['MeanDate', 'Pollster', 'Published2pp', 'RespAlloc2pp',
       'Raw2pp', 'Est2pp', '2pp', 'RoundedEst2pp']
# A fixed random_state makes the sampled rows identical on every
# Restart & Run All, so the saved output stays reproducible.
polling_data[tpp].sample(20, random_state=42).sort_index()

Unnamed: 0,MeanDate,Pollster,Published2pp,RespAlloc2pp,Raw2pp,Est2pp,2pp,RoundedEst2pp
71,1963-02-14,MorganGallup,,,,48.7,48.5,48.5
133,1973-09-05,MorganGallup,,,,53.4,53.5,53.5
206,1976-11-23,MorganGallup,,,,54.8,55.0,55.0
220,1977-06-21,MorganGallup,,,,53.7,53.5,53.5
245,1978-06-13,MorganGallup,,,,55.3,55.5,55.5
359,1983-02-19,MorganGallup,,,,45.2,45.0,45.0
425,1985-04-09,MorganGallup,,,,48.9,49.0,49.0
495,1987-03-28,Newspoll,,,,46.0,46.0,46.0
1083,1997-01-12,MorganGallup,57.0,,,57.3,57.0,57.5
1184,1998-07-11,Newspoll,,,,47.6,47.5,47.5


In [42]:
# Last 20 rows of the same 2pp columns, shown in index order
polling_data.loc[:, tpp].iloc[-20:].sort_index()

Unnamed: 0,MeanDate,Pollster,Published2pp,RespAlloc2pp,Raw2pp,Est2pp,2pp,RoundedEst2pp
3189,2021-05-14,Resolve,,,,48.4,48.5,48.5
3190,2021-05-21,Essential,,,44.0,48.0,48.5,48.0
3191,2021-06-03,YouGovNewspoll,50.0,,,49.4,50.0,49.5
3192,2021-06-02,Morgan,49.0,,,48.9,49.0,49.0
3193,2021-06-04,Essential,,,44.0,48.8,47.0,49.0
3194,2021-06-10,Resolve,,,,49.2,49.0,49.0
3195,2021-06-16,Morgan,49.5,,,50.1,49.5,50.0
3196,2021-06-18,Essential,,,45.0,48.2,48.5,48.0
3197,2021-06-24,YouGovNewspoll,49.0,,,48.9,49.0,49.0
3198,2021-07-02,Essential,,,44.0,48.3,49.0,48.5
