# Capture historical polling data from Wikipedia

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Python-set-up" data-toc-modified-id="Python-set-up-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Python set-up</a></span></li><li><span><a href="#Raw-data-capture" data-toc-modified-id="Raw-data-capture-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Raw data capture</a></span></li><li><span><a href="#Data-cleaning" data-toc-modified-id="Data-cleaning-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Data cleaning</a></span></li><li><span><a href="#Compile-a-table-of-polls-immediately-prior-to-an-election" data-toc-modified-id="Compile-a-table-of-polls-immediately-prior-to-an-election-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compile a table of polls immediately prior to an election</a></span></li></ul></div>

## Python set-up

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# local imports
import common

## Raw data capture

In [2]:
# Wikipedia pages holding published opinion polls, keyed by election date
# (YYYY-MM-DD). Each value is a two-item list: [page URL, table number].
links = {
    "2019-05-18": [
        "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2019_Australian_federal_election",
        1,
    ],
    "2016-07-02": [
        "https://en.wikipedia.org/wiki/National_opinion_polling_for_the_2016_Australian_federal_election",
        2,
    ],
    "2013-09-07": [
        "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2013_Australian_federal_election",
        1,
    ],

    # Deliberately disabled: this data does not include the polling firm.
    # "2010-08-21": [
    #     "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2010_Australian_federal_election",
    #     0,
    # ],

    # Earlier polling data does not appear to be available on Wikipedia.
}

In [3]:
def capture_raw_tables(links):
    """Build a dict of raw tables, one per entry in ``links``.

    Parameters
    ----------
    links : dict
        Maps an election date to a ``(url, table_number)`` pair (any
        two-item iterable, e.g. a list).

    Returns
    -------
    dict
        Same keys as ``links``, in the same order. Each value is the result
        of ``common.get_table_from_text(table_number, text)``, where ``text``
        is what ``common.get_url_text(url)`` returned for that entry.
    """
    return {
        when: common.get_table_from_text(table_no, common.get_url_text(page))
        for when, (page, table_no) in links.items()
    }

# Build the raw table for every election listed in `links`.
# NOTE(review): presumably one web request per page via common.get_url_text — confirm.
raw_tables = capture_raw_tables(links)

In [4]:
# Inspection aid, switched off by default: change False to True to print each
# election date followed by its raw table.
if False:
    for elect_date in raw_tables:
        table = raw_tables[elect_date]
        print(elect_date)
        display(table)

## Data cleaning

In [5]:
# Pass every raw table through common.clean, keeping the election-date keys.
clean_tables = {}
for election_date in raw_tables:
    table = raw_tables[election_date]
    clean_tables[election_date] = common.clean(table)


## Compile a table of polls immediately prior to an election