# Calculate seat two-party-preferred (TPP) vote shares following redistribution

The AEC provides two sets of TPP data:
1. TPP votes that can be attributed to individual polling places; and
2. TPP votes that can only be attributed to an electorate as a whole (e.g. postal votes).

The idea here is to map every polling place to its new (post-redistribution) electorate and,
in the process, up-weight the polling-place votes so that the electorate-wide votes are also attributed.

## Python set-up

In [1]:
import shapely
import shapefile
import pandas as pd

pd.set_option('display.max_rows', 500)

## Data acquisition 

### Shapefiles for new divisions (post redistribution)

In [2]:
def get_shapefiles() -> tuple[dict[str, list[str]], dict[str, list[shapely.geometry.Polygon]]]:
    """Interrogate the shapefiles for the newly created divisions.

    Returns:
        A tuple of two dictionaries:
        - state abbreviation -> list of seat names read from that state's shapefile
        - seat name -> list of polygons, one polygon per shapefile part

    Raises:
        AssertionError: if a part is not a closed ring (first point != last point).
    """

    redists = {
        "NSW": r"../redistributions/input-data-2022/NSW-october-2024-ESRI.zip",
        "VIC": r"../redistributions/input-data-2022/VIC-october-2024-ESRI.zip",
        "WA": r"../redistributions/input-data-2022/WA-september-2024-ESRI.zip",
        # NT - redistribution process incomplete
    }

    new_seats: dict[str, list[str]] = {}  # state -> list of seat names
    polygons_for_seat: dict[str, list[shapely.geometry.Polygon]] = {}  # seat name -> list of polygons
    for state, filename in redists.items():
        seats_in_state: list[str] = []
        # the context manager closes the reader's files once the state has been read
        with shapefile.Reader(filename) as reader:
            for rec in reader.iterShapeRecords():
                new_seat_name = rec.record["Elect_div"]
                seats_in_state.append(new_seat_name)

                geometry = rec.shape
                # geometry.parts holds the start offset of each part within
                # geometry.points; a part runs up to the next start offset,
                # or to the end of the points for the final part.
                starts = list(geometry.parts)
                ends = starts[1:] + [len(geometry.points)]

                # NOTE(review): every part becomes its own polygon, so an interior
                # ring (hole), if any exist in these files, would be treated as
                # a filled area - confirm.
                seat_polygon_list = []
                for start, end in zip(starts, ends):
                    points = geometry.points[start:end]
                    assert points[0] == points[-1], f"First and last points mismatch: {(points[0], points[-1])}"
                    seat_polygon_list.append(shapely.geometry.Polygon(points))
                polygons_for_seat[new_seat_name] = seat_polygon_list
        new_seats[state] = seats_in_state

    return new_seats, polygons_for_seat

# state -> seat names, and seat name -> polygons, for the redistributed states
state_seats, seat_polygons = get_shapefiles()

# Disabled sanity check: flip to True to print the polygon count for each seat
if False:
    for state_name, seat_names in state_seats.items():
        for seat_name in seat_names:
            print(state_name, seat_name, len(seat_polygons[seat_name]))


### 2022 Polling place data

In [3]:
REMOVE_PPVC = True  # when True, pre-poll voting centres (PPVCs) are dropped from the polling places data

def get_polling_places() -> pd.DataFrame:
    """Load the AEC polling place list, indexed by PollingPlaceID.

    Rows are removed, in order, for:
    - the AEC National EAV centres (all geo-coded to Canberra, so the
      coordinates say nothing about where the voters are);
    - pre-poll voting centres, when REMOVE_PPVC is set; and
    - places without a Longitude (eg. hospital teams and multiple remote sites).

    Asserts that the index is unique and complete, and that every
    remaining row has a Latitude.
    """

    source = (
        "../redistributions/input-data-2022/"
        "GeneralPollingPlacesDownload-27966.csv"
    )
    places = pd.read_csv(source, skiprows=1, header=0, index_col="PollingPlaceID")
    assert places.index.is_unique
    assert places.index.isna().sum() == 0

    # EAV centres cannot be used for locations
    eav_centres = ["AEC National EAV Centre", "AEC National EAV2 Centre"]
    places = places.loc[~places["PremisesNm"].isin(eav_centres)]

    # PPVC votes are recovered later by up-weighting the remaining
    # polling places in the same division
    if REMOVE_PPVC:
        is_ppvc = places["PollingPlaceNm"].str.contains("PPVC", case=True)
        places = places.loc[~is_ppvc]

    # keep only the places we can put on a map
    places = places.dropna(subset=["Longitude"])
    assert places["Latitude"].notna().all()

    return places


# geo-coded polling places, indexed by PollingPlaceID (used as a global below)
polling_places = get_polling_places()

# check an example polling place: show every row whose name contains "Narooma"
display(polling_places.loc[polling_places.PollingPlaceNm.str.contains("Narooma")])


Unnamed: 0_level_0,State,DivisionID,DivisionNm,PollingPlaceTypeID,PollingPlaceNm,PremisesNm,PremisesAddress1,PremisesAddress2,PremisesAddress3,PremisesSuburb,PremisesStateAb,PremisesPostCode,Latitude,Longitude
PollingPlaceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
710,NSW,117,Eden-Monaro,1,Narooma,Narooma Sport and Leisure Centre,100 Bluewater Dr,,,NAROOMA,NSW,2546.0,-36.215989,150.129379


### Identify the new (post-redistribution) electorate for a polling place

In [4]:
def find_electorate(polling_place: int) -> str | None:
    """Return the name of the new seat that contains a polling place.

    The polling place is looked up by PollingPlaceID in the global
    polling_places table, and its Longitude/Latitude is tested against
    the polygons of each seat in the same state.

    Returns None, after printing the reason, when the polling place is
    unknown, when its state has no redistribution shapefile loaded, or
    when no seat polygon contains the point.
    """

    if polling_place not in polling_places.index:
        print(f"Polling place {polling_place} not found.")
        return None

    state = polling_places.at[polling_place, "State"]
    if state not in state_seats:
        print(f"State {state} not found.")
        return None

    # shapely points are (x, y), i.e. (longitude, latitude)
    location = shapely.geometry.Point(
        polling_places.at[polling_place, "Longitude"],
        polling_places.at[polling_place, "Latitude"],
    )

    # the first seat (in shapefile order) with a containing polygon wins
    for seat in state_seats[state]:
        if any(location.within(area) for area in seat_polygons[seat]):
            return seat

    print(f"Polling place {polling_place} not found in any seat.")
    return None


# check that this code works (one success case and one expected failure)
print(find_electorate(710))  # Narooma NSW should work
print(find_electorate(6913)) # Ceduna SA should fail - no SA shapefile is loaded, so None is returned


Eden-monaro
State SA not found.
None


### 2022 TPP totals by division

In [5]:
KEY_COLS = [
    "DivisionNm",  # must stay first: KEY_COLS[1:] is used as "the party vote columns"
    "Liberal/National Coalition Votes",
    "Australian Labor Party Votes",  # must stay last: KEY_COLS[-1] is used as "the Labor column"
]

def get_ttp_seat_totals() -> pd.DataFrame:
    """Load the raw 2022 TPP vote totals for each division.

    Returns a frame indexed by DivisionNm holding the two party vote
    columns plus StateAb. These are whole-of-seat totals, so they include
    votes where we don't have a polling booth as a proxy for the voter's
    address/location (eg. postal voting, overseas voting, etc).

    Asserts that the division index is unique and has no missing values.
    """

    source = (
        "../redistributions/input-data-2022/"
        "HouseTppByDivisionDownload-27966.csv"
    )
    wanted = [*KEY_COLS[1:], "StateAb"]

    totals = pd.read_csv(source, skiprows=1, header=0, index_col="DivisionNm")
    totals = totals[wanted]
    assert totals.index.is_unique
    assert totals.index.isna().sum() == 0

    return totals


# whole-of-division TPP totals, indexed by division name, with a StateAb column
tpp_seat_total = get_ttp_seat_totals()


### 2022 TPP totals by polling place (ie. booth attributable)

In [6]:
def get_tpp_by_polling_place(pp: pd.DataFrame) -> pd.DataFrame:
    """Load the 2022 TPP vote totals for each polling place.

    pp is the polling place table (indexed by PollingPlaceID); every row
    in it must have both a Latitude and a Longitude.

    Returns the AEC polling-place TPP rows, indexed by PollingPlaceID,
    restricted to the polling places that appear in pp.
    """

    # every polling place handed to us must already be geo-coded
    assert pp.Latitude.notna().all()
    assert pp.Longitude.notna().all()

    source = (
        "../redistributions/input-data-2022/"
        "HouseTppByPollingPlaceDownload-27966.csv"
    )
    votes = pd.read_csv(source, skiprows=1, header=0, index_col="PollingPlaceID")
    assert votes.index.is_unique

    # a place that is absent from pp has no usable coordinates, so drop it
    mappable = votes.index.isin(pp.index)
    return votes.loc[mappable]

# TPP votes per polling place, limited to the geo-coded polling places
booth_tpp_by_place = get_tpp_by_polling_place(polling_places)


In [7]:
def aggregate_places_to_seats(tpp: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Aggregate the TPP by polling place to TPP by seat.

    tpp is the polling-place TPP table, indexed by PollingPlaceID, with a
    DivisionNm column and the party vote columns. It is not modified.

    Return a tuple:
    - a dataframe of total TPP booth vote by (existing) seat
    - a dataframe of individual booth votes, with the polling places that
      cannot be mapped to a new seat removed.
    """

    # We have two polling places in redistributed states that
    # don't map to new seats cleanly. Not sure why.
    # A pain, but should not affect overall results much.
    unmapped = [
        place
        for state in state_seats
        for place in polling_places[polling_places.State == state].index
        if not find_electorate(place)
    ]

    # only the unmapped places that actually carry votes need removing
    to_remove = [place for place in unmapped if place in tpp.index]
    for place in to_remove:
        # single quotes inside the f-string keep this valid before Python 3.12
        print(f"Removing polling place {place}: {polling_places.loc[place, 'PollingPlaceNm']} from the data.")

    # drop() returns a new frame, so the caller's data is left untouched
    tpp_copy = tpp.drop(index=to_remove)

    booth_tpp = tpp_copy[KEY_COLS[1:] + ["DivisionNm"]].groupby("DivisionNm").sum()

    return booth_tpp, tpp_copy

# booth_tpp_by_seat: booth votes summed by existing division
# adj_booth_tpp_by_place: per-booth votes, minus booths that map to no new seat
booth_tpp_by_seat, adj_booth_tpp_by_place = aggregate_places_to_seats(booth_tpp_by_place)

# check the data
#display(booth_tpp_by_seat)
#display(adj_booth_tpp_by_place.head())

Polling place 83845 not found in any seat.
Polling place 2887 not found in any seat.
Removing polling place 83845: Darlinghurst South from the data.
Removing polling place 2887: Surry Hills from the data.


## Data Manipulation

In [8]:
def calculate_up_weights(
    tpp_seat_total: pd.DataFrame, tpp_place_totals: pd.DataFrame
) -> pd.DataFrame:
    """Calculate the up-weights that will be applied
    to the local polling place data to bring vote
    totals in line with the divisional totals.

    tpp_seat_total is indexed by division and holds the party vote columns
    plus StateAb. tpp_place_totals is indexed by division and holds the
    party vote columns summed over polling places.

    Returns a frame with one row per division in tpp_seat_total: a State
    column followed by, for each party, the ratio of the divisional total
    to the polling place total.

    Raises:
        KeyError: if a division in tpp_seat_total has no row in tpp_place_totals.
    """

    parties = KEY_COLS[1:]

    # fail loudly rather than let index alignment produce NaN weights
    missing = tpp_seat_total.index.difference(tpp_place_totals.index)
    if len(missing) > 0:
        raise KeyError(f"No polling place totals for: {missing.to_list()}")

    # line the booth totals up with the divisional totals, then divide
    booth_totals = tpp_place_totals.loc[tpp_seat_total.index, parties]
    all_weights = tpp_seat_total[parties].div(booth_totals).astype(float)
    all_weights.insert(0, "State", tpp_seat_total["StateAb"])

    # the result has historically carried an unnamed index; keep it that way
    return all_weights.rename_axis(None)


# per-division, per-party ratio of the divisional total to the booth total
up_weights = calculate_up_weights(tpp_seat_total, booth_tpp_by_seat)
#display(up_weights)

In [9]:
def redistrubute() -> pd.DataFrame:
    """Reallocate the up-weighted polling place TPP votes to the new divisions.

    Only polling places in states with a loaded redistribution shapefile
    are processed. Each such place is assigned to its new division, and
    its party votes are scaled by the up-weight of the division it
    belonged to at the election.

    Returns a DataFrame indexed by new division, holding the summed party
    votes and the State of each new division.
    """

    parties = KEY_COLS[1:]

    scaled = adj_booth_tpp_by_place.copy()
    scaled["New Division"] = None
    scaled[parties] = scaled[parties].astype(float)  # the weights are fractional

    # allocate polling places to new divisions
    seat_to_state = {}
    for state in state_seats:
        in_state = adj_booth_tpp_by_place.StateAb == state
        old_seats = adj_booth_tpp_by_place.loc[in_state, "DivisionNm"]
        for place, old_electorate in old_seats.items():
            new_seat = find_electorate(place)
            if not new_seat:
                # should not happen - if it does - some code is wrong somewhere above
                print(f"--SHIT-- Polling place {place} not found in any seat.")
                continue

            seat_to_state[new_seat] = state
            scaled.loc[place, "New Division"] = new_seat
            for party in parties:
                scaled.loc[place, party] *= up_weights.loc[old_electorate, party]

    # sum the scaled votes over each new division
    allocated = scaled[scaled["New Division"].notna()]
    redistributed = allocated[parties + ["New Division"]].groupby("New Division").sum()
    redistributed["State"] = redistributed.index.map(seat_to_state)
    return redistributed


# projected party vote totals for each new division in the redistributed states
redistribution = redistrubute()
display(redistribution.head())


Unnamed: 0_level_0,Liberal/National Coalition Votes,Australian Labor Party Votes,State
New Division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aston,57705.474508,52360.058921,VIC
Ballarat,35730.0,60770.0,VIC
Banks,51547.334338,44213.46881,NSW
Barton,37671.256145,60711.283207,NSW
Bendigo,40351.464827,63192.537684,VIC


## Construct a TPP pendulum

In [10]:
COL_NAME = "Labor TPP"


def calculate_labor_tpp(frame: pd.DataFrame) -> pd.DataFrame:
    """Calculate the TPP for the Labor party.

    frame must hold the party vote columns named in KEY_COLS[1:].

    Returns a single-column frame (column COL_NAME, same index as frame)
    giving Labor's share of the two-party vote in per cent, rounded to
    two decimal places.
    """

    labor_votes = frame[KEY_COLS[-1]]
    two_party_votes = frame[KEY_COLS[1:]].sum(axis=1)

    # Scale to per cent first and round last: rounding the fraction and then
    # multiplying by 100 reintroduces binary floating-point noise.
    series = (labor_votes / two_party_votes * 100).round(2)
    series.name = COL_NAME
    return pd.DataFrame(series)

In [11]:
def get_pendulum():
    """Build the Labor TPP pendulum, one row per seat.

    States with a loaded redistribution use the projected votes for the
    new divisions (flagged as approximated); all other states use the
    AEC divisional totals unchanged. Where a seat name also exists in the
    election results, the pre-redistribution TPP and the change in
    Labor's TPP are included.

    Returns a DataFrame indexed by seat name, sorted by ascending Labor TPP.
    """

    APPROX = "Approximated"
    PREVIOUS = "Pre-redistribution TPP"
    CHANGE = "Change in Labor's TPP"

    # Capitalisation differs between the datasets, so some seat names
    # need fixing before the data will align with the AEC results.
    fix_seats = {
        "Mcmahon": "McMahon",
        "Eden-monaro": "Eden-Monaro",
        "Mcewen": "McEwen",
        "Mcpherson": "McPherson",
        "O'connor": "O'Connor",
    }

    per_state = []
    for state in sorted(tpp_seat_total["StateAb"].unique()):
        redistributed = state in state_seats
        if redistributed:
            # projected votes for the new divisions
            votes = redistribution.loc[redistribution["State"] == state]
        else:
            # no redistribution: existing divisional data from the AEC
            votes = tpp_seat_total.loc[tpp_seat_total["StateAb"] == state]

        labor = calculate_labor_tpp(votes)
        labor[APPROX] = redistributed
        labor["State"] = state
        per_state.append(labor)

    pendulum = pd.concat(per_state).rename(index=fix_seats)

    # only seats that also existed at the election get a "before" figure
    comparable = pendulum.index[pendulum.index.isin(tpp_seat_total.index)]
    pendulum[PREVIOUS] = calculate_labor_tpp(tpp_seat_total.loc[comparable])[COL_NAME]
    pendulum[CHANGE] = pendulum[COL_NAME] - pendulum[PREVIOUS]

    pendulum = pendulum.sort_values(COL_NAME)
    return pendulum[["State", COL_NAME, PREVIOUS, CHANGE, APPROX]]


pendulum = get_pendulum()
# divisions in the election results whose names do not appear in the pendulum
lost = tpp_seat_total.index[~tpp_seat_total.index.isin(pendulum.index)]
print("Number of seats in next Parliament:", len(pendulum))
display(pendulum)
print("Lost seats:", lost.to_list())


Number of seats in next Parliament: 150


Unnamed: 0,State,Labor TPP,Pre-redistribution TPP,Change in Labor's TPP,Approximated
Maranoa,QLD,27.88,27.88,0.0,False
Gippsland,VIC,29.43,29.43,0.0,True
Mallee,VIC,31.01,31.01,0.0,True
Parkes,NSW,32.18,32.16,0.02,True
Barker,SA,33.38,33.38,0.0,False
Farrer,NSW,33.65,33.65,0.0,True
Nicholls,VIC,33.83,32.85,0.98,True
New England,NSW,34.13,33.57,0.56,True
Calare,NSW,34.55,34.55,0.0,True
Groom,QLD,35.83,35.83,0.0,False


Lost seats: ['Higgins', 'North Sydney']


In [12]:
# print the pendulum as an HTML table
print(pendulum.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>State</th>
      <th>Labor TPP</th>
      <th>Pre-redistribution TPP</th>
      <th>Change in Labor's TPP</th>
      <th>Approximated</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Maranoa</th>
      <td>QLD</td>
      <td>27.88</td>
      <td>27.88</td>
      <td>0.00</td>
      <td>False</td>
    </tr>
    <tr>
      <th>Gippsland</th>
      <td>VIC</td>
      <td>29.43</td>
      <td>29.43</td>
      <td>0.00</td>
      <td>True</td>
    </tr>
    <tr>
      <th>Mallee</th>
      <td>VIC</td>
      <td>31.01</td>
      <td>31.01</td>
      <td>0.00</td>
      <td>True</td>
    </tr>
    <tr>
      <th>Parkes</th>
      <td>NSW</td>
      <td>32.18</td>
      <td>32.16</td>
      <td>0.02</td>
      <td>True</td>
    </tr>
    <tr>
      <th>Barker</th>
      <td>SA</td>
      <td>33.38</td>
      <td>33.38</td>
      <td>0.00</td>
      <td>False</td>
    </tr>
    <t