# Calculate seat vote shares following redistribution

The idea here is to map every polling place to its new (post-redistribution) electorate and,
in the process, up-weight the polling-place votes so that electorate-wide votes which cannot be tied to a polling place (e.g. postal and pre-poll votes) are also attributed.

## Python set-up

In [1]:
import shapely
import shapefile
import pandas as pd

# Raise the pandas display limit so DataFrames of up to 500 rows
# are shown in full rather than truncated.
pd.set_option('display.max_rows', 500)

## Data acquisition 

### Shapefiles for new 2025 divisions (post redistribution)

In [2]:
def get_shapefiles() -> tuple[dict[str, list[str]], dict[str, list[shapely.geometry.Polygon]]]:
    """Interrogate the shapefiles for the newly created divisions.

    Returns a tuple comprising:
    - a dictionary with state names as keys, and a list of seat names as values, and
    - a dictionary with seat names as keys, and a list of polygons as values.

    Raises an AssertionError if any ring read from a shapefile is not
    closed (ie. its first and last points differ).
    """

    redists = {
        "NSW": r"../redistributions/input-data-2022/NSW-october-2024-ESRI.zip",
        "VIC": r"../redistributions/input-data-2022/VIC-october-2024-ESRI.zip",
        "WA": r"../redistributions/input-data-2022/WA-september-2024-ESRI.zip",
        # NT - redistribution process incomplete
    }

    # build a dictionary of seat names for each state,
    # and a dictionary of polygons for each seat.
    new_seats: dict[str, list[str]] = {}
    polygons_for_seat: dict[str, list[shapely.geometry.Polygon]] = {}
    for state, filename in redists.items():
        seats_in_state: list[str] = []
        # The reader is used as a context manager so that the
        # underlying files are closed once the state has been read.
        with shapefile.Reader(filename) as reader:
            for rec in reader.iterShapeRecords():
                new_seat_name = rec.record["Elect_div"]
                seats_in_state.append(new_seat_name)

                geometry = rec.shape
                # points are in (lon, lat) order, which is the
                # (x, y) order for the traditional map orientation.
                points = geometry.points

                # geometry.parts holds the start offset of each ring within
                # points; a ring ends where the next one starts (or at the
                # end of the point list for the final ring).
                starts = list(geometry.parts)
                ends = starts[1:] + [len(points)]

                # NOTE(review): every part is treated as a standalone polygon.
                # Shapefile parts can also be interior rings (holes) - confirm
                # that this is not an issue for these files.
                seat_polygon_list = []
                for start, end in zip(starts, ends):
                    ring = points[start:end]
                    # quick sanity check
                    assert ring[0] == ring[-1], f"First and last points mismatch: {(ring[0], ring[-1])}"
                    seat_polygon_list.append(shapely.geometry.Polygon(ring))
                polygons_for_seat[new_seat_name] = seat_polygon_list
        new_seats[state] = seats_in_state

    return new_seats, polygons_for_seat


# Load the post-redistribution boundaries into two module-level lookups:
# state -> seat names, and seat name -> list of polygons.
state_seats, seat_polygons = get_shapefiles()

# Check the data capture: one entry per redistributed seat
print(f"Redistribution seat count: {len(seat_polygons)}")

# Diagnostic (disabled): list the number of polygons for each seat.
# Change the condition to True to run it.
if False:
    for state, seats in state_seats.items():
        for seat in seats:
            print(state, seat, len(seat_polygons[seat]))

Redistribution seat count: 100


### 2022 Polling place location data

In [3]:
def get_polling_places(
    path: str = (
        "../redistributions/input-data-2022/"
        "GeneralPollingPlacesDownload-27966.csv"
    ),
    *,
    remove_ppvc: bool = True,
) -> pd.DataFrame:
    """Get the polling places data for the 2022 election,
    excluding EAVs, PPVCs and locations without lon/lat data.

    Arguments:
    - path: location of the AEC polling places CSV file (the first
      line of the file is skipped, the second line is the header).
    - remove_ppvc: when True (the default), pre-poll voting centres
      are removed from the data.

    Returns a DataFrame indexed by PollingPlaceID, in which every
    remaining row has both a Latitude and a Longitude.

    Progress counts are printed as each filter is applied."""

    pp = pd.read_csv(path, skiprows=1, header=0, index_col="PollingPlaceID")
    assert pp.index.is_unique
    assert pp.index.isna().sum() == 0
    print(f"The raw AEC data has {len(pp)} polling places.")

    # Electronic assisted voting (EAV) centres are not polling places
    # and are not required for this analysis. They are all geo-coded to
    # Canberra, so we cannot use it for locations.
    # na=False: a missing premises name is simply "not an EAV centre";
    # without it the mask would hold NaNs and could not be inverted.
    eav_mask = pp["PremisesNm"].str.contains("EAV", case=True, regex=False, na=False)
    print(f"Removed {eav_mask.sum()} EAV centres.")
    pp = pp.loc[~eav_mask]
    print(len(pp))

    # Remove pre-poll voting centres (PPVCs) - We will upweight the polling
    # places in the same division to account for the loss of these pre-poll
    # locations. Why? Because there are only a few pre-poll locations for
    # each division (typically < 10), and they are not always within the
    # division. Furthermore, people may travel some distance to use PPVCs
    ppvc_mask = pp["PollingPlaceNm"].str.contains("PPVC", case=True, regex=False, na=False)
    if remove_ppvc and ppvc_mask.sum() > 0:
        print(f"Removed {ppvc_mask.sum()} PPVCs.")
        pp = pp.loc[~ppvc_mask]
        print(len(pp))

    # remove the data where we do not have Lat/Lon coordinates
    # (eg. hospital teams and multiple remote sites). A row is dropped
    # if either coordinate is missing, so a half geo-coded row cannot
    # slip through.
    missing_coord_mask = pp["Longitude"].isna() | pp["Latitude"].isna()
    print(f"Removed {missing_coord_mask.sum()} polling places with missing lats/lons.")
    pp = pp.loc[~missing_coord_mask]

    print(f"We have {len(pp)} polling places remaining.")
    return pp


# Module-level table of usable polling places, indexed by PollingPlaceID
polling_places = get_polling_places()

# check an example polling place: show the rows whose name contains "Narooma"
display(polling_places.loc[polling_places.PollingPlaceNm.str.contains("Narooma")])

The raw AEC data has 8479 polling places.
Removed 302 EAV centres.
8177
Removed 1019 PPVCs.
7158
Removed 64 polling places with missing lats/lons.
We have 7094 polling places remaining.


Unnamed: 0_level_0,State,DivisionID,DivisionNm,PollingPlaceTypeID,PollingPlaceNm,PremisesNm,PremisesAddress1,PremisesAddress2,PremisesAddress3,PremisesSuburb,PremisesStateAb,PremisesPostCode,Latitude,Longitude
PollingPlaceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
710,NSW,117,Eden-Monaro,1,Narooma,Narooma Sport and Leisure Centre,100 Bluewater Dr,,,NAROOMA,NSW,2546.0,-36.215989,150.129379


### Identify the new (post redistribution) electorate for a polling_place

In [4]:
def find_electorate(polling_place: int) -> str | None:
    """Return the name of the new (post redistribution) seat that
    contains a polling place, or None if it cannot be determined.

    The polling place is looked up in the module-level polling_places
    table; only the seats of its own state (from state_seats) are
    searched, using the polygons in seat_polygons. A short explanation
    is printed whenever None is returned."""

    if polling_place not in polling_places.index:
        print(f"Polling place {polling_place} not found.")
        return None

    record = polling_places.loc[polling_place]
    state = record["State"]
    if state not in state_seats:
        print(f"State {state} not found.")
        return None

    # shapely points are (x, y), ie. (lon, lat)
    location = shapely.geometry.Point(record["Longitude"], record["Latitude"])

    # the first seat with any polygon containing the point wins
    match = next(
        (
            seat
            for seat in state_seats[state]
            if any(location.within(area) for area in seat_polygons[seat])
        ),
        None,
    )
    if match is None:
        print(f"Polling place {polling_place} not found in any seat.")
    return match


# Smoke-test the lookup with one mappable and one unmappable polling place
print(find_electorate(710))  # Narooma NSW should work
print(find_electorate(6913)) # Ceduna SA should fail
# (SA was not redistributed this round, so it has no entry in state_seats)


Eden-monaro
State SA not found.
None


### 2022 TPP totals by division

In [5]:
# Column names shared by the AEC TPP files. The code below relies on the
# ordering: KEY_COLS[1:] are the two party vote columns, and KEY_COLS[-1]
# is the Labor vote column.
KEY_COLS = [
    "DivisionNm", # should be the first column
    "Liberal/National Coalition Votes", 
    "Australian Labor Party Votes", # Labor should be the last column
]

def get_ttp_seat_totals() -> pd.DataFrame:
    """Load the raw two-party-preferred (TPP) seat totals for the
    2022 election.

    The seat totals include votes for which we don't have a polling
    booth to act as a proxy for the voter's address/location
    (eg. postal voting, overseas voting, etc).

    Returns a DataFrame indexed by DivisionNm, holding the two party
    vote columns and the StateAb column."""

    source = (
        "../redistributions/input-data-2022/"
        "HouseTppByDivisionDownload-27966.csv"
    )
    wanted = [*KEY_COLS[1:], "StateAb"]

    everything = pd.read_csv(source, skiprows=1, header=0, index_col="DivisionNm")
    tpp = everything[wanted]

    # each division should appear exactly once, and be named
    assert tpp.index.is_unique
    assert tpp.index.isna().sum() == 0

    return tpp


# Capture the TPP seat totals (one row per 2022 division)
tpp_seat_total = get_ttp_seat_totals()

# Sum the national TPP total over both party columns and all divisions -
# used later as a cross-check for the up-weighting
aec_total_vote_count = tpp_seat_total[KEY_COLS[1:]].sum().sum()
print(f"Total national formal TPP votes: {aec_total_vote_count:,}")

Total national formal TPP votes: 14,659,042


### 2022 TPP totals by polling place (ie. booth attributable)

In [6]:
def get_tpp_by_polling_place(pp: pd.DataFrame) -> pd.DataFrame:
    """Load the 2022 TPP vote counts by polling place.

    pp is the table of usable polling places (indexed by
    PollingPlaceID); every row must have a Latitude and a Longitude.

    Returns the AEC polling place TPP data, indexed by PollingPlaceID
    and limited to the polling places found in pp."""

    # every polling place we keep must be geo-coded
    assert pp.Latitude.notna().all()
    assert pp.Longitude.notna().all()

    # read the AEC TPP data
    source = (
        "../redistributions/input-data-2022/"
        "HouseTppByPollingPlaceDownload-27966.csv"
    )
    tpp = pd.read_csv(source, skiprows=1, header=0, index_col="PollingPlaceID")
    assert tpp.index.is_unique

    # keep only the places that can be mapped to Lat/Lon
    # coordinates via their PollingPlaceID
    mappable = tpp.index.isin(pp.index)
    return tpp.loc[mappable]

# TPP votes for each geo-coded polling place, indexed by PollingPlaceID
booth_tpp_by_place = get_tpp_by_polling_place(polling_places)


In [7]:
def aggregate_places_to_seats(tpp: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Aggregate the TPP by polling place to TPP by seat.

    tpp is the polling place TPP data, indexed by PollingPlaceID. It is
    not modified; the work is done on a copy.

    Return a tuple:
    - a dataframe of total TPP booth vote by seat
    - a dataframe of individual booth vote adjusted for unmappable data.

    The module-level polling_places and state_seats tables are used to
    work out which polling places cannot be mapped to a new seat. A line
    is printed for each polling place removed."""

    vote_cols = KEY_COLS[1:]
    tpp_copy = tpp.copy()

    # We have two polling places in redistributed states that
    # don't map to new seats cleanly. Not sure why.
    # A pain, but should not affect overall results much
    # as the total number of votes is smallish.
    potential_removal = [
        place
        for state in state_seats
        for place in polling_places[polling_places.State == state].index
        if not find_electorate(place)
    ]

    # only places that actually have TPP data can be removed from it
    unmappable = [p for p in potential_removal if p in tpp_copy.index]

    lost_votes = 0
    for p in unmappable:
        count = tpp_copy.loc[p, vote_cols].sum()
        lost_votes += count
        # the name is looked up outside the f-string: re-using double
        # quotes inside an f-string is a syntax error before Python 3.12
        place_name = polling_places.loc[p, "PollingPlaceNm"]
        print(f"Removing polling place {p}: {place_name} " +
              f"from the data, with {count} votes.")
    tpp_copy = tpp_copy.drop(index=unmappable)
    if lost_votes:
        print(f"Sum of lost votes: {lost_votes:,}")

    booth_tpp = tpp_copy[vote_cols + ["DivisionNm"]].groupby("DivisionNm").sum()

    return booth_tpp, tpp_copy

# Booth-attributable TPP votes summed by (old) division, plus the
# polling place data with the unmappable polling places removed
booth_tpp_by_seat, adj_booth_tpp_by_place = aggregate_places_to_seats(booth_tpp_by_place)

# check the data (uncomment to display)
#display(booth_tpp_by_seat)
#display(adj_booth_tpp_by_place.head())

Polling place 83845 not found in any seat.
Polling place 2887 not found in any seat.
Removing polling place 83845: Darlinghurst South from the data, with 76 votes.
Removing polling place 2887: Surry Hills from the data, with 376 votes.
Sum of lost votes: 452


### Load political parties and map to a 4-way broad grouping

In [8]:
def get_party_groups() -> tuple[dict[str, str], list[str]]:
    """Load the AEC party details and sort every party into one of
    four broad groups: Labor, Green, Coalition or Other.

    Returns a tuple comprising:
    - a dictionary mapping party-abbreviation (key) to party-group (value), and
    - a sorted list of party abbreviations."""

    party_data = pd.read_csv(
        "../redistributions/input-data-2022/GeneralPartyDetailsDownload-27966.csv",
        skiprows=1, header=0
    )
    the_parties = sorted(party_data.PartyAb.unique())

    groups = {
        "Labor": ["ALP"],
        "Green": ["GRN"],
        "Coalition": ["CLP", "LNP", "LP", "NP"],
    }

    # whatever is left after taking out the named parties is "Other"
    other = set(the_parties)
    named_parties = [party for members in groups.values() for party in members]
    for party in named_parties:
        assert party in other
        other.remove(party)
    groups["Other"] = sorted(other)

    # invert the group -> parties table into party -> group
    p_map = {
        party: label
        for label, members in groups.items()
        for party in members
    }

    return p_map, the_parties


# get the broad party groups: party abbreviation -> group,
# and the sorted list of all party abbreviations
party_map, parties = get_party_groups()

# check the data
print(party_map)


{'ALP': 'Labor', 'GRN': 'Green', 'CLP': 'Coalition', 'LNP': 'Coalition', 'LP': 'Coalition', 'NP': 'Coalition', 'AJP': 'Other', 'ASP': 'Other', 'AUC': 'Other', 'AUD': 'Other', 'AUP': 'Other', 'AUVA': 'Other', 'CEC': 'Other', 'CYA': 'Other', 'DAVI': 'Other', 'DHJP': 'Other', 'DPDA': 'Other', 'FIN': 'Other', 'GAP': 'Other', 'GVIC': 'Other', 'HMP': 'Other', 'IAP': 'Other', 'IMO': 'Other', 'JLN': 'Other', 'KAP': 'Other', 'KCBR': 'Other', 'LDP': 'Other', 'ON': 'Other', 'REAS': 'Other', 'RPT': 'Other', 'SAL': 'Other', 'SOPA': 'Other', 'SPP': 'Other', 'SUN': 'Other', 'TLOC': 'Other', 'TNL': 'Other', 'UAPP': 'Other', 'VNS': 'Other', 'WAP': 'Other', 'XEN': 'Other'}


### Load primary first preference votes by seat and group

In [9]:
def get_first_prefs(p_map: dict[str, str]) -> tuple[dict[str, str], pd.DataFrame, pd.DataFrame]:
    """Get the first preference votes for the 2022 election.

    p_map maps party abbreviations to party groups. It is not modified;
    an augmented copy is returned instead.

    Return a tuple:
    - an updated party_map (any party abbreviation found in the vote
      data but missing from p_map is added to the "Other" group)
    - a dataframe of total first preference votes by seat
    - a dataframe of the first preference votes as percentages by seat.

    Both dataframes have one row per division (sorted by division
    name) and one column per party group."""

    # load the raw AEC data
    primary = pd.read_csv(
        "../redistributions/input-data-2022/" +
        "HouseFirstPrefsByCandidateByVoteTypeDownload-27966.csv",
        skiprows=1, header=0
    )

    # remove informal votes
    primary = primary[(primary.Surname != "Informal") & (primary.GivenNm != "Informal")]

    # check the formal vote count - should match the national TPP vote data.
    print(f"Total formal first preference votes: {primary.TotalVotes.sum():,}")

    # Augment the party map - independents are missing from the party map.
    # Work on a copy so that the caller's dictionary is left untouched, and
    # go through the additions in a fixed order so the output is repeatable.
    p_map = dict(p_map)
    missing = set(primary.PartyAb.unique()) - set(p_map)
    for m in sorted(missing, key=str):
        print(f"Adding {m} to the party map.")
        p_map[m] = "Other"

    # Map and Group the data
    grouped = primary.assign(**{"Party Group": primary.PartyAb.map(p_map)})
    totals = {
        div_name: div_data.groupby("Party Group")["TotalVotes"].sum()
        for div_name, div_data in grouped.groupby("DivisionNm", sort=False)
    }
    # Sorting here means both returned frames share the same row order.
    fp_votes = pd.DataFrame(totals).T.sort_index()
    print(f"Total first preference votes (after mapping/grouping)): {fp_votes.sum().sum():,}")
    fp_percent = fp_votes.div(fp_votes.sum(axis=1), axis=0).round(4) * 100

    return p_map, fp_votes, fp_percent


# get the first preference votes (and the party map, augmented with
# any party abbreviations that only appear in the vote data)
party_map, first_prefs, first_prefs_percent = get_first_prefs(party_map)

# show the data, strongest Coalition seats first
display(first_prefs_percent.sort_values("Coalition", ascending=False))

Total formal first preference votes: 14,659,042
Adding IND to the party map.
Adding NAFD to the party map.
Total first preference votes (after mapping/grouping)): 14,659,042


Party Group,Coalition,Green,Labor,Other
Maranoa,56.26,4.87,15.29,23.59
Barker,55.63,7.41,20.85,16.11
Cook,55.53,9.9,24.99,9.58
Gippsland,54.14,8.47,19.23,18.16
Mitchell,52.61,11.83,25.51,10.06
New England,52.47,7.74,18.56,21.24
Farrer,52.26,9.11,18.99,19.64
Parkes,49.32,4.73,20.22,25.74
Mallee,49.09,5.34,16.76,28.8
Berowra,49.08,15.58,22.23,13.1


## TPP Data Manipulation

In [10]:
def calculate_up_weights(
    tpp_seat_total: pd.DataFrame, tpp_place_totals: pd.DataFrame
) -> pd.DataFrame:
    """Work out, for every division and each party, the factor by
    which local polling place votes must be multiplied so that they
    add up to the divisional vote total.

    Returns a DataFrame with one row per division in tpp_seat_total,
    a State column, and one weight column per party."""

    vote_cols = KEY_COLS[1:]

    # one single-row frame per division: the state, then
    # (division total / booth total) for each party
    per_division = [
        pd.DataFrame(
            {
                "State": tpp_seat_total.loc[division, "StateAb"],
                **{
                    party: (
                        tpp_seat_total.loc[division, party]
                        / tpp_place_totals.loc[division, party]
                    )
                    for party in vote_cols
                },
            },
            index=[division],
        )
        for division in tpp_seat_total.index
    ]
    return pd.concat(per_division)


# Per-division, per-party multipliers that scale the booth votes
# up to the full divisional totals
up_weights = calculate_up_weights(tpp_seat_total, booth_tpp_by_seat)
#display(up_weights)

In [11]:
def redistrubute() -> pd.DataFrame:
    """Re-allocate the up-weighted polling place TPP votes to the new
    electoral divisions. This is only done for the states covered by
    the post-2022 redistributions (ie. the states in state_seats).

    Returns a DataFrame indexed by new division name, with the summed
    vote totals for each party and a State column."""

    vote_cols = KEY_COLS[1:]

    new_distro = adj_booth_tpp_by_place.copy()
    new_distro["New Division"] = None
    # the up-weighted counts are fractional, so convert to float
    new_distro[vote_cols] = new_distro[vote_cols].astype(float)

    # allocate polling places to new divisions
    state_map = {}
    for state in state_seats:
        in_state = adj_booth_tpp_by_place.StateAb == state
        old_divisions = adj_booth_tpp_by_place.loc[in_state, "DivisionNm"]
        for place, old_electorate in old_divisions.items():
            electorate = find_electorate(place)
            if not electorate:
                # should not happen - if it does - some code is wrong somewhere above
                print(f"--SHIT-- Polling place {place} not found in any seat.")
                continue

            state_map[electorate] = state
            new_distro.loc[place, "New Division"] = electorate
            # scale by the weights of the division the votes were cast in
            for party in vote_cols:
                new_distro.loc[place, party] *= up_weights.loc[old_electorate, party]

    # calculate the redistribution
    allocated = new_distro[new_distro["New Division"].notna()]
    redistributed = (
        allocated[vote_cols + ["New Division"]]
        .groupby("New Division")
        .sum()
    )
    redistributed["State"] = redistributed.index.map(state_map)
    return redistributed


# Projected TPP vote totals for each new division in the redistributed states
redistribution = redistrubute()
display(redistribution.head())


Unnamed: 0_level_0,Liberal/National Coalition Votes,Australian Labor Party Votes,State
New Division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Aston,57705.474508,52360.058921,VIC
Ballarat,35730.0,60770.0,VIC
Banks,51547.334338,44213.46881,NSW
Barton,37671.256145,60711.283207,NSW
Bendigo,40351.464827,63192.537684,VIC


### Construct a TPP pendulum

In [12]:
# Name of the column that holds Labor's TPP vote share (per cent)
COL_NAME = "Labor TPP"


def calculate_labor_tpp(frame: pd.DataFrame) -> tuple[pd.DataFrame, float]:
    """Work out Labor's share of the two-party-preferred vote.

    frame has one row per seat and must hold the two party vote columns.

    Return a tuple comprising:
    - a single column DataFrame with Labor's TPP % vote share for each seat, and
    - the total number of votes, summed over both parties and all seats."""

    votes_per_seat = frame[KEY_COLS[1:]].sum(axis=1)
    labor_votes = frame[KEY_COLS[-1]]

    # expressed as per cent
    # Note: precision to 2 decimal places is overkill
    labor_share = (labor_votes / votes_per_seat).round(4) * 100

    return labor_share.to_frame(name=COL_NAME), votes_per_seat.sum()

In [13]:
def get_pendulum() -> pd.DataFrame:
    """Calculate the pendulum for the 2022 election.

    For states in state_seats (ie. redistributed states) the projected
    vote totals in redistribution are used; for all other states the
    AEC divisional totals in tpp_seat_total are used unchanged.

    Returns a DataFrame with one row per seat, sorted by Labor's TPP
    vote share, with columns for the state, Labor's TPP, the
    pre-redistribution TPP (where a seat of the same name existed),
    the change between the two, and a flag that is True where the TPP
    is an approximation (ie. projected).

    Raises an AssertionError if the national vote total of the
    pendulum does not match the AEC national total."""

    projected_tpp = []
    APPROX = "Approximated"

    national_vote_total = 0
    for state in sorted(tpp_seat_total["StateAb"].unique()):

        if state in state_seats:
            # The redistribution case - use projected data
            state_data = redistribution[redistribution["State"] == state]
            approx = True

        else:
            # the no redistribution case - use existing divisional data from AEC
            state_data = tpp_seat_total[tpp_seat_total["StateAb"] == state]
            approx = False

        # collate
        state_labor_tpp, state_vote_total = calculate_labor_tpp(state_data)
        national_vote_total += state_vote_total
        state_labor_tpp[APPROX] = approx
        state_labor_tpp["State"] = state
        projected_tpp.append(state_labor_tpp)

    # provide a national vote count. Useful to cross-check with the AEC
    # count above - should be the same if up-weighting worked as planned.
    print(f"Total national formal TPP votes: {national_vote_total:,}")
    # The projected total is a sum of floats: round it to the nearest whole
    # vote. Truncating with int() would turn a total that is a hair under
    # the true count (eg. x.9999999) into a spurious off-by-one failure.
    assert round(national_vote_total) == aec_total_vote_count

    # publish - note we need to fix some names so that the data aligns
    # across AEC datasets.
    fix_seats = {
        # you would think the AEC could be consistent in their naming conventions
        "Mcmahon": "McMahon",
        "Eden-monaro": "Eden-Monaro",
        "Mcewen": "McEwen",
        "Mcpherson": "McPherson",
        "O'connor": "O'Connor",
    }
    nat_labor_tpp = pd.concat(projected_tpp)
    nat_labor_tpp = nat_labor_tpp.rename(index=fix_seats)

    # only seats whose name existed before the redistribution can be compared
    comparable = nat_labor_tpp.index[nat_labor_tpp.index.isin(tpp_seat_total.index)]
    previously, _ = calculate_labor_tpp(tpp_seat_total.loc[comparable])
    nat_labor_tpp["Pre-redistribution TPP"] = previously[COL_NAME]
    nat_labor_tpp["Change in Labor's TPP"] = (
        nat_labor_tpp[COL_NAME]
        - nat_labor_tpp["Pre-redistribution TPP"]
    )
    nat_labor_tpp = nat_labor_tpp.sort_values(COL_NAME)
    order = ["State", COL_NAME, "Pre-redistribution TPP", "Change in Labor's TPP", APPROX]
    return nat_labor_tpp[order]


# Build the pendulum, then list the 2022 divisions that
# no longer appear in it under the same name
pendulum = get_pendulum()
lost = tpp_seat_total.index[~tpp_seat_total.index.isin(pendulum.index)]
print("Number of seats in next Parliament:", len(pendulum))
display(pendulum)
print("Lost seats:", lost.to_list())


Total national formal TPP votes: 14,659,042.0
Number of seats in next Parliament: 150


Unnamed: 0,State,Labor TPP,Pre-redistribution TPP,Change in Labor's TPP,Approximated
Maranoa,QLD,27.88,27.88,0.0,False
Gippsland,VIC,29.43,29.43,0.0,True
Mallee,VIC,31.01,31.01,0.0,True
Parkes,NSW,32.18,32.16,0.02,True
Barker,SA,33.38,33.38,0.0,False
Farrer,NSW,33.65,33.65,0.0,True
Nicholls,VIC,33.83,32.85,0.98,True
New England,NSW,34.13,33.57,0.56,True
Calare,NSW,34.55,34.55,0.0,True
Groom,QLD,35.83,35.83,0.0,False


Lost seats: ['Higgins', 'North Sydney']


In [14]:
# Optionally print the pendulum as an HTML table; disabled by default
PRODUCE_HTML = False
if PRODUCE_HTML:
    print(pendulum.to_html())

## Data Manipulation - Primary Votes