In [25]:
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime as dt
import xmltodict
import pandas as pd
import os
import zipfile
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import linregress
import numpy as np

def read_eml(path: Path):

    """Convert EML file to dictionary"""

    try:
        return xmltodict.parse(path.read_text(encoding="utf-8"))

    except UnicodeDecodeError as e:
        print(path.name)
        print(e)
        return None
    
# Get the user's home directory
current_directory = os.getcwd()

abs_demo = pd.read_csv('demographicData.csv')

In [26]:
# HAVE YOU MOVED THE ZIP FILE INTO THE WORKING DIRECTORY??

zip_filename = 'aec-mediafeed-Detailed-Verbose-29581-20231012184312.zip'

# Let's proceed. 

# Construct the full path to your file
zip_file_path = os.path.join(current_directory, zip_filename)

# unzipped directory
unzip_directory_name = zip_filename[:-4]

# Create a ZipFile object and extract the contents
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_directory_name)

print('Unzipped to: ' + unzip_directory_name)
print('Now filtering to spreadsheet ...')

os.chdir(unzip_directory_name + '//xml')
file_name = os.listdir()[0]

f = open(file_name, encoding='utf-8')
text = f.read()
f.close()
temp = xmltodict.parse(text)

print('File opened...')

# Get the current working directory
current_directory = os.getcwd()

# Get the parent directory
parent_directory = os.path.dirname(current_directory)

# Get the grandparent directory
grandparent_directory = os.path.dirname(parent_directory)

# Change the working directory to the grandparent directory
os.chdir(grandparent_directory)

df = pd.DataFrame()

for PollingDistrict in temp['MediaFeed']['Results']['Election']['Referendum']['Contests']['Contest']['PollingDistricts']['PollingDistrict']:
    temp_dict = {
        'Seat': PollingDistrict['PollingDistrictIdentifier']['Name'],
        'SeatID': PollingDistrict['PollingDistrictIdentifier']['@Id'],
        'State': PollingDistrict['PollingDistrictIdentifier']['StateIdentifier']['@Id'],
        'Enrolment': float(PollingDistrict['Enrolment']['#text']),
        'Yes': float(PollingDistrict['ProposalResults']['Option'][0]['Votes']['#text']),
        'No': float(PollingDistrict['ProposalResults']['Option'][1]['Votes']['#text']),
        'Informal': float(PollingDistrict['ProposalResults']['Informal']['Votes']['#text']),
        'Complete': PollingDistrict['ProposalResults']['@PollingPlacesExpected'] == PollingDistrict['ProposalResults']['@PollingPlacesReturned'],
        'Votes_returned': float(PollingDistrict['ProposalResults']['Total']['Votes']['#text']),
        'Votes_returned_pc': float(PollingDistrict['ProposalResults']['Total']['Votes']['@Percentage'])
    }
    temp_dict['Informal_pc_tot'] = temp_dict['Informal'] / temp_dict['Enrolment']
    temp_dict['Yes_pc_tot'] = temp_dict['Yes'] / temp_dict['Enrolment']
    temp_dict['No_pc_tot'] = temp_dict['No'] / temp_dict['Enrolment']
    if temp_dict['Votes_returned'] == 0:
        temp_dict['Informal_pc_so_far'] = 'NULL'
        temp_dict['Yes_pc_so_far'] = 'NULL'
        temp_dict['No_pc_so_far'] = 'NULL'
        temp_dict['Yes_lead_pts_so_far'] = 'NULL'
    else:
        temp_dict['Informal_pc_so_far'] = temp_dict['Informal'] / temp_dict['Votes_returned']
        temp_dict['Yes_pc_so_far'] = temp_dict['Yes'] / temp_dict['Votes_returned']
        temp_dict['No_pc_so_far'] = temp_dict['No'] / temp_dict['Votes_returned']
        temp_dict['Yes_lead_pts_so_far'] = temp_dict['Yes_pc_so_far'] - 0.5
    df = pd.concat(
        [df, pd.DataFrame(temp_dict, index=[0])], ignore_index=True
    )

df = df.sort_values(by='Votes_returned_pc', ascending=False)
print(df)



Unzipped to: aec-mediafeed-Detailed-Verbose-29581-20231012184312
Now filtering to spreadsheet ...
File opened...
            Seat SeatID State  Enrolment      Yes       No  Informal  \
0           Bean    318   ACT   111176.0  49183.0  39335.0     948.0   
2         Fenner    102   ACT   103863.0  45879.0  36690.0     880.0   
15   Eden-Monaro    117   NSW   117428.0  42508.0  48994.0    1010.0   
1       Canberra    101   ACT   101798.0  43243.0  34606.0     816.0   
22          Hume    125   NSW   126270.0  44792.0  51617.0    1047.0   
..           ...    ...   ...        ...      ...      ...       ...   
141       Durack    312    WA   124218.0   1498.0   1527.0      66.0   
36        Parkes    139   NSW   110985.0   1220.0   1503.0      26.0   
32   New England    135   NSW   116060.0   1029.0   1219.0      36.0   
57        Dawson    158   QLD   113963.0      0.0      0.0       0.0   
66       Herbert    165   QLD   119674.0      0.0      0.0       0.0   

     Complete  Votes_r

In [27]:
# Merge the two datasets
df = df.drop(columns='State')
master = df.merge(abs_demo, how='left', left_on='Seat', right_on='CED_NAME_2021')
master = master.drop(columns = 'CED_NAME_2021')

# Change electoral results to decimals
master['Australian Labor Party Percentage'] = master['Australian Labor Party Percentage']/100
master['Coalition point lead'] = master['Coalition point lead']/100
master['ALP margin'] = -master['Coalition point lead']

In [28]:
# FILTER THE DATAFRAME
filtered_master = master[master['Votes_returned'] > 0]
filtered_master['Swing'] = filtered_master['Yes_lead_pts_so_far'] - filtered_master['ALP margin'].values



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
#SWING ANALYSIS

covariates = ['indig', 'age', 't_ed', 'income']

results = []
fig = make_subplots(rows=2, cols=2)

for covariate in covariates:
    slope, intercept, r_value, p_value, std_err = linregress(filtered_master[covariate].astype(float), filtered_master['Swing'].astype(float))
    r_squared = r_value**2
    result = {
        'Covariate': covariate,
        'Slope': slope,
        'Intercept': intercept,
        'R Value': r_value,
        'R-Squared': r_squared,
        'P-Value': p_value,
        'Standard Error': std_err
    }
    results.append(result)

    # Create a scatter plot for the data points
    temp_chart = go.Scatter(x=filtered_master[covariate], y=filtered_master['Swing'], mode='markers')
    
    # Add a line of best fit to the plot
    y_fit = intercept + slope * filtered_master[covariate]
    temp_chart_line = go.Scatter(x=filtered_master[covariate], y=y_fit, mode='lines')
    
    fig.add_trace(temp_chart, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_line, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)

# Update the layout of the subplots for better visualization
fig.update_layout(title_text="Linear Regression Analysis")
fig.update_xaxes(title_text=r"% of seat Indigenous", row=1, col=1)
fig.update_xaxes(title_text="Median seat age", row=1, col=2)
fig.update_xaxes(title_text=r"% tertiary educated", row=2, col=1)
fig.update_xaxes(title_text="Median income", row=2, col=2)
fig.update_yaxes(title_text="Swing", row=1, col=1)
fig.update_yaxes(title_text="Swing", row=1, col=2)
fig.update_yaxes(title_text="Swing", row=2, col=1)
fig.update_yaxes(title_text="Swing", row=2, col=2)

# Convert the results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Print or use the results
print(results_df)

# Show the plot
fig.show()


  Covariate     Slope  Intercept   R Value  R-Squared       P-Value  \
0     indig  0.004596  -0.058318  0.183259   0.033584  2.527967e-02   
1       age  0.015415  -0.646895  0.572063   0.327256  2.519244e-14   
2      t_ed -0.003196   0.040118 -0.356196   0.126876  8.256129e-06   
3    income -0.000134   0.067626 -0.214990   0.046221  8.462281e-03   

   Standard Error  
0        0.002034  
1        0.001823  
2        0.000692  
3        0.000050  


In [38]:
# 2PP VS YES ANALYSIS

#SWING ANALYSIS

covariates = ['indig', 'age', 't_ed', 'income']

results_alp = []
results_yes = []
fig = make_subplots(rows=2, cols=2)

for covariate in covariates:
    slope, intercept, r_value, p_value, std_err = linregress(filtered_master[covariate].astype(float), filtered_master['ALP margin'].astype(float))
    r_squared = r_value**2
    result_alp = {
        'Covariate': covariate,
        'Slope': slope,
        'Intercept': intercept,
        'R Value': r_value,
        'R-Squared': r_squared,
        'P-Value': p_value,
        'Standard Error': std_err
    }
    results_alp.append(result)
    slope, intercept, r_value, p_value, std_err = linregress(filtered_master[covariate].astype(float), filtered_master['Yes_lead_pts_so_far'].astype(float))
    r_squared = r_value**2
    result_yes = {
        'Covariate': covariate,
        'Slope': slope,
        'Intercept': intercept,
        'R Value': r_value,
        'R-Squared': r_squared,
        'P-Value': p_value,
        'Standard Error': std_err
    }
    results_yes.append(result)    

    # Create a scatter plot for the data points
    temp_chart_alp = go.Scatter(x=filtered_master[covariate], y=filtered_master['ALP margin'], mode='markers')
    
    # Add a line of best fit to the plot
    y_fit = result_alp['Intercept'] + result_alp['Slope'] * filtered_master[covariate]
    temp_chart_line_alp = go.Scatter(x=filtered_master[covariate], y=y_fit, mode='lines', name='2PP 2022')

    # Create a scatter plot for the data points
    temp_chart_yes = go.Scatter(x=filtered_master[covariate], y=filtered_master['Yes_lead_pts_so_far'], mode='markers')
    
    # Add a line of best fit to the plot
    y_fit = result_yes['Intercept'] + result_yes['Slope'] * filtered_master[covariate]
    temp_chart_line_yes = go.Scatter(x=filtered_master[covariate], y=y_fit, mode='lines', name='Yes 2023')    
    
    fig.add_trace(temp_chart_alp, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_line_alp, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_yes, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_line_yes, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)    

# Update the layout of the subplots for better visualization
fig.update_layout(title_text="Linear Regression Analysis")
fig.update_xaxes(title_text=r"% of seat Indigenous", row=1, col=1)
fig.update_xaxes(title_text="Median seat age", row=1, col=2)
fig.update_xaxes(title_text=r"% tertiary educated", row=2, col=1)
fig.update_xaxes(title_text="Median income", row=2, col=2)
fig.update_yaxes(title_text="Swing", row=1, col=1)
fig.update_yaxes(title_text="Swing", row=1, col=2)
fig.update_yaxes(title_text="Swing", row=2, col=1)
fig.update_yaxes(title_text="Swing", row=2, col=2)

# Convert the results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Print or use the results
print(results_df)

# Show the plot
fig.show()


  Covariate     Slope  Intercept   R Value  R-Squared       P-Value  \
0     indig  0.004596  -0.058318  0.183259   0.033584  2.527967e-02   
1       age  0.015415  -0.646895  0.572063   0.327256  2.519244e-14   
2      t_ed -0.003196   0.040118 -0.356196   0.126876  8.256129e-06   
3    income -0.000134   0.067626 -0.214990   0.046221  8.462281e-03   

   Standard Error  
0        0.002034  
1        0.001823  
2        0.000692  
3        0.000050  


In [37]:
covariates = ['indig', 'age', 't_ed', 'income']

results_alp = []
results_yes = []
fig = make_subplots(rows=2, cols=2)

for covariate in covariates:
    slope_alp, intercept_alp, r_value_alp, p_value_alp, std_err_alp = linregress(filtered_master[covariate].astype(float), filtered_master['ALP margin'].astype(float))
    r_squared_alp = r_value_alp**2
    result_alp = {
        'Covariate': covariate,
        'Slope': slope_alp,
        'Intercept': intercept_alp,
        'R Value': r_value_alp,
        'R-Squared': r_squared_alp,
        'P-Value': p_value_alp,
        'Standard Error': std_err_alp
    }
    results_alp.append(result_alp)
    
    slope_yes, intercept_yes, r_value_yes, p_value_yes, std_err_yes = linregress(filtered_master[covariate].astype(float), filtered_master['Yes_lead_pts_so_far'].astype(float))
    r_squared_yes = r_value_yes**2
    result_yes = {
        'Covariate': covariate,
        'Slope': slope_yes,
        'Intercept': intercept_yes,
        'R Value': r_value_yes,
        'R-Squared': r_squared_yes,
        'P-Value': p_value_yes,
        'Standard Error': std_err_yes
    }
    results_yes.append(result_yes)

    # Create a scatter plot for the data points
    temp_chart_alp = go.Scatter(x=filtered_master[covariate], y=filtered_master['ALP margin'], mode='markers', name='Data (ALP)')
    
    # Add a line of best fit to the plot
    y_fit_alp = intercept_alp + slope_alp * filtered_master[covariate]
    temp_chart_line_alp = go.Scatter(x=filtered_master[covariate], y=y_fit_alp, mode='lines', name='Fit (ALP)')

    # Create a scatter plot for the data points
    temp_chart_yes = go.Scatter(x=filtered_master[covariate], y=filtered_master['Yes_lead_pts_so_far'], mode='markers', name='Data (Yes)')
    
    # Add a line of best fit to the plot
    y_fit_yes = intercept_yes + slope_yes * filtered_master[covariate]
    temp_chart_line_yes = go.Scatter(x=filtered_master[covariate], y=y_fit_yes, mode='lines', name='Fit (Yes)')    

    fig.add_trace(temp_chart_alp, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_line_alp, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_yes, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)
    fig.add_trace(temp_chart_line_yes, row=(covariates.index(covariate) // 2) + 1, col=(covariates.index(covariate) % 2) + 1)

# Update the layout of the subplots for better visualization
fig.update_layout(title_text="Linear Regression Analysis")
fig.update_xaxes(title_text=r"% of seat Indigenous", row=1, col=1)
fig.update_xaxes(title_text="Median seat age", row=1, col=2)
fig.update_xaxes(title_text=r"% tertiary educated", row=2, col=1)
fig.update_xaxes(title_text="Median income", row=2, col=2)
fig.update_yaxes(title_text="Swing (ALP)", row=1, col=1)
fig.update_yaxes(title_text="Swing (ALP)", row=1, col=2)
fig.update_yaxes(title_text="Swing (Yes)", row=2, col=1)
fig.update_yaxes(title_text="Swing (Yes)", row=2, col=2)

# Convert the results to DataFrames for easier analysis
results_df_alp = pd.DataFrame(results_alp)
results_df_yes = pd.DataFrame(results_yes)

# Print or use the results
print(results_df_alp)
print(results_df_yes)

# Show the plot
fig.show()


  Covariate     Slope  Intercept   R Value  R-Squared       P-Value  \
0     indig -0.006831   0.046806 -0.268623   0.072158  9.248301e-04   
1       age -0.015706   0.639561 -0.574883   0.330491  1.759383e-14   
2      t_ed  0.003624  -0.070047  0.398324   0.158662  4.879114e-07   
3    income  0.000146  -0.096665  0.231688   0.053679  4.467914e-03   

   Standard Error  
0        0.002020  
1        0.001844  
2        0.000688  
3        0.000051  
  Covariate     Slope  Intercept   R Value  R-Squared   P-Value  \
0     indig -0.002235  -0.011512 -0.205975   0.042426  0.011730   
1       age -0.000292  -0.007334 -0.025011   0.000626  0.762059   
2      t_ed  0.000428  -0.029930  0.110199   0.012144  0.180930   
3    income  0.000012  -0.029039  0.046053   0.002121  0.577040   

   Standard Error  
0        0.000876  
1        0.000961  
2        0.000318  
3        0.000022  


In [6]:
# vs. 2022 2PP

# Create a scatter plot using Plotly Express
fig = px.scatter(filtered_master, x='Yes_lead_pts_so_far', y='Coalition point lead', color='Party', hover_data=['Seat'])

# Add axis lines at 0,0
fig.update_layout(xaxis=dict(zeroline=True), yaxis=dict(zeroline=True))

# Add a title
fig.update_layout(title='Yes vs. 2022 Coalition 2PP')

# Show the plot
fig.show()

In [13]:
# Select the columns of interest
columns_of_interest = ['indig', 'age', 't_ed', 'income', 'Coalition point lead', 'dem_rating']

# Separate the target variable and independent variables
X = filtered_master[columns_of_interest]
y = filtered_master['Yes_lead_pts_so_far'] - filtered_master

# One-hot encode the 'dem_rating' variable
X = pd.get_dummies(X, columns=['dem_rating'], drop_first=True)

In [15]:
# Correlation matrix
correlation_matrix = X.corr()
print(correlation_matrix)

                                  indig       age      t_ed    income  \
indig                          1.000000  0.047944 -0.442604 -0.242934   
age                            0.047944  1.000000 -0.338137 -0.326882   
t_ed                          -0.442604 -0.338137  1.000000  0.813072   
income                        -0.242934 -0.326882  0.813072  1.000000   
Coalition point lead           0.268623  0.574883 -0.398324 -0.231688   
dem_rating_Outer Metropolitan -0.187274 -0.248108 -0.087545 -0.101393   
dem_rating_Provincial          0.060008  0.106512 -0.231486 -0.181170   
dem_rating_Rural               0.461417  0.531382 -0.496655 -0.356301   

                               Coalition point lead  \
indig                                      0.268623   
age                                        0.574883   
t_ed                                      -0.398324   
income                                    -0.231688   
Coalition point lead                       1.000000   
dem_rating_O

In [40]:
covariate = 'income'

[float(x) for x in filtered_master[covariate]]

[1183.0,
 1145.0,
 873.0,
 1289.0,
 855.0,
 682.0,
 643.0,
 824.0,
 722.0,
 776.0,
 635.0,
 727.0,
 782.0,
 780.0,
 826.0,
 723.0,
 1110.0,
 810.0,
 1055.0,
 644.0,
 801.0,
 931.0,
 777.0,
 717.0,
 819.0,
 929.0,
 763.0,
 747.0,
 788.0,
 797.0,
 934.0,
 792.0,
 780.0,
 694.0,
 1241.0,
 623.0,
 1115.0,
 1058.0,
 742.0,
 748.0,
 897.0,
 778.0,
 887.0,
 763.0,
 906.0,
 858.0,
 936.0,
 1068.0,
 602.0,
 798.0,
 772.0,
 724.0,
 1018.0,
 689.0,
 749.0,
 1017.0,
 888.0,
 691.0,
 812.0,
 866.0,
 770.0,
 867.0,
 867.0,
 997.0,
 787.0,
 825.0,
 891.0,
 958.0,
 983.0,
 832.0,
 830.0,
 734.0,
 931.0,
 907.0,
 813.0,
 695.0,
 1330.0,
 769.0,
 726.0,
 1026.0,
 576.0,
 991.0,
 1218.0,
 1102.0,
 761.0,
 745.0,
 768.0,
 838.0,
 521.0,
 960.0,
 745.0,
 1402.0,
 775.0,
 806.0,
 1228.0,
 556.0,
 844.0,
 852.0,
 730.0,
 1517.0,
 881.0,
 1254.0,
 710.0,
 1140.0,
 923.0,
 759.0,
 942.0,
 732.0,
 809.0,
 1042.0,
 694.0,
 936.0,
 764.0,
 718.0,
 734.0,
 1062.0,
 722.0,
 786.0,
 813.0,
 759.0,
 776.0,
 626.0,
 6