In [1]:
import xml.etree.ElementTree as ET
from pathlib import Path
import datetime as dt
import xmltodict
import pandas as pd
import os
import zipfile
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

def read_eml(path: Path):
    """Parse the EML (XML) file at ``path`` into a dictionary.

    The file is read as UTF-8 text and handed to ``xmltodict.parse``.

    Returns the parsed mapping, or ``None`` when the file cannot be decoded
    as UTF-8 (in which case the file name and the error are printed).
    """
    try:
        parsed = xmltodict.parse(path.read_text(encoding="utf-8"))
    except UnicodeDecodeError as decode_error:
        # Report which file failed and why, then signal failure with None.
        print(path.name)
        print(decode_error)
        parsed = None
    return parsed
    
# Current working directory of the kernel (this is os.getcwd(), not the
# user's home directory)
current_directory = os.getcwd()

# Read demographicData.csv from that working directory
abs_demo = pd.read_csv(os.path.join(current_directory, 'demographicData.csv'))

In [2]:
# HAVE YOU MOVED THE ZIP FILE INTO THE WORKING DIRECTORY??

zip_filename = 'aec-mediafeed-Detailed-Verbose-29581-20231012184312.zip'

# Let's proceed.


def _polling_district_record(district):
    """Flatten one parsed ``PollingDistrict`` element into a flat record.

    Parameters
    ----------
    district : dict
        One ``PollingDistrict`` entry as produced by ``xmltodict.parse``.

    Returns
    -------
    dict
        Seat identifiers, raw counts and derived ratios. The ``*_pc_tot``
        values are counts divided by ``Enrolment``; the ``*_so_far`` values
        are counts divided by ``Votes_returned`` and are NaN when no votes
        have been returned yet.
    """
    identifier = district['PollingDistrictIdentifier']
    results = district['ProposalResults']

    record = {
        'Seat': identifier['Name'],
        'SeatID': identifier['@Id'],
        'State': identifier['StateIdentifier']['@Id'],
        'Enrolment': float(district['Enrolment']['#text']),
        'Yes': float(results['Option'][0]['Votes']['#text']),
        'No': float(results['Option'][1]['Votes']['#text']),
        'Informal': float(results['Informal']['Votes']['#text']),
        'Complete': results['@PollingPlacesExpected'] == results['@PollingPlacesReturned'],
        'Votes_returned': float(results['Total']['Votes']['#text']),
        'Votes_returned_pc': float(results['Total']['Votes']['@Percentage']),
    }

    # Ratios against enrolment.
    for option in ('Informal', 'Yes', 'No'):
        record[option + '_pc_tot'] = record[option] / record['Enrolment']

    # Ratios against votes returned so far. Use NaN (not the string 'NULL')
    # when nothing has been returned, so these columns keep a float dtype and
    # can be plotted / modelled without conversion.
    votes_returned = record['Votes_returned']
    for option in ('Informal', 'Yes', 'No'):
        if votes_returned == 0:
            record[option + '_pc_so_far'] = np.nan
        else:
            record[option + '_pc_so_far'] = record[option] / votes_returned
    # NaN propagates through the subtraction for seats with no votes returned.
    record['Yes_lead_pts_so_far'] = record['Yes_pc_so_far'] - 0.5

    return record


# Build every path from `current_directory` instead of calling os.chdir():
# the working directory is never changed, and `current_directory` is never
# overwritten, so this cell can be re-run safely.
zip_file_path = os.path.join(current_directory, zip_filename)

# unzipped directory (zip file name without the '.zip' suffix)
unzip_directory_name = zip_filename[:-4]
unzip_directory_path = os.path.join(current_directory, unzip_directory_name)

# Create a ZipFile object and extract the contents
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_directory_path)

print('Unzipped to: ' + unzip_directory_name)
print('Now filtering to spreadsheet ...')

# Take the first file in the archive's xml/ folder. os.listdir() returns
# names in arbitrary order, so sort to make the choice deterministic.
xml_directory = os.path.join(unzip_directory_path, 'xml')
xml_file_names = sorted(os.listdir(xml_directory))
if not xml_file_names:
    raise FileNotFoundError('No files found in ' + xml_directory)
file_name = xml_file_names[0]

# The context manager closes the file even if reading fails.
with open(os.path.join(xml_directory, file_name), encoding='utf-8') as f:
    text = f.read()
temp = xmltodict.parse(text)

print('File opened...')

polling_districts = temp['MediaFeed']['Results']['Election']['Referendum']['Contests']['Contest']['PollingDistricts']['PollingDistrict']
# xmltodict yields a single mapping (not a list) when there is only one child.
if isinstance(polling_districts, dict):
    polling_districts = [polling_districts]

# Build all records first and construct the frame once; growing a DataFrame
# with pd.concat inside the loop copies the whole frame on every iteration.
df = pd.DataFrame([_polling_district_record(district) for district in polling_districts])

df = df.sort_values(by='Votes_returned_pc', ascending=False)
print(df)



Unzipped to: aec-mediafeed-Detailed-Verbose-29581-20231012184312
Now filtering to spreadsheet ...
File opened...
            Seat SeatID State  Enrolment      Yes       No  Informal  \
0           Bean    318   ACT   111176.0  49183.0  39335.0     948.0   
2         Fenner    102   ACT   103863.0  45879.0  36690.0     880.0   
15   Eden-Monaro    117   NSW   117428.0  42508.0  48994.0    1010.0   
1       Canberra    101   ACT   101798.0  43243.0  34606.0     816.0   
22          Hume    125   NSW   126270.0  44792.0  51617.0    1047.0   
..           ...    ...   ...        ...      ...      ...       ...   
141       Durack    312    WA   124218.0   1498.0   1527.0      66.0   
36        Parkes    139   NSW   110985.0   1220.0   1503.0      26.0   
32   New England    135   NSW   116060.0   1029.0   1219.0      36.0   
57        Dawson    158   QLD   113963.0      0.0      0.0       0.0   
66       Herbert    165   QLD   119674.0      0.0      0.0       0.0   

     Complete  Votes_r

In [None]:
# Table by state


In [3]:
# Merge the two datasets
# `df` itself is left untouched: the 'State' column is dropped on a copy
# inside the chain, so re-running this cell no longer fails with a KeyError
# and `df` keeps its 'State' column for other cells.
# NOTE(review): 'State' is presumably dropped to avoid clashing with a column
# of the same name in abs_demo — confirm.
master = (
    df.drop(columns='State')
    .merge(abs_demo, how='left', left_on='Seat', right_on='CED_NAME_2021')
    .drop(columns='CED_NAME_2021')
)

# Change electoral results to decimals
master['Australian Labor Party Percentage'] = master['Australian Labor Party Percentage'] / 100
master['Coalition point lead'] = master['Coalition point lead'] / 100

In [4]:
# Keep only the rows where at least one vote has been returned
filtered_master = master.loc[master['Votes_returned'].gt(0)]

In [6]:
# Yes lead so far vs. Coalition point lead, coloured by party
# (hovering a point shows its seat name)
fig = px.scatter(
    filtered_master,
    x='Yes_lead_pts_so_far',
    y='Coalition point lead',
    color='Party',
    hover_data=['Seat'],
)

# Draw the zero lines on both axes and set the title in one layout update
fig.update_layout(
    xaxis={'zeroline': True},
    yaxis={'zeroline': True},
    title='Yes vs. 2022 Coalition 2PP',
)

# Render the figure
fig.show()

In [13]:
# Columns used as independent variables
columns_of_interest = ['indig', 'age', 't_ed', 'income', 'Coalition point lead', 'dem_rating']

# Target variable
y = filtered_master['Yes_lead_pts_so_far']

# Independent variables, with 'dem_rating' one-hot encoded
# (drop_first=True omits the first category's dummy column)
X = pd.get_dummies(
    filtered_master[columns_of_interest],
    columns=['dem_rating'],
    drop_first=True,
)

In [15]:
# Pairwise Pearson correlations between the columns of X
# ('pearson' is the default method of DataFrame.corr)
correlation_matrix = X.corr(method='pearson')
print(correlation_matrix)

                                  indig       age      t_ed    income  \
indig                          1.000000  0.047944 -0.442604 -0.242934   
age                            0.047944  1.000000 -0.338137 -0.326882   
t_ed                          -0.442604 -0.338137  1.000000  0.813072   
income                        -0.242934 -0.326882  0.813072  1.000000   
Coalition point lead           0.268623  0.574883 -0.398324 -0.231688   
dem_rating_Outer Metropolitan -0.187274 -0.248108 -0.087545 -0.101393   
dem_rating_Provincial          0.060008  0.106512 -0.231486 -0.181170   
dem_rating_Rural               0.461417  0.531382 -0.496655 -0.356301   

                               Coalition point lead  \
indig                                      0.268623   
age                                        0.574883   
t_ed                                      -0.398324   
income                                    -0.231688   
Coalition point lead                       1.000000   
dem_rating_O

In [7]:
# Write the filtered table to filtered_master.csv without the index column
filtered_master.to_csv(path_or_buf='filtered_master.csv', index=False)

In [16]:
# 'indig' vs. 't_ed', coloured by party
# (hovering a point shows its seat name)
fig = px.scatter(
    filtered_master,
    x='indig',
    y='t_ed',
    color='Party',
    hover_data=['Seat'],
)

# Draw the zero lines on both axes and set the title in one layout update
fig.update_layout(
    xaxis={'zeroline': True},
    yaxis={'zeroline': True},
    title='Indigenous % vs. >= Bach degree %',
)

# Render the figure
fig.show()