In [25]:
import pandas as pd

# Extracting marriage postal survey participant data

In [109]:
df = pd.read_csv('Resources/participant-information.csv')

In [110]:
df.head()

Unnamed: 0,electoral_division,state,gender,age,age_lower,age_upper,eligible,participant
0,Canberra,Australian Capital Territory Divisions,female,18-19,18,19,2433,2092
1,Canberra,Australian Capital Territory Divisions,female,20-24,20,24,6171,5084
2,Canberra,Australian Capital Territory Divisions,female,25-29,25,29,6464,5343
3,Canberra,Australian Capital Territory Divisions,female,30-34,30,34,6359,5260
4,Canberra,Australian Capital Territory Divisions,female,35-39,35,39,6134,4990


# Extracting electorate division data

In [111]:
id_df = pd.read_csv('01-output_electorate_division_information/electoral_division.csv')
id_df.head()

Unnamed: 0,division_id,electoral_division,state
0,179,Adelaide,SA
1,197,Aston,VIC
2,198,Ballarat,VIC
3,103,Banks,NSW
4,180,Barker,SA


# Transforming the dataframe to look at age demographics of postal survey

In [112]:
df.columns

Index(['electoral_division', 'state', 'gender', 'age', 'age_lower',
       'age_upper', 'eligible', 'participant'],
      dtype='object')

In [113]:
df.dtypes

electoral_division    object
state                 object
gender                object
age                   object
age_lower              int64
age_upper              int64
eligible              object
participant           object
dtype: object

In [114]:
# need to change  eligible and participant to int64

In [115]:
# checking for duplicates
# df['DivisionID'].is_unique

In [116]:
len(df)

4500

In [117]:
# dropping any na values 
df = df.dropna()
len(df)

4500

In [118]:
# mergine the two dataframes
df = pd.merge(df, id_df, how="outer", on="electoral_division")
df.head()

Unnamed: 0,electoral_division,state_x,gender,age,age_lower,age_upper,eligible,participant,division_id,state_y
0,Canberra,Australian Capital Territory Divisions,female,18-19,18,19,2433,2092,101,ACT
1,Canberra,Australian Capital Territory Divisions,female,20-24,20,24,6171,5084,101,ACT
2,Canberra,Australian Capital Territory Divisions,female,25-29,25,29,6464,5343,101,ACT
3,Canberra,Australian Capital Territory Divisions,female,30-34,30,34,6359,5260,101,ACT
4,Canberra,Australian Capital Territory Divisions,female,35-39,35,39,6134,4990,101,ACT


In [119]:
# columns we want to keep
columns = ['division_id','electoral_division','age', 'age_lower', 'age_upper', 'eligible', 'participant']
df = df[columns]
df.head()

Unnamed: 0,division_id,electoral_division,age,age_lower,age_upper,eligible,participant
0,101,Canberra,18-19,18,19,2433,2092
1,101,Canberra,20-24,20,24,6171,5084
2,101,Canberra,25-29,25,29,6464,5343
3,101,Canberra,30-34,30,34,6359,5260
4,101,Canberra,35-39,35,39,6134,4990


In [120]:
print(df["age"].max())
print(df["age"].min())

85+
18-19


In [121]:
print(df["age"].unique())

['18-19' '20-24' '25-29' '30-34' '35-39' '40-44' '45-49' '50-54' '55-59'
 '60-64' '65-69' '70-74' '75-79' '80-84' '85+']


In [122]:
# need to change the age ranges to match the ABS age groups
# create the new bins
bins = [18, 34, 49, 64, 79, 125]

# Create labels for these bins
group_labels = ["18-34", "35-49", "50-64", "65-79", "80+"]

In [123]:
# place the data series into a new column inside of the DataFrame
df["age_group"]=pd.cut(df["age_upper"], bins, labels=group_labels)
df

Unnamed: 0,division_id,electoral_division,age,age_lower,age_upper,eligible,participant,age_group
0,101,Canberra,18-19,18,19,2433,2092,18-34
1,101,Canberra,20-24,20,24,6171,5084,18-34
2,101,Canberra,25-29,25,29,6464,5343,18-34
3,101,Canberra,30-34,30,34,6359,5260,18-34
4,101,Canberra,35-39,35,39,6134,4990,35-49
...,...,...,...,...,...,...,...,...
4495,248,Tangney,65-69,65,69,3587,3208,65-79
4496,248,Tangney,70-74,70,74,2783,2550,65-79
4497,248,Tangney,75-79,75,79,1791,1649,65-79
4498,248,Tangney,80-84,80,84,1257,1148,80+


In [128]:
# grouping by age_group and electoral_division
grouped_df = df.groupby(["division_id","electoral_division","age_group"])

In [129]:
grouped_df = grouped_df["age_upper"].count()
grouped_df

division_id  electoral_division  age_group
101          Adelaide            18-34        0
                                 35-49        0
                                 50-64        0
                                 65-79        0
                                 80+          0
                                             ..
317          Wright              18-34        0
                                 35-49        0
                                 50-64        0
                                 65-79        0
                                 80+          0
Name: age_upper, Length: 112500, dtype: int64

In [None]:
## remember to rename cols at end

In [35]:
# columns we want to keep
columns = ['DivisionID','DivisionNm', 'Enrolment', 'TotalVotes', 'TotalPercentage']
turnout_df = df[columns]
turnout_df

Unnamed: 0,DivisionID,DivisionNm,Enrolment,TotalVotes,TotalPercentage
0,179,Adelaide,109217,98662,90.34
1,197,Aston,96043,89111,92.78
2,198,Ballarat,110755,103168,93.15
3,103,Banks,104891,96488,91.99
4,180,Barker,105600,98673,93.44
...,...,...,...,...,...
145,153,Werriwa,108557,97942,90.22
146,150,Whitlam,112051,104069,92.88
147,178,Wide Bay,102856,94617,91.99
148,234,Wills,113851,100915,88.64


In [36]:
# renaming columns
turnout_df.columns = ['division_id', 'electoral_division', 'no_enrolled', 'total_votes', 'turnout%']
turnout_df.head()

Unnamed: 0,division_id,electoral_division,no_enrolled,total_votes,turnout%
0,179,Adelaide,109217,98662,90.34
1,197,Aston,96043,89111,92.78
2,198,Ballarat,110755,103168,93.15
3,103,Banks,104891,96488,91.99
4,180,Barker,105600,98673,93.44


In [37]:
# set the division_id as index
turnout_df = turnout_df.set_index("division_id")
turnout_df.head()

Unnamed: 0_level_0,electoral_division,no_enrolled,total_votes,turnout%
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
179,Adelaide,109217,98662,90.34
197,Aston,96043,89111,92.78
198,Ballarat,110755,103168,93.15
103,Banks,104891,96488,91.99
180,Barker,105600,98673,93.44


In [39]:
# export df to csv
turnout_df.to_csv("04-ouput_2016_federal_election_turnout/fedelect_turnout.csv")