In [67]:
from pathlib import Path

import pandas as pd

# Extracting marriage postal survey participant data

In [68]:
# load the raw postal-survey participation table; each row holds the eligible and
# participant counts for one electoral division, gender and age band
df = pd.read_csv('Resources/participant-information.csv')

In [69]:
# preview the first rows to check the file loaded with the expected columns
df.head()

Unnamed: 0,electoral_division,state,gender,age,age_lower,age_upper,eligible,participant
0,Canberra,Australian Capital Territory Divisions,female,18-19,18,19,2433,2092
1,Canberra,Australian Capital Territory Divisions,female,20-24,20,24,6171,5084
2,Canberra,Australian Capital Territory Divisions,female,25-29,25,29,6464,5343
3,Canberra,Australian Capital Territory Divisions,female,30-34,30,34,6359,5260
4,Canberra,Australian Capital Territory Divisions,female,35-39,35,39,6134,4990


# Extracting electoral division data

In [70]:
# load the electoral-division lookup table (division_id, electoral_division, state);
# it is merged in later so the results can be keyed by division_id
id_df = pd.read_csv('01-output_electoral_division/electoral_division.csv')
id_df.head()

Unnamed: 0,division_id,electoral_division,state
0,179,Adelaide,SA
1,197,Aston,VIC
2,198,Ballarat,VIC
3,103,Banks,NSW
4,180,Barker,SA


# Transforming the dataframe to look at the age demographics of postal survey participants

In [71]:
# looking at the column names of the marriage postal survey participant dataframe
# to see which fields are available for the age breakdown
df.columns

Index(['electoral_division', 'state', 'gender', 'age', 'age_lower',
       'age_upper', 'eligible', 'participant'],
      dtype='object')

In [72]:
# looking at the data types of the marriage postal survey participant dataframe;
# any count column reported as object cannot be summed until it is converted to a number
df.dtypes

electoral_division    object
state                 object
gender                object
age                   object
age_lower              int64
age_upper              int64
eligible              object
participant           object
dtype: object

In [73]:
# participant is read in as text, so force it to str, strip the comma
# thousands separators and store the result as float
df["participant"] = (
    df["participant"]
    .astype("str")
    .str.replace(",", "")
    .astype(float)
)

In [74]:
# checking the data types have changed successfully:
# participant should now be float64 instead of object
df.dtypes

electoral_division     object
state                  object
gender                 object
age                    object
age_lower               int64
age_upper               int64
eligible               object
participant           float64
dtype: object

In [75]:
# row count before missing values are dropped, for comparison with the count afterwards
len(df)

4500

In [76]:
# remove every row that has a missing value in any column, then show how many rows remain
df = df.dropna(axis=0, how="any")
len(df)

4500

In [77]:
# inspect the age bands used by the marriage postal survey: largest label, smallest label,
# then every distinct label (age holds strings, so max/min compare lexicographically)
print(df["age"].max(), df["age"].min(), df["age"].unique(), sep="\n")

85+
18-19
['18-19' '20-24' '25-29' '30-34' '35-39' '40-44' '45-49' '50-54' '55-59'
 '60-64' '65-69' '70-74' '75-79' '80-84' '85+']


In [78]:
# need to change the age ranges to match the ABS age groups
# create the new bins: consecutive edges, each pair of neighbours delimits one age group
# (the final edge of 125 acts as a catch-all upper limit for the oldest group)
bins = [18, 34, 49, 64, 79, 125]

# create labels for these bins, this is the same as ABS age groups, without 0-17 years
# (one label per pair of neighbouring edges, so there is one fewer label than edges)
group_labels = ["18-34", "35-49", "50-64", "65-79", "80+"]

In [79]:
# assign every row to one of the broader age groups, based on the upper age of its band,
# and keep the result as a new age_group column
# NOTE: the call uses pd.cut's default right-closed bins, so an age_upper equal to the
# lowest edge (18) would fall outside every bin; the smallest upper age shown here is 19
df["age_group"] = pd.cut(x=df["age_upper"], bins=bins, labels=group_labels)
df.head()

Unnamed: 0,electoral_division,state,gender,age,age_lower,age_upper,eligible,participant,age_group
0,Canberra,Australian Capital Territory Divisions,female,18-19,18,19,2433,2092.0,18-34
1,Canberra,Australian Capital Territory Divisions,female,20-24,20,24,6171,5084.0,18-34
2,Canberra,Australian Capital Territory Divisions,female,25-29,25,29,6464,5343.0,18-34
3,Canberra,Australian Capital Territory Divisions,female,30-34,30,34,6359,5260.0,18-34
4,Canberra,Australian Capital Territory Divisions,female,35-39,35,39,6134,4990.0,35-49


In [80]:
# grouping by electoral_division and age_group
# age_group is categorical (it is produced by pd.cut), so observed=False is stated explicitly:
# it keeps every division / age-group combination in the result and avoids the FutureWarning
# that newer pandas versions emit when the argument is left to its changing default
grouped_df = df.groupby(["electoral_division", "age_group"], observed=False)

In [81]:
# applying aggregate to groupby - sum of participants by new age groups
# (adds up the participant counts of all rows that share a division and age group,
# producing a Series indexed by electoral_division and age_group)
participants = grouped_df["participant"].sum()
participants

electoral_division  age_group
Adelaide            18-34        24474.0
                    35-49        20770.0
                    50-64        22353.0
                    65-79        15429.0
                    80+           5396.0
                                  ...   
Wright              18-34        17881.0
                    35-49        21457.0
                    50-64        23840.0
                    65-79        16050.0
                    80+           3317.0
Name: participant, Length: 750, dtype: float64

In [82]:
# wrap the summed participant counts in a one-column dataframe named number_participants
participants_age = pd.DataFrame(dict(number_participants=participants))
participants_age

Unnamed: 0_level_0,Unnamed: 1_level_0,number_participants
electoral_division,age_group,Unnamed: 2_level_1
Adelaide,18-34,24474.0
Adelaide,35-49,20770.0
Adelaide,50-64,22353.0
Adelaide,65-79,15429.0
Adelaide,80+,5396.0
...,...,...
Wright,18-34,17881.0
Wright,35-49,21457.0
Wright,50-64,23840.0
Wright,65-79,16050.0


In [83]:
# move age_group out of the index into an ordinary column (electoral_division stays as the index)
participants_age = participants_age.reset_index(level="age_group")
participants_age.head()

Unnamed: 0_level_0,age_group,number_participants
electoral_division,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelaide,18-34,24474.0
Adelaide,35-49,20770.0
Adelaide,50-64,22353.0
Adelaide,65-79,15429.0
Adelaide,80+,5396.0


In [84]:
# electoral_division is the only index level left, so a plain reset turns it into a column
# and leaves a default integer index behind
participants_age = participants_age.reset_index()
participants_age.head()

Unnamed: 0,electoral_division,age_group,number_participants
0,Adelaide,18-34,24474.0
1,Adelaide,35-49,20770.0
2,Adelaide,50-64,22353.0
3,Adelaide,65-79,15429.0
4,Adelaide,80+,5396.0


In [85]:
# pivot to one row per electoral division: index on (age_group, electoral_division),
# then unstack the age_group level so each age group becomes its own column
participants_age = (
    participants_age
    .set_index(["age_group", "electoral_division"])
    .unstack(level="age_group")
)
participants_age.head()

Unnamed: 0_level_0,number_participants,number_participants,number_participants,number_participants,number_participants
age_group,18-34,35-49,50-64,65-79,80+
electoral_division,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Adelaide,24474.0,20770.0,22353.0,15429.0,5396.0
Aston,20745.0,18667.0,21236.0,13420.0,3638.0
Ballarat,22345.0,22008.0,24709.0,18264.0,5257.0
Banks,20562.0,20416.0,22008.0,14571.0,6289.0
Barker,14079.0,16493.0,24109.0,20212.0,6231.0


In [86]:
# looking at the column names - after unstacking they form a two-level MultiIndex
# (measure name, age_group) that still needs to be flattened into plain names
participants_age.columns

MultiIndex([('number_participants', '18-34'),
            ('number_participants', '35-49'),
            ('number_participants', '50-64'),
            ('number_participants', '65-79'),
            ('number_participants',   '80+')],
           names=[None, 'age_group'])

In [87]:
# checking the row index: it should now hold one entry per electoral division
participants_age.index

Index(['Adelaide', 'Aston', 'Ballarat', 'Banks', 'Barker', 'Barton', 'Bass',
       'Batman', 'Bendigo', 'Bennelong',
       ...
       'Wakefield', 'Wannon', 'Warringah', 'Watson', 'Wentworth', 'Werriwa',
       'Whitlam', 'Wide Bay', 'Wills', 'Wright'],
      dtype='object', name='electoral_division', length=150)

In [88]:
# renaming the columns: flatten the (measure, age_group) MultiIndex into "ages_<group>" names
# the names are built from the column labels themselves rather than a hand-typed list,
# so they cannot drift out of step with the column order if the age groups ever change
participants_age.columns = [f"ages_{age_group}" for _, age_group in participants_age.columns]
participants_age.head()

Unnamed: 0_level_0,ages_18-34,ages_35-49,ages_50-64,ages_65-79,ages_80+
electoral_division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adelaide,24474.0,20770.0,22353.0,15429.0,5396.0
Aston,20745.0,18667.0,21236.0,13420.0,3638.0
Ballarat,22345.0,22008.0,24709.0,18264.0,5257.0
Banks,20562.0,20416.0,22008.0,14571.0,6289.0
Barker,14079.0,16493.0,24109.0,20212.0,6231.0


In [89]:
# turn the electoral_division index back into a regular column so it can serve as the merge key
participants_age = participants_age.reset_index()
participants_age.columns

Index(['electoral_division', 'ages_18-34', 'ages_35-49', 'ages_50-64',
       'ages_65-79', 'ages_80+'],
      dtype='object')

In [90]:
# looking at the length of the dataframe before merging with division id,
# so the row count can be compared with the count after the merge
len(participants_age)

150

In [91]:
# merging with id_df dataframe to attach division_id (and state) to each electoral division
# validate="one_to_one" makes pandas raise a MergeError if a division name is duplicated
# on either side, instead of silently multiplying rows in the merged table
combined_df = pd.merge(
    participants_age,
    id_df,
    how="outer",
    on="electoral_division",
    validate="one_to_one",
)
combined_df.head()

Unnamed: 0,electoral_division,ages_18-34,ages_35-49,ages_50-64,ages_65-79,ages_80+,division_id,state
0,Adelaide,24474.0,20770.0,22353.0,15429.0,5396.0,179,SA
1,Aston,20745.0,18667.0,21236.0,13420.0,3638.0,197,VIC
2,Ballarat,22345.0,22008.0,24709.0,18264.0,5257.0,198,VIC
3,Banks,20562.0,20416.0,22008.0,14571.0,6289.0,103,NSW
4,Barker,14079.0,16493.0,24109.0,20212.0,6231.0,180,SA


In [92]:
# row count after the merge: an outer merge adds a row for every division name that appears
# in only one of the two tables, so a count above the pre-merge count would signal a mismatch
len(combined_df)

150

In [93]:
# columns we want to keep, in the order we want to keep them
# (the division name and state are dropped; division_id identifies each row from here on)
columns = [
    'division_id',
    'ages_18-34',
    'ages_35-49',
    'ages_50-64',
    'ages_65-79',
    'ages_80+',
]
combined_df = combined_df.loc[:, columns]
combined_df.head()

Unnamed: 0,division_id,ages_18-34,ages_35-49,ages_50-64,ages_65-79,ages_80+
0,179,24474.0,20770.0,22353.0,15429.0,5396.0
1,197,20745.0,18667.0,21236.0,13420.0,3638.0
2,198,22345.0,22008.0,24709.0,18264.0,5257.0
3,103,20562.0,20416.0,22008.0,14571.0,6289.0
4,180,14079.0,16493.0,24109.0,20212.0,6231.0


In [94]:
# are all the division id values unique?
# checked here because division_id is about to become the index of the table
combined_df['division_id'].is_unique

True

In [95]:
# use division_id as the row index, so the exported table is keyed by division
combined_df = combined_df.set_index(keys="division_id")
combined_df.head()

Unnamed: 0_level_0,ages_18-34,ages_35-49,ages_50-64,ages_65-79,ages_80+
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
179,24474.0,20770.0,22353.0,15429.0,5396.0
197,20745.0,18667.0,21236.0,13420.0,3638.0
198,22345.0,22008.0,24709.0,18264.0,5257.0
103,20562.0,20416.0,22008.0,14571.0,6289.0
180,14079.0,16493.0,24109.0,20212.0,6231.0


# Exploratory data analysis

In [96]:
# summary statistics (count, mean, std, min, quartiles, max) of the participant counts
# in each age-group column, taken across all electoral divisions
combined_df.describe()

Unnamed: 0,ages_18-34,ages_35-49,ages_50-64,ages_65-79,ages_80+
count,150.0,150.0,150.0,150.0,150.0
mean,20992.033333,20629.66,21890.7,15800.18,4849.273333
std,4977.168869,3797.793451,2808.196607,3768.641072,1513.840052
min,9850.0,10237.0,10196.0,4027.0,510.0
25%,17770.0,18675.75,20103.0,13303.5,3762.75
50%,20752.5,20371.5,21910.5,15228.5,4956.0
75%,22913.0,22380.25,23670.0,18294.75,5818.5
max,40693.0,34439.0,30462.0,27832.0,8493.0


# Exporting to CSV

In [99]:
# write the per-division age breakdown to CSV (division_id is the index, so it is written too)
# the output folder is created first so the export does not fail with an OSError
# on a fresh checkout where the folder does not exist yet
output_path = Path("07-output_marriage_postal_participants_by_age/marriage_postal_participants_by_age.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(output_path)