In [1]:
# Imports
import pandas as pd

# Extracting

### Extracting commonwealth electorate data

In [2]:
# Load the commonwealth electorate data (Table 8) csv into a dataframe
education_level_df = pd.read_csv(
    filepath_or_buffer="Resources/commonwealth_electorate_data_Table8.csv"
)

# Display the loaded dataframe
education_level_df

Unnamed: 0.1,Unnamed: 0,Year 12 or equivalent completion (people aged 20 to 24 years),Certificate III or higher qualification (people aged 15 years and over),Unnamed: 3,Unnamed: 4
0,,,,,
1,Adelaide,82.4%,49.3%,,
2,Aston,79.7%,49.3%,,
3,Ballarat,68.6%,45.1%,,
4,Banks,83.4%,47.8%,,
...,...,...,...,...,...
148,Whitlam,62.2%,42.8%,,
149,Wide Bay,63.4%,40.0%,,
150,Wills,84.2%,52.0%,,
151,Wright,70.0%,41.6%,,


# Transforming

### Removing NaNs

In [3]:
# Count the missing (NaN) entries in every column
education_level_df.isna().sum()

Unnamed: 0                                                                   2
Year 12 or equivalent completion (people aged 20 to 24 years)                2
Certificate III or higher qualification (people aged 15 years and over)      2
Unnamed: 3                                                                 153
Unnamed: 4                                                                 153
dtype: int64

This indicates that the columns 'Unnamed: 3' and 'Unnamed: 4' contain only NaN values and can be removed.

In [4]:
# Remove the two columns that hold only NaN values, then display the result
education_level_df = education_level_df.drop(["Unnamed: 3", "Unnamed: 4"], axis="columns")
education_level_df

Unnamed: 0.1,Unnamed: 0,Year 12 or equivalent completion (people aged 20 to 24 years),Certificate III or higher qualification (people aged 15 years and over)
0,,,
1,Adelaide,82.4%,49.3%
2,Aston,79.7%,49.3%
3,Ballarat,68.6%,45.1%
4,Banks,83.4%,47.8%
...,...,...,...
148,Whitlam,62.2%,42.8%
149,Wide Bay,63.4%,40.0%
150,Wills,84.2%,52.0%
151,Wright,70.0%,41.6%


In [5]:
# Discard every row containing a NaN value (row 0 and row 152). This should leave 151 rows.
education_level_df = education_level_df.dropna(axis="index", how="any")
education_level_df

Unnamed: 0.1,Unnamed: 0,Year 12 or equivalent completion (people aged 20 to 24 years),Certificate III or higher qualification (people aged 15 years and over)
1,Adelaide,82.4%,49.3%
2,Aston,79.7%,49.3%
3,Ballarat,68.6%,45.1%
4,Banks,83.4%,47.8%
5,Barker,57.9%,34.9%
...,...,...,...
147,Werriwa,75.7%,37.3%
148,Whitlam,62.2%,42.8%
149,Wide Bay,63.4%,40.0%
150,Wills,84.2%,52.0%


### Renaming columns

In [6]:
# Give the three remaining columns short snake_case names.
# The labels are assigned by position, so their order must match the dataframe's column order.
column_labels = [
    "electoral_division",
    "year_12_completion(%)",
    "higher_education_completion(%)",
]
education_level_df.columns = column_labels
education_level_df.head()

Unnamed: 0,electoral_division,year_12_completion(%),higher_education_completion(%)
1,Adelaide,82.4%,49.3%
2,Aston,79.7%,49.3%
3,Ballarat,68.6%,45.1%
4,Banks,83.4%,47.8%
5,Barker,57.9%,34.9%


### Assessing and converting datatypes

In [7]:
# Inspect the dtype of each column before any conversion is applied
education_level_df.dtypes

electoral_division                object
year_12_completion(%)             object
higher_education_completion(%)    object
dtype: object

The year_12_completion(%) and higher_education_completion(%) columns have the object datatype because their values are strings that include a percentage sign. The percentage sign needs to be removed and the columns converted to a numeric datatype for ease of use later.

In [8]:
# Should I remove the percentage sign, set the two columns as float, and add a percentage sign to the column names?

In [9]:
# Converting the data type of the 'year_12_completion(%)' and 'higher_education_completion(%)' columns
# Step 1: build a new dataframe in which both percentage columns are cast to strings.
# astype returns a new dataframe, so education_level_df itself is left unchanged.
education_df = education_level_df.astype(
    {
        "year_12_completion(%)": "str",
        "higher_education_completion(%)": "str",
    }
)

In [10]:
# Step 2: remove the "%" characters from each percentage column and cast the result to float
for pct_col in ("year_12_completion(%)", "higher_education_completion(%)"):
    education_df[pct_col] = education_df[pct_col].str.replace("%", "").astype(float)


In [11]:
# Show the first five rows of the converted dataframe
education_df.head(5)

Unnamed: 0,electoral_division,year_12_completion(%),higher_education_completion(%)
1,Adelaide,82.4,49.3
2,Aston,79.7,49.3
3,Ballarat,68.6,45.1
4,Banks,83.4,47.8
5,Barker,57.9,34.9


In [12]:
# Re-check the dtypes after the conversion; the output below shows both percentage columns as float64
education_df.dtypes

electoral_division                 object
year_12_completion(%)             float64
higher_education_completion(%)    float64
dtype: object

### Merging with a second dataframe

We want to add the electoral division id to the education dataframe, to use as its index and as a foreign key.

In [13]:
# Load the electoral division csv and preview its first rows
electoral_division_df = pd.read_csv(
    filepath_or_buffer="01-output_electorate_division_information/electoral_division.csv"
)
electoral_division_df.head()

Unnamed: 0,division_id,electoral_division,state
0,179,Adelaide,SA
1,197,Aston,VIC
2,198,Ballarat,VIC
3,103,Banks,NSW
4,180,Barker,SA


In [14]:
# Remove the state column, keeping division_id and electoral_division
electoral_division_df = electoral_division_df.drop("state", axis="columns")
electoral_division_df.head()

Unnamed: 0,division_id,electoral_division
0,179,Adelaide
1,197,Aston
2,198,Ballarat
3,103,Banks
4,180,Barker


In [15]:
# Left-join the education figures onto the electoral divisions, matching on the division name.
# electoral_division_df is the left table, so every one of its rows is kept.
# Earlier alternative with the education table on the left, kept for reference:
# merged_df = education_level_df.merge(electoral_division_df, how = 'left', on = 'electoral_division')
merged_df = pd.merge(
    electoral_division_df,
    education_df,
    how="left",
    on="electoral_division",
)
merged_df.head()

Unnamed: 0,division_id,electoral_division,year_12_completion(%),higher_education_completion(%)
0,179,Adelaide,82.4,49.3
1,197,Aston,79.7,49.3
2,198,Ballarat,68.6,45.1
3,103,Banks,83.4,47.8
4,180,Barker,57.9,34.9


In [16]:
# Use division_id as the row index of the merged dataframe
merged_df = merged_df.set_index(keys="division_id")
merged_df.head()

Unnamed: 0_level_0,electoral_division,year_12_completion(%),higher_education_completion(%)
division_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
179,Adelaide,82.4,49.3
197,Aston,79.7,49.3
198,Ballarat,68.6,45.1
103,Banks,83.4,47.8
180,Barker,57.9,34.9


### Exporting to csv

In [17]:
# Exporting dataframe to csv
# NOTE: the export call below is commented out, so running this notebook does not write the csv.
# Uncomment it to write merged_df to the output folder.
# merged_df.to_csv("10-output_commonwealth_electorate_education/commonwealth_electorate_edu.csv")