In [1]:
import os

import pandas as pd

In [2]:
# Load the two sheets that hold the data tables themselves (not the summary sheets).
# Passing a list of sheet names gives back a dict keyed by sheet name; header=None
# keeps every spreadsheet row as data so the table region can be sliced out later.
df = pd.read_excel(
    '../data/landing/projections/VIF2023_projections.xlsx',
    sheet_name=['Component_Detail_Annual', '5yr_Age_Groups_Sex_2011_2051'],
    header=None,
)

In [3]:
# Pull each raw sheet out of the dict returned by read_excel
indv_changes = df['5yr_Age_Groups_Sex_2011_2051']
population_changes = df['Component_Detail_Annual']

In [4]:
# Grab only the table body (row positions 9-49 of the raw sheet). The slice is taken
# from the raw sheet in `df` rather than from `population_changes` itself so that
# re-running this cell gives the same result instead of slicing an already-sliced
# frame. `.copy()` yields an independent frame rather than a view of the raw sheet.
population_changes = df['Component_Detail_Annual'].iloc[9:50].copy()
# Rename each column to accurately represent the table data
population_changes.columns = [
    "Year",
    "Population at Start of Period",
    "Births",
    "Deaths",
    "Natural Increase",
    "Overseas Migration Arrivals",
    "Overseas Migration Departures",
    "Net Overseas Migration",
    "Interstate Migration Arrivals",
    "Interstate Migration Departures",
    "Net Interstate Migration",
    "Net Migration",
    "Population at End of Period",
    "Change in Population",
    "Annual Population Growth Rate",
    "Statistical Difference*",
]

In [5]:
# Keep only the useful columns
useful_columns = [
    "Year",
    "Net Overseas Migration",
    "Net Interstate Migration",
    "Population at End of Period",
    "Change in Population",
]
population_changes = population_changes.loc[:, useful_columns]
population_changes

Unnamed: 0,Year,Net Overseas Migration,Net Interstate Migration,Population at End of Period,Change in Population
9,2011,,,5537817.0,
10,2012,56174.0,2417.0,5651091.0,113274.0
11,2013,59034.0,6420.0,5772669.0,121578.0
12,2014,56906.0,9739.0,5894917.0,122248.0
13,2015,60694.0,11079.0,6022322.0,127405.0
14,2016,72215.0,17639.0,6173172.0,150850.0
15,2017,91243.0,13748.0,6302608.0,129436.0
16,2018,86968.0,9101.0,6423038.0,120430.0
17,2019,85476.0,6480.0,6537305.0,114267.0
18,2020,60597.0,-2652.0,6615046.0,77741.0


In [6]:
# Build the frame from the untouched raw sheet so this cell can be re-run safely:
# the raw sheet in `df` is never modified, and the header row / data slice are
# always taken from the same starting point.
raw_age_sex = df['5yr_Age_Groups_Sex_2011_2051']
# Keep only the data rows, i.e. everything below the header row at position 8
indv_changes = raw_age_sex.iloc[9:].copy()
# Rename columns to match the table's header row
indv_changes.columns = raw_age_sex.iloc[8]
# Make pandas display floats in full, with no decimal places
pd.set_option("display.float_format", "{:.0f}".format)

In [7]:
# Reshape from wide to long: every non-identifier column becomes a (Year, Population)
# pair, so year and population are two separate variables
identifier_columns = ["Sex", "Age", "5-year Age/Sex group", "15-year Age/Sex group"]
indv_changes = pd.melt(
    indv_changes,
    id_vars=identifier_columns,
    var_name="Year",
    value_name="Population",
)

In [8]:
# Display the long-format frame (one row per age/sex group and year)
indv_changes

Unnamed: 0,Sex,Age,5-year Age/Sex group,15-year Age/Sex group,Year,Population
0,Males,0 to 4,Males - 0 to 4,Males - 0 to 14,2011,180813
1,Males,5 to 9,Males - 5 to 9,Males - 0 to 14,2011,170480
2,Males,10 to 14,Males - 10 to 14,Males - 0 to 14,2011,169404
3,Males,15 to 19,Males - 15 to 19,Males - 15 to 29,2011,181658
4,Males,20 to 24,Males - 20 to 24,Males - 15 to 29,2011,211018
...,...,...,...,...,...,...
2373,Persons,75 to 79,Persons - 75 to 79,Persons - 75 and over,2051,365301
2374,Persons,80 to 84,Persons - 80 to 84,Persons - 75 and over,2051,312742
2375,Persons,85 to 89,Persons - 85 to 89,Persons - 75 and over,2051,223741
2376,Persons,90 and over,Persons - 90 and over,Persons - 75 and over,2051,173444


In [9]:
# Total the population for each 15-year age group within each year
by_group_and_year = indv_changes.groupby(
    ["15-year Age/Sex group", "Year"], as_index=False
)
indv_changes = by_group_and_year["Population"].sum()

In [10]:
# Display the summed population per 15-year age/sex group and year
indv_changes

Unnamed: 0,15-year Age/Sex group,Year,Population
0,Females - 0 to 14,2011,494039
1,Females - 0 to 14,2012,504330
2,Females - 0 to 14,2013,516192
3,Females - 0 to 14,2014,527555
4,Females - 0 to 14,2015,538712
...,...,...,...
774,Persons - Total,2047,9822517
775,Persons - Total,2048,9949489
776,Persons - Total,2049,10076122
777,Persons - Total,2050,10202395


In [11]:
# Reshape to wide format: one row per year, one column per 15-year age/sex group
wide_by_year = indv_changes.pivot(
    index="Year", columns="15-year Age/Sex group", values="Population"
)
# Turn the Year index back into an ordinary column
indv_changes = wide_by_year.reset_index()

In [None]:
# We already have the total population from the first dataframe, so drop it here.
# errors="ignore" keeps this cell re-runnable: without it a second execution raises
# KeyError because the column has already been removed.
indv_changes = indv_changes.drop(columns=["Persons - Total"], errors="ignore")

In [13]:
# Combine both tables into the final frame: keep every row of population_changes
# and attach the matching age-group columns by Year
population_data = population_changes.merge(indv_changes, how="left", on="Year")

In [14]:
# Remove the 2011 row: it has missing migration/change values and is of limited
# relevance given its age. Filtering on the Year value (rather than dropping
# whichever row happens to be first) keeps this cell safe to re-run; the positional
# drop would silently remove 2012, 2013, ... on each additional execution.
# to_numeric lets the comparison work whether Year is stored as int, float or text.
is_2011 = pd.to_numeric(population_data["Year"], errors="coerce") == 2011
population_data = population_data[~is_2011]
population_data

Unnamed: 0,Year,Net Overseas Migration,Net Interstate Migration,Population at End of Period,Change in Population,Females - 0 to 14,Females - 15 to 29,Females - 30 to 44,Females - 45 to 59,Females - 60 to 74,...,Males - 30 to 44,Males - 45 to 59,Males - 60 to 74,Males - 75 and over,Persons - 0 to 14,Persons - 15 to 29,Persons - 30 to 44,Persons - 45 to 59,Persons - 60 to 74,Persons - 75 and over
1,2012,56174,2417,5651091,113274,504330,594739,611612,551808,376650,...,600841,533198,360042,155779,1035904,1209620,1212453,1085006,736692,371416
2,2013,59034,6420,5772669,121578,516192,605220,623533,561278,390237,...,613988,541284,372187,161017,1060155,1229200,1237521,1102562,762424,380807
3,2014,56906,9739,5894917,122248,527555,617404,634535,572255,403419,...,624196,550136,383494,166488,1083457,1252465,1258731,1122391,786913,390960
4,2015,60694,11079,6022322,127405,538712,630121,647200,582989,416966,...,635941,558550,395122,172040,1107752,1276628,1283141,1141539,812088,401174
5,2016,72215,17639,6173172,150850,554284,645158,660095,596288,431794,...,649650,567885,408044,178048,1140064,1306807,1309745,1164173,839838,412545
6,2017,91243,13748,6302608,129436,564594,656880,674186,605943,444968,...,664220,577355,418150,183931,1161960,1332362,1338406,1183298,863118,423464
7,2018,86968,9101,6423038,120430,572918,665905,690961,612136,458708,...,680131,584133,428401,189492,1179414,1355475,1371092,1196269,887109,433679
8,2019,85476,6480,6537305,114267,580345,671350,707938,618196,472181,...,696148,589698,438973,195966,1193892,1373549,1404086,1207894,911154,446730
9,2020,60597,-2652,6615046,77741,583911,659128,723989,624323,491459,...,711262,595353,452955,203877,1200620,1352646,1435251,1219676,944414,462439
10,2021,-53059,-35622,6547822,-67224,580276,622387,724905,618425,498387,...,709637,593036,457591,211640,1192206,1276147,1434542,1211461,955978,477488


In [15]:
# Create the processed-data directory if it doesn't exist
# (`os` is imported in the notebook's import cell at the top)
output_dir = '../data/processed/population'
os.makedirs(output_dir, exist_ok=True)

# Save the processed population data to a new CSV file
population_data.to_csv(f'{output_dir}/population_data.csv', index=False)