# Population Data To Curated

Preprocess raw population data to curated

## Import packages

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

In [2]:
df = pd.read_csv('../../data/raw/processed_population_data.csv')
df.head()

Unnamed: 0,SA2 Code,SA2 Name,2001,2002,2003,2004,2005,2006,2007,2008,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,201011001,Alfredton,5756,6092,6293,6480,6648,6761,7034,7272,...,9714,10338,11039,11852,12649,13537,14434,15507,16841,17991
1,201011002,Ballarat,11497,11708,12015,12189,12269,12356,12408,12480,...,12352,12327,12300,12301,12266,12244,12320,12196,12071,11931
2,201011005,Buninyong,5320,5399,5557,5620,5857,6037,6131,6252,...,6984,7082,7191,7311,7409,7418,7458,7377,7229,7244
3,201011006,Delacombe,4154,4225,4371,4465,4704,5041,5206,5349,...,6267,6583,6846,7195,7622,8183,8890,9755,10648,11790
4,201011007,Smythes Creek,3317,3378,3411,3473,3508,3542,3594,3658,...,3914,3945,3966,3990,4004,4042,4112,4152,4211,4222


In [3]:
# Keep the first two columns and columns from 2015 onwards
cols_to_include = ['SA2 Code', 'SA2 Name'] + [col for col in df.columns if col.isnumeric() and int(col) >= 2015]
df = df[cols_to_include]

df.head()  

Unnamed: 0,SA2 Code,SA2 Name,2015,2016,2017,2018,2019,2020,2021,2022
0,201011001,Alfredton,11039,11852,12649,13537,14434,15507,16841,17991
1,201011002,Ballarat,12300,12301,12266,12244,12320,12196,12071,11931
2,201011005,Buninyong,7191,7311,7409,7418,7458,7377,7229,7244
3,201011006,Delacombe,6846,7195,7622,8183,8890,9755,10648,11790
4,201011007,Smythes Creek,3966,3990,4004,4042,4112,4152,4211,4222


Predict population for 2023

In [4]:
# Data for Linear Regression
X = df.iloc[:, 2:-1].columns.astype(int).values.reshape(-1, 1)  # Years as features (excluding previously added 2023 column)
predictions = []

# Model Training and Prediction
for index, row in df.iterrows():
    y = row[2:-1].values  # Population for respective years (excluding previously added 2023 column)
    model = LinearRegression()
    model.fit(X, y)
    prediction_2023 = model.predict([[2023]])
    predictions.append(prediction_2023[0])

# Adding the predicted 2023 population to the dataframe
df['2023'] = np.round(predictions).astype(int)

df.head()

Unnamed: 0,SA2 Code,SA2 Name,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,201011001,Alfredton,11039,11852,12649,13537,14434,15507,16841,17991,18426
1,201011002,Ballarat,12300,12301,12266,12244,12320,12196,12071,11931,12092
2,201011005,Buninyong,7191,7311,7409,7418,7458,7377,7229,7244,7395
3,201011006,Delacombe,6846,7195,7622,8183,8890,9755,10648,11790,11626
4,201011007,Smythes Creek,3966,3990,4004,4042,4112,4152,4211,4222,4277


## Drop SA2 code and lower case SA2 Name

In [5]:
# Drop the 'SA2 Code' columns
df = df.drop(columns=['SA2 Code'])

# Convert the SA2 Name to lowercase
df['SA2 Name'] = df['SA2 Name'].str.lower()

# Convert the column names to lowercase
df.columns = df.columns.str.lower()

df.head()

Unnamed: 0,sa2 name,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,alfredton,11039,11852,12649,13537,14434,15507,16841,17991,18426
1,ballarat,12300,12301,12266,12244,12320,12196,12071,11931,12092
2,buninyong,7191,7311,7409,7418,7458,7377,7229,7244,7395
3,delacombe,6846,7195,7622,8183,8890,9755,10648,11790,11626
4,smythes creek,3966,3990,4004,4042,4112,4152,4211,4222,4277


## Save the data

In [6]:
df.to_csv('../../data/curated/curated_population.csv', index = False)