# Income Data To Curated

Preprocess the income data and save the result to the curated data folder.

## Import packages

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [10]:
# Read the processed income table; the first rows are previewed below
# to confirm the column layout.
raw_income_path = '../../data/raw/processed_income_data.csv'
df = pd.read_csv(raw_income_path)

df.head()

Unnamed: 0,SA2 Code,SA2 Name,2015,2016,2017,2018,2019
0,201011001,Alfredton,49385,50845,52448,53932,55204
1,201011002,Ballarat,49564,50413,51736,53688,53784
2,201011003,Ballarat - North,45816,46561,49211,50593,52068
3,201011004,Ballarat - South,41544,42531,44293,45828,47010
4,201011005,Buninyong,47511,49179,51034,52377,54308


In [11]:
# Inspect the column dtypes: a year column reported as `object` is not yet
# numeric and must be converted before any arithmetic is done on it.
print(df.dtypes)

SA2 Code     int64
SA2 Name    object
2015        object
2016        object
2017        object
2018        object
2019        object
dtype: object


In [12]:
# 2019 is the base year for the growth-rate projection in the next section, so
# it has to be numeric. errors='coerce' turns any value that cannot be parsed
# as a number into NaN instead of raising.
df['2019'] = pd.to_numeric(df['2019'], errors='coerce')

## Predict income for 2020, 2021 and 2022

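The GDP per capita growth rates used below are taken from https://www.macrotrends.net/countries/AUS/australia/gdp-per-capita#:~:text=Australia%20gdp%20per%20capita%20for%202020%20was%20%2451%2C722%2C%20a%205.86,a%203.96%25%20decline%20from%202018. Each year's income is estimated by applying that year's growth rate to the previous year's income, starting from the 2019 figures.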

In [13]:
# Year-on-year growth rates, expressed as fractions, keyed by the year they
# produce. Insertion order matters: every year is built from the one before it.
growth_rates = {
    2020: -0.0586,  # 5.86% decline
    2021: 0.1686,   # 16.86% increase
    2022: 0.067,    # 6.7% increase
}

# Roll the series forward one year at a time: income(y) = income(y - 1) * (1 + rate).
for target_year in growth_rates:
    source_column = str(target_year - 1)
    df[str(target_year)] = df[source_column] * (1 + growth_rates[target_year])

df.head()

Unnamed: 0,SA2 Code,SA2 Name,2015,2016,2017,2018,2019,2020,2021,2022
0,201011001,Alfredton,49385,50845,52448,53932,55204.0,51969.0456,60731.026688,64800.005476
1,201011002,Ballarat,49564,50413,51736,53688,53784.0,50632.2576,59168.856231,63133.169599
2,201011003,Ballarat - North,45816,46561,49211,50593,52068.0,49016.8152,57281.050243,61118.880609
3,201011004,Ballarat - South,41544,42531,44293,45828,47010.0,44255.214,51716.64308,55181.658167
4,201011005,Buninyong,47511,49179,51034,52377,54308.0,51125.5512,59745.319132,63748.255514


## Predict income for 2023 by linear regression

In [14]:

# Year being predicted; its column is named str(TARGET_YEAR).
TARGET_YEAR = 2023

# Pick the training columns by name rather than by position: every column whose
# label is a year earlier than TARGET_YEAR. Excluding the target column keeps
# this cell idempotent - re-running it after '2023' has been added must not
# feed the previous prediction back into the regression.
year_cols = [
    col for col in df.columns
    if str(col).isdigit() and int(col) < TARGET_YEAR
]

# Convert the year columns to numeric, coercing any non-numeric values to NaN
for col in year_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows containing NaN values
df = df.dropna()

# Data for Linear Regression
years_single = np.array([int(col) for col in year_cols]).reshape(-1, 1)  # Years as features, shape (n_years, 1)
incomes = df[year_cols].to_numpy(dtype=float)  # shape (n_rows, n_years)

# Model Training and Prediction
# One multi-target fit replaces the per-row loop: each row of `incomes` becomes
# its own target column, so every area still gets an independent
# least-squares line through its (year, income) points.
if len(df) > 0:
    model = LinearRegression()
    model.fit(years_single, incomes.T)
    income_predictions_2023 = model.predict([[TARGET_YEAR]])[0]
else:
    # Nothing left after dropping incomplete rows: there is nothing to fit.
    income_predictions_2023 = np.array([])

# Adding the predicted income (rounded to whole units) to the cleaned dataframe
df[str(TARGET_YEAR)] = np.round(income_predictions_2023)


df.head()


Unnamed: 0,SA2 Code,SA2 Name,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,201011001,Alfredton,49385.0,50845.0,52448.0,53932.0,55204.0,51969.0456,60731.026688,64800.005476,63334.0
1,201011002,Ballarat,49564.0,50413.0,51736.0,53688.0,53784.0,50632.2576,59168.856231,63133.169599,61276.0
2,201011003,Ballarat - North,45816.0,46561.0,49211.0,50593.0,52068.0,49016.8152,57281.050243,61118.880609,60116.0
3,201011004,Ballarat - South,41544.0,42531.0,44293.0,45828.0,47010.0,44255.214,51716.64308,55181.658167,54177.0
4,201011005,Buninyong,47511.0,49179.0,51034.0,52377.0,54308.0,51125.5512,59745.319132,63748.255514,62666.0


In [15]:
# Final tidy-up, done as a single chain:
#   1. remove the 'SA2 Code' column,
#   2. lowercase the values in 'SA2 Name',
#   3. lowercase every column label.
df = (
    df
    .drop(columns=['SA2 Code'])
    .assign(**{'SA2 Name': lambda frame: frame['SA2 Name'].str.lower()})
    .rename(columns=str.lower)
)

df.head()

Unnamed: 0,sa2 name,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,alfredton,49385.0,50845.0,52448.0,53932.0,55204.0,51969.0456,60731.026688,64800.005476,63334.0
1,ballarat,49564.0,50413.0,51736.0,53688.0,53784.0,50632.2576,59168.856231,63133.169599,61276.0
2,ballarat - north,45816.0,46561.0,49211.0,50593.0,52068.0,49016.8152,57281.050243,61118.880609,60116.0
3,ballarat - south,41544.0,42531.0,44293.0,45828.0,47010.0,44255.214,51716.64308,55181.658167,54177.0
4,buninyong,47511.0,49179.0,51034.0,52377.0,54308.0,51125.5512,59745.319132,63748.255514,62666.0


## Save the data

In [16]:
# Write the curated table to disk; index=False leaves the DataFrame index
# out of the file.
curated_income_path = '../../data/curated/curated_income.csv'
df.to_csv(curated_income_path, index=False)