In [8]:
import os

import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression

In [9]:
# Load the "Table 1.4" sheet of the SA2 income workbook.
# header=6 uses the row at 0-based position 6 of the sheet as the column labels.
income_workbook_path = '../../data/landing/SA2_Income.xlsx'

df = pd.read_excel(income_workbook_path, sheet_name="Table 1.4", header=6)

In [10]:
# Keep only the Victorian rows of the table (index labels 579 through 1040).
# .loc slices by label, so both endpoints are included.
victorian_rows = slice(579, 1040)

df = df.loc[victorian_rows]

In [11]:
# Build a narrower frame: the two SA2 identifier columns plus one '.3' column per
# financial year (the median-income columns, per the original note on this cell).
median_income_columns = [
    'SA2',
    'SA2 NAME',
    '2015-16.3',
    '2016-17.3',
    '2017-18.3',
    '2018-19.3',
    '2019-20.3',
]
process_df = df[median_income_columns]

# Unbind the full sheet now that the columns of interest have been copied out
del df

In [12]:
# Tidy the identifier labels and relabel each financial-year column with its
# starting calendar year (kept as strings here).
column_labels = {
    'SA2': 'SA2 Code',
    'SA2 NAME': 'SA2 Name',
    '2015-16.3': '2015',
    '2016-17.3': '2016',
    '2017-18.3': '2017',
    '2018-19.3': '2018',
    '2019-20.3': '2019',
}
process_df = process_df.rename(columns=column_labels)

# Throw away the old row labels and renumber the rows from 0
process_df = process_df.reset_index(drop=True)

In [13]:
# Write the tidied table to CSV; index=False keeps the row labels out of the file.
processed_csv_path = '../../data/raw/processed_income_data.csv'

process_df.to_csv(processed_csv_path, index=False)

In [14]:
# Linear regression to predict income in 2020, 2021, 2022, 2023, 2024, 2025 and 2026
#
# For every SA2 Name a straight line (Income ~ Year) is fitted to the observed
# yearly values and then evaluated at each future year.

# Dropping the unnecessary columns.
# errors="ignore" keeps the cell re-runnable: process_df is rebound below, so on a
# second run the "SA2 Code" column is already gone.
process_df = process_df.drop(columns=["SA2 Code"], errors="ignore")

# remove missing values: cells holding the string 'np' are treated as missing, and
# any row with at least one missing cell is dropped entirely.
# (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.)
process_df = process_df.replace('np', np.nan).dropna()

# Reshaping data to long format: one row per (SA2 Name, Year) pair
df_long_uploaded = process_df.melt(id_vars=["SA2 Name"],
                                   var_name="Year",
                                   value_name="Income")
df_long_uploaded["Year"] = df_long_uploaded["Year"].astype(int)

# Filter out rows with non-numeric values in the Income column.
# pd.to_numeric(errors="coerce") turns anything unparseable into NaN instead of
# relying on a string-shape check, and .assign() builds a new frame so no column
# is written onto a filtered slice (avoids SettingWithCopyWarning).
df_long_uploaded = (
    df_long_uploaded
    .assign(Income=pd.to_numeric(df_long_uploaded["Income"], errors="coerce").astype(float))
    .loc[lambda frame: np.isfinite(frame["Income"])]
)

# Dictionary to store predictions: {SA2 Name: {"Income in <year>": int, ...}}
predictions_corrected_data = {}

# define future year
future_years = [2020, 2021, 2022, 2023, 2024, 2025, 2026]

# The prediction inputs are identical for every SA2, so build the (n_years, 1)
# feature matrix once rather than on every iteration.
future_X = np.array(future_years).reshape(-1, 1)

# Performing linear regression for each SA2 Name.
# groupby splits the long frame in a single pass; sort=False keeps the groups in
# order of first appearance rather than alphabetical order.
for sa2_name, sa2_data in df_long_uploaded.groupby("SA2 Name", sort=False):
    # Check if there's enough data for the current SA2 Name
    # (fewer than two points are skipped)
    if sa2_data.shape[0] < 2:
        continue

    X = sa2_data['Year'].values.reshape(-1, 1)  # Features (years)
    y = sa2_data['Income'].values               # Target (income)

    # Train the model
    model = LinearRegression()
    model.fit(X, y)

    # Predict for the future years
    y_pred = model.predict(future_X)

    # Store predictions; int() truncates the fractional part (no rounding)
    predictions_corrected_data[sa2_name] = {f"Income in {year}": int(pred) for year, pred in zip(future_years, y_pred)}

# Taking a subset of the predictions for display (first 5 SA2 Names)
subset_predictions_corrected = {key: predictions_corrected_data[key] for key in list(predictions_corrected_data)[:5]}
subset_predictions_corrected


{'Alfredton': {'Income in 2020': 56780,
  'Income in 2021': 58252,
  'Income in 2022': 59725,
  'Income in 2023': 61197,
  'Income in 2024': 62670,
  'Income in 2025': 64142,
  'Income in 2026': 65615},
 'Ballarat': {'Income in 2020': 55351,
  'Income in 2021': 56523,
  'Income in 2022': 57694,
  'Income in 2023': 58866,
  'Income in 2024': 60037,
  'Income in 2025': 61209,
  'Income in 2026': 62380},
 'Ballarat - North': {'Income in 2020': 53810,
  'Income in 2021': 55464,
  'Income in 2022': 57117,
  'Income in 2023': 58771,
  'Income in 2024': 60425,
  'Income in 2025': 62078,
  'Income in 2026': 63732},
 'Ballarat - South': {'Income in 2020': 48509,
  'Income in 2021': 49932,
  'Income in 2022': 51355,
  'Income in 2023': 52778,
  'Income in 2024': 54201,
  'Income in 2025': 55624,
  'Income in 2026': 57047},
 'Buninyong': {'Income in 2020': 55919,
  'Income in 2021': 57598,
  'Income in 2022': 59277,
  'Income in 2023': 60957,
  'Income in 2024': 62636,
  'Income in 2025': 64315,
