# Annual Rent To Curated

Preprocess raw annual rent data to curated

## Import packages, read in data

In [79]:
import os

import pandas as pd
import numpy as np

In [80]:
# read in data

one_flat = pd.read_csv('../../data/raw/annual_rent/1_bedroom_flat_annual_rent.csv')
two_flat = pd.read_csv('../../data/raw/annual_rent/2_bedroom_flat_annual_rent.csv')
two_house = pd.read_csv('../../data/raw/annual_rent/2_bedroom_house_annual_rent.csv')
three_flat = pd.read_csv('../../data/raw/annual_rent/3_bedroom_flat_annual_rent.csv')
three_house = pd.read_csv('../../data/raw/annual_rent/3_bedroom_house_annual_rent.csv')
four_house = pd.read_csv('../../data/raw/annual_rent/4_bedroom_house_annual_rent.csv')
all_prop = pd.read_csv('../../data/raw/annual_rent/all_properties_annual_rent.csv')

## Curating data

In [81]:
# curate rent data function

def curate_rent(df):
    
    # drop group total rows
    df = df[df['Suburb'] != 'Group Total']
    df = df.reset_index(drop = True)
    
    # drop count columns
    df = df.drop(columns = df.filter(like = 'Count').columns)
    
    # convert column names to new format
    newcols = []
    for col in df.columns[1:]:
        
        # remove 'Median' from other column names
        col = col.replace('Median', '').strip()

        # convert column name to datetime
        col = pd.to_datetime(col, format='%b %Y')
        
        # convert datetime to MM/YYYY
        col = col.strftime('%m/%Y')
        
        newcols.append(col)
        
    df.columns = [df.columns[0]] + newcols
    
    # drop columns from before 2015 for relevancy
    dropcols = []
    for col in df.columns[1:]:
        
        if int(col.split('/')[1]) < 2015:
            dropcols.append(col)
            
    df = df.drop(columns = dropcols)
    
    # replace missing values with np.nan
    df = df.replace('-', np.nan)

    # convert data types from str to int
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
        
    # group year average
    for i in range(2015, 2023):
        df[str(i)] = (df[f'03/{i}'] + df[f'06/{i}'] + df[f'09/{i}'] + df[f'12/{i}']) / 4
    df['2023'] = df['03/2023']

    df = df[['Suburb', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']]
    return df

In [82]:
# curate data

one_flat = curate_rent(one_flat)
two_flat = curate_rent(two_flat)
two_house = curate_rent(two_house)
three_flat = curate_rent(three_flat)
three_house = curate_rent(three_house)
four_house = curate_rent(four_house)
all_prop = curate_rent(all_prop)

In [83]:
one_flat

Unnamed: 0,Suburb,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Armadale,310.00,315.00,345.75,377.00,360.00,360.75,336.25,330.00,360.0
1,Carlton North,322.50,340.00,342.50,349.50,367.50,355.75,317.50,317.50,373.0
2,Carlton-Parkville,329.25,333.25,358.25,382.50,381.50,351.50,273.75,284.25,352.0
3,CBD-St Kilda Rd,380.00,382.25,403.50,421.25,435.50,406.25,312.50,353.75,430.0
4,Collingwood-Abbotsford,366.25,377.50,388.75,402.50,412.50,405.00,357.50,378.25,420.0
...,...,...,...,...,...,...,...,...,...,...
140,Traralgon,164.50,171.25,163.75,173.75,186.25,195.00,209.75,243.75,260.0
141,Wanagaratta,148.75,157.00,172.25,180.75,193.75,193.75,197.00,218.25,220.0
142,Warragul,,,,190.75,,,,,
143,Warrnambool,197.50,193.25,188.75,199.00,212.75,217.50,242.50,241.25,250.0


## Imputing Missing Values

Assume that the median rent prices have been increasing linearly across all suburbs from the start to the end of the time frame

The average increase per quarter is calculated then used to fill in the missing values based on the previous/next year

In [84]:
# impute missing values function

def impute_missing(df):
    
    # calculate the average increase over a quarter for all suburbs across the timeframe
    increase = ((df['2023'] - df['2015']) / df['2015']).mean() / 8
    
    # find rows with missing values
    missing = df[df.iloc[:, 1:].isnull().any(axis=1)].index

    for i in missing:
        
        row = df.iloc[i, 1:]
        
        # track the last existing point
        last_val = None
        
        # foward filling from first existing point
        for idx, val in enumerate(row):
            
            # fill missing value if we know last existing
            if pd.isna(val):
                if last_val:
                    
                    # predict using previous slope
                    predict = last_val * (1 + increase)
                    df.iloc[i, idx + 1] = predict
                    last_val = predict
                    
            # update last existing point
            else:
                last_val = val

        # backward filling from first existing point
        for idx in reversed(range(len(row))):
            val = row.iloc[idx]
            
            if pd.isna(val):
                if last_val:
                    
                    # predict using previous slope
                    predict = last_val / (1 + increase)
                    df.iloc[i, idx + 1] = predict
                    last_val = predict
            
            # update last existing point
            else:
                last_val = val
    
    # for rows with all missing values, just use the column average
    df = df.fillna(df.mean())
    
    return df

In [85]:
# impute missing values

one_flat = impute_missing(one_flat)
two_flat = impute_missing(two_flat)
two_house = impute_missing(two_house)
three_flat = impute_missing(three_flat)
three_house = impute_missing(three_house)
four_house = impute_missing(four_house)
all_prop = impute_missing(all_prop)

  df = df.fillna(df.mean())
  df = df.fillna(df.mean())
  df = df.fillna(df.mean())
  df = df.fillna(df.mean())
  df = df.fillna(df.mean())
  df = df.fillna(df.mean())
  df = df.fillna(df.mean())


## Change suburb to lower case


In [86]:
# Convert the suburb values to lowercase
one_flat['Suburb'] = one_flat['Suburb'].str.lower()
two_flat['Suburb'] = two_flat['Suburb'].str.lower()
three_flat['Suburb'] = three_flat['Suburb'].str.lower()
two_house['Suburb'] = two_house['Suburb'].str.lower()
three_house['Suburb'] = three_house['Suburb'].str.lower()
four_house['Suburb'] = four_house['Suburb'].str.lower()
all_prop['Suburb'] = all_prop['Suburb'].str.lower()

## Standardized column name

In [87]:
# Convert the column names to lowercase
one_flat.columns = one_flat.columns.str.lower()
two_flat.columns = two_flat.columns.str.lower()
three_flat.columns = three_flat.columns.str.lower()
two_house.columns = two_house.columns.str.lower()
three_house.columns = three_house.columns.str.lower()
four_house.columns = four_house.columns.str.lower()
all_prop.columns = all_prop.columns.str.lower()

## Save Data

In [88]:
# create new folder for curated rent data
if not os.path.exists('../../data/curated/annual_rent'):
    os.makedirs('../../data/curated/annual_rent')
    
# save data
one_flat.to_csv('../../data/curated/annual_rent/1_bedroom_flat_annual_rent.csv', index = False)
two_flat.to_csv('../../data/curated/annual_rent/2_bedroom_flat_annual_rent.csv', index = False)
two_house.to_csv('../../data/curated/annual_rent/2_bedroom_house_annual_rent.csv', index = False)
three_flat.to_csv('../../data/curated/annual_rent/3_bedroom_flat_annual_rent.csv', index = False)
three_house.to_csv('../../data/curated/annual_rent/3_bedroom_house_annual_rent.csv', index = False)
four_house.to_csv('../../data/curated/annual_rent/4_bedroom_house_annual_rent.csv', index = False)
all_prop.to_csv('../../data/curated/annual_rent/all_properties_annual_rent.csv', index = False)