In [1]:
from pathlib import Path
import json
from functools import reduce
import math
import datetime as dt
import pytz 
from itertools import product
from collections import OrderedDict
import time
import sys

import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.ops as so
from requests_html import HTMLSession
    
import helpers as hp

%load_ext autoreload
%autoreload 2


In [2]:
def get_rent_data_urls():
    """
    Scrape MBIE's website to get the location of three files of rental bond data.
    Return a list of three dictionaries, each with the following keys and values.

    - ``'kind'``: the kind of data; one of 'rent_count', 'rent_mean', 'rent_geo_mean'
    - ``'filename'``: the name of the CSV data file
    - ``'url'``: the URL of the CSV data file
    - ``'target_path'``: the local path, under ``hp.DATA_DIR/'collected'``,
      to which the CSV data file should be saved

    Raise an ``IndexError`` that names the missing file if the scraped page
    contains no link to one of the three files.
    """
    src_url = "https://www.mbie.govt.nz/building-and-energy/tenancy-and-housing/rental-bond-data/"

    # (kind, filename) pairs, in the order in which they are returned
    targets = [
        ("rent_count", "detailed-lodged-bonds.csv"),
        ("rent_mean", "detailed-mean-rents.csv"),
        ("rent_geo_mean", "detailed-geo-mean-rents.csv"),
    ]

    # Scrape the HTML page above to get the data urls of interest.
    # The session is used as a context manager so that it is always closed.
    with HTMLSession() as session:
        r = session.get(src_url)
        # ``absolute_links`` is a set, so sort it to make the choice of URL
        # below deterministic should several links match the same file.
        urls = sorted(
            link for link in r.html.absolute_links
            if "Quarterly" in link
            and link.split("/")[-1].startswith("detailed-")
        )

    # Filter urls to finer set and include some metadata
    result = []
    for kind, filename in targets:
        url = next((u for u in urls if u.endswith(filename)), None)
        if url is None:
            # Same exception type as the former ``[...][0]`` lookup raised,
            # but with a message that says what went wrong.
            raise IndexError(f"No link to {filename} found on {src_url}")
        result.append(
            {
                "kind": kind,
                "filename": filename,
                "url": url,
                "target_path": hp.DATA_DIR/'collected'/filename,
            }
        )

    return result


# Download rent data

In [3]:
# Download each rent data file to its target path, reporting failures
# without aborting the remaining downloads.
data = get_rent_data_urls()

for d in data:
    r = requests.get(d['url'])
    if not r.ok:
        print("Failed to get", d['filename'])
        continue

    print("Getting and saving", d['filename'])
    path = d["target_path"]

    # Make sure the destination directory exists, e.g. on a fresh checkout
    path.parent.mkdir(parents=True, exist_ok=True)

    # Save the bytes exactly as served. Going through ``r.text`` would decode
    # them with a guessed charset (ISO-8859-1 when a text response declares
    # none) and re-encode them in the locale's encoding, which can corrupt
    # non-ASCII characters and rewrite line endings.
    path.write_bytes(r.content)


Getting and saving detailed-lodged-bonds.csv
Getting and saving detailed-mean-rents.csv
Getting and saving detailed-geo-mean-rents.csv


# Process rent data

In [4]:
# Reshape and merge all rent data sets

def clean(f, kind):
    """
    Reshape a wide rent table into long format.

    Given a DataFrame ``f`` with the columns 'SAU', 'Property_Type', 'Bedrooms'
    and any number of value columns whose names contain '-'
    (presumably quarter dates such as '2020-09-01'), do the following.

    - Rename the three identifier columns to 'au2001', 'property_type',
      'num_bedrooms'.
    - Drop the subtotal rows, that is, the rows in which any identifier
      contains the text 'total', ignoring case.
    - Melt the value columns into the two columns 'quarter' (the former
      column name) and ``kind`` (the former cell value).

    Return the resulting DataFrame.
    The input DataFrame is not modified.
    """
    id_vars = ['au2001', 'property_type', 'num_bedrooms']

    # ``rename`` returns a new frame, so the caller's frame is left untouched
    f = f.rename(columns={
        'SAU': 'au2001',
        'Property_Type': 'property_type',
        'Bedrooms': 'num_bedrooms'
    })

    # Drop subtotals.
    # Start from an all-False boolean mask and compare on string values with
    # ``na=False`` so that missing or non-string identifiers count as
    # "not a subtotal" instead of putting NaN into the mask or raising.
    is_subtotal = pd.Series(False, index=f.index)
    for col in id_vars:
        is_subtotal |= (
            f[col].astype(str).str.contains('total', case=False, na=False)
        )

    f = f[~is_subtotal]

    # Reshape
    value_vars = [c for c in f.columns if '-' in c]
    return pd.melt(
        f,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='quarter',
        value_name=kind,
    )

# Load each collected CSV, reshape it to long format, and preview it
data = get_rent_data_urls()
frames = []
for d in data:
    path = d["target_path"]
    print(path)
    raw = pd.read_csv(path, dtype={'SAU': str})
    f = clean(raw, d["kind"])
    frames.append(f)
    display(f.tail())
    latest_two = f["quarter"].unique()[-2:].tolist()
    print("latest two quarters =", latest_two)

# Join the long frames one after another on the columns they share
f = reduce(pd.merge, frames)

# Merge in region data
g = pd.read_csv(hp.get_path('au2001_csv'), dtype={'au2001': str})
f = pd.merge(f, g)

# Write to file
path = hp.get_path('rents')
f.to_csv(path, index=False)

# Display some: the last rows, by quarter, that have a rent count
f.sort_values("quarter").dropna(subset=["rent_count"]).tail()

/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-lodged-bonds.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_count
1084354,612802,House,2,2020-09-01,
1084355,612802,House,3,2020-09-01,
1084356,612802,House,4,2020-09-01,
1084357,613000,House,2,2020-09-01,
1084358,613000,House,3,2020-09-01,


latest two quarters = ['2020-06-01', '2020-09-01']
/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-mean-rents.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_mean
1084354,612802,House,2,2020-09-01,
1084355,612802,House,3,2020-09-01,
1084356,612802,House,4,2020-09-01,
1084357,613000,House,2,2020-09-01,
1084358,613000,House,3,2020-09-01,


latest two quarters = ['2020-06-01', '2020-09-01']
/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-geo-mean-rents.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_geo_mean
1084354,612802,House,2,2020-09-01,
1084355,612802,House,3,2020-09-01,
1084356,612802,House,4,2020-09-01,
1084357,613000,House,2,2020-09-01,
1084358,613000,House,3,2020-09-01,


latest two quarters = ['2020-06-01', '2020-09-01']


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_count,rent_mean,rent_geo_mean,au_name,territory,region,rental_area
845484,584500,House,2,2020-09-01,5.0,278.0,276.0,Westport Urban,Buller District,West Coast,Buller
845481,584500,Flat or Apartment,2,2020-09-01,5.0,252.0,252.0,Westport Urban,Buller District,West Coast,Buller
131865,511902,House,3,2020-09-01,8.0,582.0,578.0,Crum Park,Waitakere City,Auckland,Titirangi
843928,584405,Flat or Apartment,1,2020-09-01,5.0,150.0,150.0,Orowaiti,Buller District,West Coast,Buller
842708,584303,House,2,2020-09-01,6.0,372.0,371.0,Motueka East,Tasman District,Tasman,Motueka/Rural Tasman


In [5]:
# Show the latest 2 quarters as reported by the helper module.
# Sanity check: these should agree with the "latest two quarters" printed
# for each data file in the processing cell above.
hp.get_latest_quarters(2)

['2020-06-01', '2020-09-01']

# Explore rents

In [6]:
# Summary statistics for the last quarter listed in the rents table
rents = hp.get_data('rents')
quarters = rents['quarter'].unique()
last_quarter = quarters[-1]
rents[rents['quarter'] == last_quarter].describe()

Unnamed: 0,rent_count,rent_mean,rent_geo_mean
count,1861.0,1861.0,1861.0
mean,11.087587,492.23482,482.395486
std,16.565879,170.159985,164.447558
min,5.0,84.0,82.0
25%,6.0,381.0,371.0
50%,8.0,472.0,466.0
75%,12.0,574.0,566.0
max,364.0,1750.0,1664.0


In [7]:
# Slice the rents in time at the date below, aggregate, and preview the result
slice_date = '2020-09-01'
agg_rents = hp.aggregate_rents(rents, slice_date)
agg_rents.head(5)

  d['rent_mean'] = (group['rent_mean']*group['rent_count']).sum()/\


Unnamed: 0,rental_area,num_bedrooms,territory,region,rent_count,rent_mean,rent_geo_mean
0,Addington,1,Christchurch City,Canterbury,260.0,196.3,183.400478
1,Addington,2,Christchurch City,Canterbury,115.0,346.278261,333.580638
2,Addington,3,Christchurch City,Canterbury,103.0,417.912621,403.609631
3,Addington,4,Christchurch City,Canterbury,6.0,464.0,456.0
4,Addington,5+,Christchurch City,Canterbury,0.0,,


In [9]:
# Aggregate by census area unit and number of bedrooms, then keep Auckland only
rents_by_au = hp.aggregate_rents(
    rents,
    '2020-09-01',
    groupby_cols=('au2001', 'num_bedrooms'),
)
rents_by_au[rents_by_au['region'] == 'Auckland']

  d['rent_mean'] = (group['rent_mean']*group['rent_count']).sum()/\


Unnamed: 0,au2001,num_bedrooms,territory,region,rent_count,rent_mean,rent_geo_mean
288,505300,1,Rodney District,Auckland,0.0,,
289,505300,2,Rodney District,Auckland,10.0,399.500000,396.954657
290,505300,3,Rodney District,Auckland,36.0,452.083333,449.022019
291,505300,4,Rodney District,Auckland,5.0,528.000000,525.000000
292,505300,5+,Rodney District,Auckland,0.0,,
...,...,...,...,...,...,...,...
1985,526701,1,Franklin District,Auckland,0.0,,
1986,526701,2,Franklin District,Auckland,0.0,,
1987,526701,3,Franklin District,Auckland,0.0,,
1988,526701,4,Franklin District,Auckland,5.0,496.000000,490.000000


In [10]:
# What fraction of rental data do we have by num_bedrooms?

def hits(group):
    """
    Return a one-item Series whose entry 'hit_frac' is the fraction of rows
    of the DataFrame ``group`` that have a non-null 'rent_mean' value.
    """
    rent_means = group['rent_mean']
    return pd.Series({'hit_frac': rent_means.count()/len(rent_means)})

date = '2020-09-01'

# Report the Auckland hit fraction per number of bedrooms, once for each
# kind of spatial unit: (heading to print, column to group by)
spatial_units = [
    ('census area units', 'au2001'),
    ('rental area units', 'rental_area'),
]
for heading, unit_col in spatial_units:
    f = hp.aggregate_rents(rents, date, groupby_cols=(unit_col, 'num_bedrooms'))
    cond = f['region'] == 'Auckland'
    print(heading)
    print(f[cond].copy().groupby('num_bedrooms').apply(hits).reset_index())


  d['rent_mean'] = (group['rent_mean']*group['rent_count']).sum()/\


census area units
  num_bedrooms  hit_frac
0            1  0.219814
1            2  0.465318
2            3  0.678063
3            4  0.302594
4           5+  0.040625


  d['rent_mean'] = (group['rent_mean']*group['rent_count']).sum()/\


rental area units
  num_bedrooms  hit_frac
0            1  0.525253
1            2  0.838384
2            3  0.979798
3            4  0.666667
4           5+  0.121212


# Select latest two quarters and slice into regional chunks

In [11]:
# Get latest two quarters of rents.
# The first item returned by ``hp.get_latest_quarters(2)`` is used as the
# start date, so every row from that quarter onwards is kept.
start_date = hp.get_latest_quarters(2)[0]
rents = hp.get_data('rents')
rents = rents.loc[lambda x: x.quarter >= start_date].copy()

# Create regional slices, one CSV file per region in ``hp.REGIONS``
for region in hp.REGIONS:
    # Build rents
    # NOTE(review): ``capitalize`` only upper-cases the first letter, so a
    # multi-word region (e.g. 'West Coast') would match no rows -- confirm
    # before adding such regions to ``hp.REGIONS``.
    region_c = region.capitalize()
    region_rents = rents.loc[lambda x: x.region == region_c].copy()
    print(region, region_rents['quarter'].unique(), '#rows =', region_rents.shape[0])
    path = hp.get_path('rents', region)
    print('  Saving to', path)

    # Create region directory if it does not exist, along with any missing
    # parent directories; ``exist_ok`` avoids the check-then-create race.
    path.parent.mkdir(parents=True, exist_ok=True)

    region_rents.to_csv(path, index=False)
    

auckland ['2020-06-01' '2020-09-01'] #rows = 5654
  Saving to /home/araichev/mrcagney_projects/affordability_nz/data/processed/auckland/rents.csv
canterbury ['2020-06-01' '2020-09-01'] #rows = 2578
  Saving to /home/araichev/mrcagney_projects/affordability_nz/data/processed/canterbury/rents.csv
wellington ['2020-06-01' '2020-09-01'] #rows = 2460
  Saving to /home/araichev/mrcagney_projects/affordability_nz/data/processed/wellington/rents.csv


# JSONize regional rents for the web, grouping by rental area and number of bedrooms

In [12]:
# For every region, build the JSON-ready rents dictionary, echo it, and save it
for region in hp.REGIONS:
    region_rents = hp.get_data('rents', region)
    d = hp.build_json_rents(region_rents)
    path = hp.get_path('rents_json', region)
    print('**', region, '\n', d)
    serialized = json.dumps(d)
    with path.open('w') as out_file:
        out_file.write(serialized)

    

  d['rent_mean'] = (group['rent_mean']*group['rent_count']).sum()/\


** auckland 
 {'Albany': {'1': 285.0, '2': 510.0, '3': 635.0, '4': 705.0}, 'Avondale': {'1': 227.0, '2': 410.0, '3': 589.0, '4': 734.0}, 'Balmoral': {'1': 421.0, '2': 517.0, '3': 736.0, '4': 922.0}, 'Beachhaven/Birkdale': {'1': None, '2': 475.0, '3': 582.0, '4': 662.0}, 'Blockhouse Bay/New Windsor': {'1': None, '2': 488.0, '3': 585.0, '4': 690.0}, 'Botony Downs': {'1': None, '2': None, '3': 641.0, '4': 732.0}, 'Browns Bay': {'1': 429.0, '2': 537.0, '3': 632.0, '4': 704.0}, 'Bucklands Beach': {'1': None, '2': 528.0, '3': 661.0, '4': 856.0}, 'Central East': {'1': 367.0, '2': 527.0, '3': 749.0, '4': None}, 'Central West': {'1': 384.0, '2': 500.0, '3': 726.0, '4': None}, 'Chatswood/Birkenhead/Northcote Point': {'1': 205.0, '2': 514.0, '3': 666.0, '4': 960.0}, 'Dannemora': {'1': None, '2': 528.0, '3': 654.0, '4': 727.0}, 'Devonport': {'1': 456.0, '2': 586.0, '3': 723.0, '4': 1122.0}, 'East Coast Bays': {'1': None, '2': 567.0, '3': 685.0, '4': 933.0}, 'Eden Terrace': {'1': 402.0, '2': 550.0,

** wellington 
 {'Brooklyn': {'1': 444.0, '2': 557.0, '3': 718.0, '4': None}, 'Carterton/South Wairarapa': {'1': None, '2': 321.0, '3': 406.0, '4': None}, 'Eastern Bays': {'1': None, '2': 471.0, '3': 630.0, '4': None}, 'Epuni/Avalon': {'1': 204.0, '2': 454.0, '3': 588.0, '4': 674.0}, 'Hataitai': {'1': 375.0, '2': 511.0, '3': 754.0, '4': 839.0}, 'Heretaunga/Silverstream': {'1': None, '2': 365.0, '3': 585.0, '4': None}, 'Hutt Central/Waterloo': {'1': 315.0, '2': 493.0, '3': 591.0, '4': None}, 'Island Bay/Melrose': {'1': 278.0, '2': 476.0, '3': 671.0, '4': None}, 'Johnsonville/Newlands': {'1': 203.0, '2': 494.0, '3': 602.0, '4': 716.0}, 'Karori': {'1': 395.0, '2': 533.0, '3': 640.0, '4': 721.0}, 'Karori South/Makara': {'1': None, '2': 474.0, '3': 613.0, '4': None}, 'Kelburn/Aro Valley': {'1': 329.0, '2': 506.0, '3': 749.0, '4': 838.0}, 'Khandallah': {'1': None, '2': None, '3': 677.0, '4': 920.0}, 'Kilbirnie/Lyall Bay': {'1': 436.0, '2': 501.0, '3': 665.0, '4': 747.0}, 'Kingston/Happy Vall