In [30]:
from pathlib import Path
import json
from functools import reduce
import math
import datetime as dt
import pytz 
from itertools import product
from collections import OrderedDict
import time
import sys

import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.ops as so
from requests_html import HTMLSession
    
import helpers as hp

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
def get_rent_data_urls():
    """
    Scrape MBIE's website to get the location of three files of rental bond data.
    Return a list of three dictionaries, each with the following keys and values.

    - ``'kind'``: the kind of data; one of 'rent_count', 'rent_mean', 'rent_geo_mean'
    - ``'filename'``: the name of the CSV data file
    - ``'url'``: the URL of the CSV data file
    - ``'target_path'``: the local path, under ``hp.DATA_DIR/'collected'``,
      where the CSV data file is to be saved

    Raise an ``IndexError`` naming the missing file if the page contains
    no link to one of the three files.
    """
    src_url = "https://www.mbie.govt.nz/building-and-energy/tenancy-and-housing/rental-bond-data/"

    # (kind, filename) pairs, in the order the results are returned
    files = [
        ("rent_count", "detailed-lodged-bonds.csv"),
        ("rent_mean", "detailed-mean-rents.csv"),
        ("rent_geo_mean", "detailed-geo-mean-rents.csv"),
    ]

    # Scrape the HTML page above to get the data urls of interest.
    # The ``with`` block closes the session once the links are extracted.
    with HTMLSession() as session:
        r = session.get(src_url)
        # Sorted so that the link chosen below does not depend on the
        # iteration order of the link collection
        urls = sorted(
            link for link in r.html.absolute_links
            if "Quarterly" in link
            and link.split("/")[-1].startswith("detailed-")
        )

    # Filter urls to finer set and include some metadata
    result = []
    for kind, filename in files:
        matches = [u for u in urls if u.endswith(filename)]
        if not matches:
            # Same exception type as indexing an empty list, but with a
            # message that says what was being looked for and where
            raise IndexError(
                "No link ending in {!r} found at {} (HTTP status {})".format(
                    filename, src_url, r.status_code
                )
            )
        result.append({
            "kind": kind,
            "filename": filename,
            "url": matches[0],
            "target_path": hp.DATA_DIR/'collected'/filename,
        })

    return result


# Download rent data

In [32]:
data = get_rent_data_urls()

for d in data:
    # A timeout stops a stalled connection from hanging the notebook forever
    r = requests.get(d['url'], timeout=60)
    if not r.ok:
        print("Failed to get", d['filename'])
    else:
        print("Getting and saving", d['filename'])
        path = d["target_path"]
        # Make sure the download directory exists before writing into it
        path.parent.mkdir(parents=True, exist_ok=True)
        # Save the raw bytes: going through ``r.text`` would decode with a
        # guessed charset and re-encode with the platform default, which can
        # alter non-ASCII characters and line endings
        with path.open('wb') as tgt:
            tgt.write(r.content)


Getting and saving detailed-lodged-bonds.csv
Getting and saving detailed-mean-rents.csv
Getting and saving detailed-geo-mean-rents.csv


# Process rent data

In [33]:
# Reshape and merge all rent data sets

def clean(f, kind):
    """
    Reshape a wide rent table into long format.

    Given a DataFrame ``f`` with the columns 'SAU', 'Property_Type',
    'Bedrooms', and one column per quarter (every column whose name
    contains a hyphen is treated as a quarter column), do the following.

    - Rename the three identifier columns to 'au2001', 'property_type',
      and 'num_bedrooms'.
    - Drop every row in which any identifier contains the text 'total'
      (case-insensitive), that is, the subtotal rows.
    - Melt the quarter columns into the two columns 'quarter' and ``kind``.

    Return the resulting DataFrame with the columns
    'au2001', 'property_type', 'num_bedrooms', 'quarter', and ``kind``.
    The input DataFrame is not modified.
    """
    id_vars = ['au2001', 'property_type', 'num_bedrooms']
    f = f.rename(columns={
        'SAU': 'au2001',
        'Property_Type': 'property_type',
        'Bedrooms': 'num_bedrooms'
    })

    # Drop subtotals.
    # Cast to string so that an identifier column parsed as numbers does not
    # break the ``.str`` accessor, and treat missing values as non-matches so
    # that the mask stays strictly boolean.
    is_subtotal = pd.Series(False, index=f.index)
    for col in id_vars:
        is_subtotal |= (
            f[col].astype(str).str.contains('total', case=False, na=False)
        )
    f = f.loc[~is_subtotal]

    # Reshape
    value_vars = [c for c in f.columns if '-' in c]
    return pd.melt(
        f,
        id_vars=id_vars,
        value_vars=value_vars,
        var_name='quarter',
        value_name=kind,
    )

# Read each collected CSV, reshape it with clean(), and preview the result
data = get_rent_data_urls()
frames = []
for d in data:
    path = d["target_path"]
    print(path)
    f = clean(pd.read_csv(path, dtype={'SAU': str}), d["kind"])
    frames.append(f)
    display(f.tail())
    print("latest two quarters =", f['quarter'].unique()[-2:].tolist())

# Fold the frames together with pandas' default merge, that is, an inner
# join on the columns they have in common
f = reduce(pd.merge, frames)

# Merge in the region data
path = hp.get_path('au2001_csv')
g = pd.read_csv(path, dtype={'au2001': str})
f = pd.merge(f, g)

# Save the merged table
path = hp.get_path('rents')
f.to_csv(path, index=False)

# Show the last few rows, ordered by quarter, that have a rent count
f.sort_values("quarter").dropna(subset=['rent_count']).tail()

/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-lodged-bonds.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_count
1073485,612802,House,2,2020-06-01,
1073486,612802,House,3,2020-06-01,
1073487,612802,House,4,2020-06-01,
1073488,613000,House,2,2020-06-01,
1073489,613000,House,3,2020-06-01,


latest two quarters = ['2020-03-01', '2020-06-01']
/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-mean-rents.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_mean
1073485,612802,House,2,2020-06-01,
1073486,612802,House,3,2020-06-01,
1073487,612802,House,4,2020-06-01,
1073488,613000,House,2,2020-06-01,
1073489,613000,House,3,2020-06-01,


latest two quarters = ['2020-03-01', '2020-06-01']
/home/araichev/mrcagney_projects/affordability_nz/data/collected/detailed-geo-mean-rents.csv


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_geo_mean
1061110,612802,House,2,2020-03-01,
1061111,612802,House,3,2020-03-01,
1061112,612802,House,4,2020-03-01,
1061113,613000,House,2,2020-03-01,
1061114,613000,House,3,2020-03-01,


latest two quarters = ['2019-12-01', '2020-03-01']


Unnamed: 0,au2001,property_type,num_bedrooms,quarter,rent_count,rent_mean,rent_geo_mean,au_name,territory,region,rental_area
265630,521301,House,3,2020-03-01,5.0,642.0,641.0,Drury,Papakura District,Auckland,Papakura/Drury/Karaka
610507,558900,House,3,2020-03-01,7.0,285.0,284.0,Bulls,Rangitikei District,Manawatu-Wanganui,Rangitikei
612686,559500,House,3,2020-03-01,9.0,309.0,305.0,Marton,Rangitikei District,Manawatu-Wanganui,Rangitikei
24413,502101,House,3,2020-03-01,10.0,491.0,489.0,Kamo West,Whangarei District,Northland,Kamo/Tikipunga/Kensington
611814,559400,House,3,2020-03-01,5.0,257.0,253.0,Taihape,Rangitikei District,Manawatu-Wanganui,Rangitikei


In [34]:
# Show the latest 2 quarters as reported by the helper module.
# The value is displayed as the cell's output rather than printed.
hp.get_latest_quarters(2)

['2019-12-01', '2020-03-01']

# Explore rents

In [16]:
# Load the saved rents and summarise the last quarter listed in them
rents = hp.get_data('rents')
quarters = rents['quarter'].unique()
is_last_quarter = rents['quarter'] == quarters[-1]
rents[is_last_quarter].describe()

Unnamed: 0,rent_count,rent_mean,rent_geo_mean
count,1884.0,1884.0,1872.0
mean,11.572187,510.495754,502.678953
std,16.100111,190.338641,187.067216
min,5.0,90.0,79.0
25%,6.0,389.75,383.75
50%,8.0,481.0,476.0
75%,12.0,597.0,592.0
max,297.0,1612.0,1577.0


In [None]:
# Slice in time and aggregate.
# Uses the `rents` table loaded in the cell above and the helper's default
# grouping columns; only the first rows of the result are displayed.
# NOTE(review): how the helper interprets the date '2019-09-01' is not
# visible in this notebook -- confirm against helpers.aggregate_rents.
agg_rents = hp.aggregate_rents(rents, '2019-09-01')
agg_rents.head()

In [None]:
# Aggregate rents by area unit and number of bedrooms, then keep only the
# rows whose region is 'Auckland'. The result is displayed, not stored.
(
    hp.aggregate_rents(rents, '2018-06-01', groupby_cols=('au2001', 'num_bedrooms'))
    .loc[lambda x: x.region == 'Auckland']
)

In [None]:
# What fraction of rental data do we have by num_bedrooms?

def hits(group):
    """
    Return a one-item Series with the key 'hit_frac': the fraction of rows
    of ``group`` whose 'rent_mean' value is not missing.
    """
    num_present = group['rent_mean'].notna().sum()
    num_rows = len(group['rent_mean'])
    return pd.Series({'hit_frac': num_present/num_rows})

date = '2018-06-01'

# Same report at two spatial groupings: printed label -> grouping column
for label, area_col in [
    ('census area units', 'au2001'),
    ('rental area units', 'rental_area'),
]:
    f = hp.aggregate_rents(rents, date, groupby_cols=(area_col, 'num_bedrooms'))
    cond = f['region'] == 'Auckland'
    print(label)
    print(f[cond].copy().groupby('num_bedrooms').apply(hits).reset_index())


# Select latest two quarters and slice into regional chunks

In [None]:
# Get latest two quarters of rents
start_date = hp.get_latest_quarters(2)[0]
rents = hp.get_data('rents')
rents = rents.loc[lambda x: x.quarter >= start_date].copy()

# Lowercase the region names once so that each region in the loop below
# can be matched case-insensitively
region_lower = rents['region'].str.lower()

# Create regional slices
for region in hp.REGIONS:
    # Build rents.
    # Compare case-insensitively: str.capitalize() lowercases everything
    # after the first character, so it can never equal a name such as
    # 'Manawatu-Wanganui'.
    region_rents = rents.loc[region_lower == region.lower()].copy()
    print(region, region_rents['quarter'].unique(), '#rows =', region_rents.shape[0])
    path = hp.get_path('rents', region)
    print('  Saving to', path)

    # Create region directory (and any missing parents) if it does not exist
    path.parent.mkdir(parents=True, exist_ok=True)

    region_rents.to_csv(path, index=False)
    

# JSONize regional rents for the web, grouping by rental area and number of bedrooms

In [None]:
for region in hp.REGIONS:
    # Build the JSON payload from this region's saved rents
    d = hp.build_json_rents(hp.get_data('rents', region))
    path = hp.get_path('rents_json', region)
    print('**', region, '\n', d)
    # Serialize first, then write the text out in one go
    payload = json.dumps(d)
    with path.open('w') as tgt:
        tgt.write(payload)

    