In [1]:
from pathlib import Path
import json
from functools import reduce
import math
import datetime as dt
import pytz 
from itertools import product
from collections import OrderedDict
import time

import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import shapely.ops as so

# Project root, given relative to the notebook's working directory
ROOT = Path('../')
# Raw inputs are read from DATA_DIR/'raw'; derived files are written under DATA_DIR
DATA_DIR = ROOT/'data'

# Coordinate reference systems as PROJ-style dicts keyed by EPSG code.
# CRS_NZGD49 is assigned to the area unit Shapefile and CRS_WGS84 is the
# reprojection target; CRS_NZTM is not referenced in the cells visible here.
# NOTE(review): the {'init': ...} form is deprecated by newer pyproj/geopandas
# releases — confirm against the installed versions before changing it.
CRS_NZGD49 = {'init': 'epsg:27200', 'no_defs': True}
CRS_NZTM = {'init': 'epsg:2193', 'no_defs': True}
CRS_WGS84 = {'init': 'epsg:4326'}

%matplotlib inline

# Prepare table of 2001 area units and rental area units

In [None]:
# Load the 2001 census area unit table, keeping area unit codes as strings,
# and normalise its column names
au_columns = {
    'SAU': 'au2001',
    'SAU.Desc': 'au_name',
    'TA': 'territory',
    'Region': 'region',
}
path = DATA_DIR/'raw'/'Geographical Table.csv'
f = pd.read_csv(path, dtype={'SAU': str}).rename(columns=au_columns)
del f['Water']
f.head()

# Load the market rent area table the same way
rental_columns = {
    'SAU': 'au2001',
    'MARKET RENT DESCRIPTION': 'rental_area',
    'TA': 'territory',
    'AU NAME': 'au_name',
}
path = DATA_DIR/'raw'/'Market Rent Areas.csv'
g = pd.read_csv(path, dtype={'SAU': str}).rename(columns=rental_columns)
# Clean rental areas
def clean(x):
    """
    Extract the rental area name from a market rent description.

    Descriptions are split on ``' - '``.  The second part is returned,
    unless it contains the word 'District', in which case the first
    part is returned instead.

    A description without the ``' - '`` separator is returned unchanged,
    and non-string values (for example NaN from an empty cell) are
    passed through as they are, so mapping this over a column never
    raises on incomplete rows.
    """
    if not isinstance(x, str):
        return x

    parts = x.split(' - ')
    if len(parts) < 2:
        # Nothing to choose between; keep the description as is
        return x

    first, second = parts[0], parts[1]
    return first if 'District' in second else second

# Reduce every market rent description to its rental area name
rental_names = g['rental_area'].map(clean)
g['rental_area'] = rental_names

# Attach each area unit's rental area; the merge uses the columns the
# two tables have in common
rental_lookup = g[['au2001', 'rental_area']]
f = f.merge(rental_lookup)

# Save the combined area unit table for the later sections
path = DATA_DIR/'au2001.csv'
f.to_csv(str(path), index=False)
f.head()

# Prepare geodata as GeoJSON

In [None]:
# Read the simplified area unit Shapefile
path = DATA_DIR/'raw'/'NZ_AU01_region_simplified'/'NZ_AU01_region.shp'
au = gpd.read_file(str(path))

# Set the source CRS explicitly, then reproject and tidy the column names
au.crs = CRS_NZGD49
au = (
    au.to_crs(CRS_WGS84)
    .rename(columns={'AU01': 'au2001', 'AU_DESC': 'au_name'})
)

# Quick look at the result
print(au.shape)
preview = au.head()
print(preview)
au.head().plot()


In [None]:
# Remove water area units: any area unit whose name mentions one of the
# words in the pattern below, ignoring case
pattern = r'ocean|strait|inlet|harbour'
is_water = au['au_name'].str.contains(pattern, case=False)
au = au.loc[~is_water].copy()

print(au.shape)
au.head().plot()


In [None]:
# Merge geodata and metadata, drop null regions, and write to file

f = pd.read_csv(DATA_DIR/'au2001.csv', dtype={'au2001': str})

# Only these metadata columns are joined onto the geometries
metadata = f[['au2001', 'territory', 'region', 'rental_area']]
g = au.merge(metadata)

has_region = g['region'].notnull()
g = g[has_region].copy()

path = DATA_DIR/'au2001.geojson'
path.write_text(g.to_json())

g.head()

# Create geodata for rental areas 

In [None]:
# Dissolve area units into one geometry per rental area

au = gpd.read_file(str(DATA_DIR/'au2001.geojson'))

keep = ['rental_area', 'region', 'territory', 'geometry']
ra = au[keep].dissolve(by='rental_area')
# dissolve() moves 'rental_area' into the index; bring it back as a column
ra = ra.reset_index()

path = DATA_DIR/'rental_areas.geojson'
path.write_text(ra.to_json())

ra.head()

# Choose representative points for rental areas using approximate centroids of property titles

In [None]:
# Load the rental area polygons and the property titles
ra = gpd.read_file(str(DATA_DIR/'rental_areas.geojson'))

path = DATA_DIR/'property_titles.geojson'
t = gpd.read_file(str(path))
t.head()

In [5]:
# Spatial-join titles to rental areas
# Only the title geometry and 'fid' are kept on the left side; the join is
# timed with %time because it runs over every property title.
# NOTE(review): newer geopandas releases spell `op=` as `predicate=` —
# confirm against the installed version before re-running.

%time f = gpd.sjoin(t[['geometry', 'fid']], ra, op='intersects')
f.head()

NameError: name 't' is not defined

Unnamed: 0,au2001,property_type,#bedrooms,quarter,rent_count,rent_mean,rent_geo_mean,rent_synthetic_lower_quartile,rent_synthetic_upper_quartile,au_name,territory,region,rental_area
550378,563701,Flat or Apartment,1,1993-03-01,,,,,,Waikanae Beach,Kapiti Coast District,Wellington,Waikanae/Otaki
550379,563701,Flat or Apartment,2,1993-03-01,,,,,,Waikanae Beach,Kapiti Coast District,Wellington,Waikanae/Otaki
550380,563701,House,1,1993-03-01,,,,,,Waikanae Beach,Kapiti Coast District,Wellington,Waikanae/Otaki
550381,563701,House,2,1993-03-01,6.0,128.0,127.0,115.0,139.0,Waikanae Beach,Kapiti Coast District,Wellington,Waikanae/Otaki
550382,563701,House,3,1993-03-01,8.0,159.0,158.0,142.0,175.0,Waikanae Beach,Kapiti Coast District,Wellington,Waikanae/Otaki


In [None]:
# Choose representative points for rental areas

def pt(group):
    """
    Summarise one group of rows as a single representative point.

    The group's geometries are unioned and a representative point of
    that union is returned under 'geometry', together with the
    'territory' and 'region' values of the group's first row.
    """
    merged = so.unary_union(group['geometry'])
    return pd.Series({
        'geometry': merged.representative_point(),
        'territory': group['territory'].iat[0],
        'region': group['region'].iat[0],
    })

# One representative point per rental area, written out as GeoJSON
points = f.groupby('rental_area').apply(pt).reset_index()
g = gpd.GeoDataFrame(points)

path = DATA_DIR/'rental_points.geojson'
path.write_text(g.to_json())

g.head()

# Prepare rent data

In [None]:
# Reshape and merge all rent data sets

def clean(f, name):
    """
    Reshape one wide rent table into long format.

    NOTE: this two-argument function reuses the name ``clean`` and so
    shadows the one-argument helper defined earlier in the notebook for
    rental area descriptions.

    Parameters
    ----------
    f : DataFrame
        Wide table with the columns 'SAU', 'Property_Type' and
        'Bedrooms' plus one column per quarter.  Quarter columns are
        recognised by a '-' in their name.
    name : str
        Name given to the value column of the result.

    Returns
    -------
    DataFrame
        Long table with the columns 'au2001', 'property_type',
        '#bedrooms', 'quarter' and ``name``.  Rows whose key columns
        mention 'total' (case-insensitive) are dropped as subtotals.
        The input frame is not modified.
    """
    key_cols = ['au2001', 'property_type', '#bedrooms']

    # rename() already returns a new frame, so the caller's data is untouched
    f = f.rename(columns={
        'SAU': 'au2001',
        'Property_Type': 'property_type',
        'Bedrooms': '#bedrooms'
    })

    # Drop subtotals.  Keys are cast to str first so that missing cells and
    # numeric columns yield a plain False instead of NaN or an error.
    is_subtotal = pd.Series(False, index=f.index)
    for col in key_cols:
        is_subtotal |= f[col].astype(str).str.contains('total', case=False)
    f = f[~is_subtotal].copy()

    # Reshape: one row per key combination and quarter
    value_vars = [c for c in f.columns if '-' in c]
    return pd.melt(f, id_vars=key_cols, value_vars=value_vars,
      var_name='quarter', value_name=name)

# (value column, raw file name) for each wide rent table, in merge order
sources = [
    ('rent_count', 'Detailed Bonds Lodged.csv'),
    ('rent_mean', 'Detailed Mean Rents.csv'),
    ('rent_geo_mean', 'Detailed Geomean Rents.csv'),
    ('rent_synthetic_lower_quartile', 'Detailed Synthetic Lower Quartile Rents.csv'),
    ('rent_synthetic_upper_quartile', 'Detailed Synthetic Upper Quartile Rents.csv'),
]
paths = [DATA_DIR/'raw'/filename for _, filename in sources]
names = [column for column, _ in sources]

# Read each table with area unit codes as strings and reshape it to long form
frames = [
    clean(pd.read_csv(src, dtype={'SAU': str}), column)
    for src, column in zip(paths, names)
]

# Fold the long tables together, merging on the columns they share
f = reduce(pd.merge, frames)

# Merge in region data
g = pd.read_csv(DATA_DIR/'au2001.csv', dtype={'au2001': str})
f = f.merge(g)

# Write to file
path = DATA_DIR/'rents.csv'
f.to_csv(str(path), index=False)
has_count = f['rent_count'].notnull()
f[has_count].head()

# Explore rents

In [105]:
# Reload the merged rents table, keeping area unit codes as strings
rents = pd.read_csv(DATA_DIR/'rents.csv', dtype={'au2001': str})
rents.head()


Unnamed: 0,au2001,property_type,#bedrooms,quarter,rent_count,rent_mean,rent_geo_mean,rent_synthetic_lower_quartile,rent_synthetic_upper_quartile,au_name,territory,region,rental_area
0,500100,House,2,1993-03-01,,,,,,Awanui,Far North District,Northland,Rural Far North
1,500100,House,3,1993-03-01,,,,,,Awanui,Far North District,Northland,Rural Far North
2,500100,House,2,1993-06-01,,,,,,Awanui,Far North District,Northland,Rural Far North
3,500100,House,3,1993-06-01,,,,,,Awanui,Far North District,Northland,Rural Far North
4,500100,House,2,1993-09-01,,,,,,Awanui,Far North District,Northland,Rural Far North


In [106]:
# Slice in time and aggregate 

def hits(group):
    """
    Return the fraction of rows in ``group`` whose 'rent_count' is not null.

    The result is a one-element Series labelled 'hit_frac'.
    """
    counts = group['rent_count']
    n_present = len(counts.dropna())
    n_rows = len(counts)
    return pd.Series({'hit_frac': n_present/n_rows})

def aggregate_rents(f, date, groupby_cols=('rental_area', '#bedrooms')):
    """
    Aggregate rent statistics over all rows with 'quarter' >= ``date``.

    Parameters
    ----------
    f : DataFrame
        Rent table with the columns 'quarter', 'territory', 'region',
        'rent_count', 'rent_mean' and 'rent_geo_mean', plus every column
        named in ``groupby_cols``.
    date : str
        Inclusive lower bound, compared directly against 'quarter'.
    groupby_cols : str or sequence of str
        Column(s) to group by.

    Returns
    -------
    DataFrame
        One row per group holding the grouping columns and

        - 'territory', 'region': values from the group's first row
        - 'rent_count': sum of the group's rent counts
        - 'rent_mean': rent means weighted by rent count
        - 'rent_geo_mean': geometric rent means weighted by rent count

        A group without any rent count gets NaN in all three rent
        columns rather than a count of 0 or a geometric mean of 1.

    Notes
    -----
    Uses the ``min_count`` argument of ``sum``/``prod``, so it needs
    pandas 0.22 or later.
    """
    # Current pandas reads a tuple passed to groupby as ONE key, so always
    # hand it a list; a bare string is a single column, not a list of letters.
    if isinstance(groupby_cols, str):
        keys = [groupby_cols]
    else:
        keys = list(groupby_cols)

    recent = f[f['quarter'] >= date]

    def my_agg(group):
        counts = group['rent_count']
        # min_count=1 keeps all-NaN groups as NaN instead of 0 (sum) or 1 (prod)
        total = counts.sum(min_count=1)
        weights = counts/total
        return pd.Series({
            'territory': group['territory'].iat[0],
            'region': group['region'].iat[0],
            'rent_count': total,
            'rent_mean': (group['rent_mean']*counts).sum(min_count=1)/total,
            'rent_geo_mean': (group['rent_geo_mean']**weights).prod(min_count=1),
        })

    return recent.groupby(keys).apply(my_agg).reset_index()

# Aggregate every quarter from 2016-12-01 onwards with the default grouping
agg_rents = aggregate_rents(rents, date='2016-12-01')
agg_rents.head()

Unnamed: 0,rental_area,#bedrooms,region,rent_count,rent_geo_mean,rent_mean,territory
0,Addington,1,Canterbury,133.0,214.863943,218.654135,Christchurch City
1,Addington,2,Canterbury,56.0,339.402991,342.535714,Christchurch City
2,Addington,3,Canterbury,54.0,414.203127,418.796296,Christchurch City
3,Addington,4,Canterbury,5.0,488.0,494.0,Christchurch City
4,Addington,5+,Canterbury,,,,Christchurch City


In [111]:
# What fraction of rental data do we have by #bedrooms?
# The same check is run at two spatial resolutions for the Auckland region.

date = '2016-09-01'
groupings = [
    ('census area units', ('au2001', '#bedrooms')),
    ('rental area units', ('rental_area', '#bedrooms')),
]
for label, cols in groupings:
    f = aggregate_rents(rents, date, groupby_cols=cols)
    cond = f['region'] == 'Auckland'
    auckland = f[cond].copy()
    print(label)
    print(auckland.groupby('#bedrooms').apply(hits).reset_index())


census area units
  #bedrooms  hit_frac
0         1  0.396226
1         2  0.662791
2         3  0.843750
3         4  0.602305
4        5+  0.115016
rental area units
  #bedrooms  hit_frac
0         1  0.767677
1         2  0.959596
2         3  1.000000
3         4  0.919192
4        5+  0.252525


# Prepare regional slices of data

In [4]:
# Reload the national outputs produced by the earlier sections
ra = gpd.read_file(str(DATA_DIR/'rental_areas.geojson'))
rap = gpd.read_file(str(DATA_DIR/'rental_points.geojson'))
rents = pd.read_csv(DATA_DIR/'rents.csv', dtype={'au2001': str})

regions = ['auckland', 'canterbury', 'wellington']
for region in regions:
    # Each region gets its own directory under DATA_DIR
    root = DATA_DIR/region
    if not root.exists():
        root.mkdir()

    # The 'region' column is matched against the capitalised name
    region_c = region.capitalize()

    # Rental areas slice and rental area points slice
    geo_layers = (
        (ra, 'rental_areas.geojson'),
        (rap, 'rental_points.geojson'),
    )
    for layer, filename in geo_layers:
        f = layer[layer['region'] == region_c].copy()
        (root/filename).write_text(f.to_json())

    # Rents slice
    f = rents[rents['region'] == region_c].copy()
    path = root/'rents.csv'
    f.to_csv(str(path), index=False)
    

In [None]:
# Make sure every regional output directory exists.
# `path` points at each region's rental points file; nothing in this cell
# reads it yet.
regions = [
    'auckland', 
    'canterbury', 
    'wellington',
]
for region in regions:
    path = DATA_DIR/region/'rental_points.geojson'
    new_path = DATA_DIR/region
    # Check the directory built in this iteration, not a `root` variable
    # left over from another cell's loop
    if not new_path.exists():
        new_path.mkdir()
