In [38]:
import datetime as dt
import json
import math
import os
import warnings
from functools import reduce
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
import shapely.ops as so

# Root directory for raw inputs and derived outputs, relative to this notebook
DATA_DIR = Path('../data')

# Coordinate reference systems as PROJ-style dicts keyed by EPSG code.
# NOTE(review): the {'init': ...} dict form is reportedly deprecated in recent
# pyproj/geopandas releases -- confirm against the installed versions.
# CRS_NZGD49 is assigned to the area unit Shapefile on load, CRS_WGS84 is the
# CRS the geodata is reprojected to, and CRS_NZTM is not referenced in the
# visible cells.
CRS_NZGD49 = {'init': 'epsg:27200', 'no_defs': True}
CRS_NZTM = {'init': 'epsg:2193', 'no_defs': True}
CRS_WGS84 = {'init': 'epsg:4326'}

%matplotlib inline

In [None]:
# Prepare area unit table

# Geographical table: read the area unit codes as strings, switch to the
# column names used throughout this notebook, and discard the 'Water' column
au_renames = {
    'SAU': 'au2001',
    'SAU.Desc': 'au_name',
    'TA': 'territory',
    'Region': 'region',
}
path = DATA_DIR/'raw'/'Geographical Table.csv'
f = pd.read_csv(path, dtype={'SAU': str}).rename(columns=au_renames)
del f['Water']
f.head()

# Market rent areas table: same treatment, keeping every column
ra_renames = {
    'SAU': 'au2001',
    'MARKET RENT DESCRIPTION': 'rental_area',
    'TA': 'territory',
    'AU NAME': 'au_name',
}
path = DATA_DIR/'raw'/'Market Rent Areas.csv'
g = pd.read_csv(path, dtype={'SAU': str}).rename(columns=ra_renames)

# Clean rental areas
def clean(x):
    """
    Reduce a market rent description to a short rental area name.

    The description is split on ``' - '``. Return the second part, unless it
    contains ``'District'``, in which case return the first part.
    A description without the separator is returned unchanged, and so is any
    non-string value (for example a missing value), so that this function can
    be mapped over a whole column without raising.

    NOTE: a later cell defines another function named ``clean`` with a
    different signature, which replaces this one once that cell has run.
    """
    # Missing values arrive as floats (NaN); pass them through untouched
    if not isinstance(x, str):
        return x

    parts = x.split(' - ')
    # No separator means there is nothing to choose between
    if len(parts) < 2:
        return x

    return parts[0] if 'District' in parts[1] else parts[1]

# Replace each raw market rent description with its cleaned rental area name
g['rental_area'] = g['rental_area'].map(clean)


# Attach the rental area to each area unit. No ``on`` argument is given, so
# pandas joins on the columns the two frames have in common.
# NOTE(review): the default merge is an inner join, so area units absent from
# the market rent table are presumably dropped here -- confirm that is intended.
f = f.merge(g[['au2001', 'rental_area']])

# Save the combined lookup table; later cells read it back from this path
path = DATA_DIR/'au2001.csv'
f.to_csv(str(path), index=False)
f.head()

# Prepare geodata as GeoJSON

In [None]:
# Read Shapefile

# Area unit boundary Shapefile
path = DATA_DIR/'raw'/'NZ_AU01_region_simplified'/'NZ_AU01_region.shp'
au = gpd.read_file(str(path))
# Set the CRS explicitly, overriding whatever was read from the file, then
# reproject to WGS84.
# NOTE(review): assumes the Shapefile coordinates really are EPSG:27200 -- confirm
au.crs = CRS_NZGD49
au = au.to_crs(CRS_WGS84)
# Use the same key and name columns as the area unit table
au = au.rename(columns={'AU01': 'au2001', 'AU_DESC': 'au_name'})
# Sanity checks: dimensions, first rows, and a plot of the first few geometries
print(au.shape)
print(au.head())
au.head().plot()


In [None]:
# Remove water area units

# Regular expression for the words that mark an area unit as water
pattern = r'ocean|strait|inlet|harbour'
# True where the area unit name contains any of those words, ignoring case
cond = au['au_name'].str.contains(pattern, case=False)
# Keep only the area units whose name does not match
# NOTE(review): a null ``au_name`` would put a missing value in ``cond`` --
# confirm names are never missing
au = au[~cond].copy()
print(au.shape)
au.head().plot()


In [None]:
# Merge geodata and metadata, drop null regions, and write to file

# Reload the area unit lookup table written by the first cell
path = DATA_DIR/'au2001.csv'
f = pd.read_csv(path, dtype={'au2001': str})

# Attach territory, region and rental area to each boundary. No ``on``
# argument is given, so the join uses the columns both frames share.
g = au.merge(f[['au2001', 'territory', 'region', 'rental_area']])
# Drop the area units that have no region
g = g[g['region'].notnull()].copy()

# Serialise the result as GeoJSON text
path = DATA_DIR/'au2001.geojson'
with path.open('w') as tgt:
    tgt.write(g.to_json())

g.head()

# Create geodata for rental areas 

In [None]:
# Dissolve area units by area unit group

# Read back the area unit geodata written by the previous section
path = DATA_DIR/'au2001.geojson'
au = gpd.read_file(str(path))

# Merge the geometries of all area units that share a rental area into a
# single feature per rental area.
# NOTE(review): 'region' and 'territory' are combined with dissolve's default
# aggregation (presumably the first value per group) -- confirm this is
# acceptable for rental areas that span more than one territory.
ra = au[['rental_area', 'region', 'territory', 'geometry']].dissolve(by='rental_area').reset_index()

# Serialise the rental areas as GeoJSON text
path = DATA_DIR/'rental_areas.geojson'
with path.open('w') as tgt:
    tgt.write(ra.to_json())

ra.head()

# Prepare rent data

In [None]:
# Reshape and merge all rent data sets

def clean(f, name):
    """
    Reshape one raw rent table from wide to long format.

    Given a DataFrame ``f`` with the columns ``'SAU'``, ``'Property_Type'``,
    ``'Bedrooms'`` and one column per quarter (every column whose label
    contains ``'-'``), return a new DataFrame with the columns

    - ``'au2001'``, ``'property_type'``, ``'#bedrooms'``: the renamed ID columns
    - ``'quarter'``: the label of the source column
    - ``name``: the value found in that column

    Rows in which any ID column contains the text 'total' (ignoring case) are
    treated as subtotals and dropped. The input DataFrame is not modified.

    NOTE: this redefines the single-argument ``clean`` from the area unit
    table cell; after this cell runs, the name refers to this function.
    """
    id_vars = ['au2001', 'property_type', '#bedrooms']
    # ``rename`` returns a new frame, so the caller's frame is left alone
    f = f.rename(columns={
        'SAU': 'au2001',
        'Property_Type': 'property_type',
        'Bedrooms': '#bedrooms'
    })

    # Drop subtotals. The ID columns are compared as text so that a numeric
    # column (for example an all-integer bedroom count) or a missing value
    # does not break the ``.str`` accessor; missing values never match.
    is_subtotal = pd.Series(False, index=f.index)
    for col in id_vars:
        is_subtotal |= f[col].astype(str).str.contains('total', case=False, na=False)
    f = f[~is_subtotal].copy()

    # Reshape: the quarter columns are the ones with a '-' in their label
    value_vars = [c for c in f.columns if '-' in str(c)]
    return pd.melt(f, id_vars=id_vars, value_vars=value_vars,
      var_name='quarter', value_name=name)

# Each raw rent table contributes one value column; map that column's name
# to the file it comes from
sources = {
    'rent_count': 'Detailed Bonds Lodged.csv',
    'rent_mean': 'Detailed Mean Rents.csv',
    'rent_geo_mean': 'Detailed Geomean Rents.csv',
    'rent_synthetic_lower_quartile': 'Detailed Synthetic Lower Quartile Rents.csv',
    'rent_synthetic_upper_quartile': 'Detailed Synthetic Upper Quartile Rents.csv',
}
names = list(sources)
paths = [DATA_DIR/'raw'/filename for filename in sources.values()]

# Reshape every table to long format, then join them one after another on
# the columns they share
frames = [
    clean(pd.read_csv(path, dtype={'SAU': str}), name)
    for path, name in zip(paths, names)
]
f = reduce(pd.merge, frames)

# Merge in region data
path = DATA_DIR/'au2001.csv'
g = pd.read_csv(path, dtype={'au2001': str})
f = f.merge(g)

# Write to file, then preview rows that actually have a rent count
path = DATA_DIR/'rents.csv'
f.to_csv(str(path), index=False)
f.dropna(subset=['rent_count']).head()

# Explore rents

In [None]:
# Reload the merged rent table, keeping area unit codes as strings
path = DATA_DIR/'rents.csv'
f = pd.read_csv(path, dtype={'au2001': str})
f.head()


In [None]:
# Slice in time and aggregate 

def aggregate_rents(f, date, groupby_cols=('rental_area', '#bedrooms')):
    """
    Aggregate rent observations from the given date onward.

    Keep the rows of the DataFrame ``f`` whose ``'quarter'`` value is greater
    than or equal to ``date``, group them by ``groupby_cols`` (a column name
    or a sequence of column names), and return a DataFrame with one row per
    group and the columns

    - the grouping columns
    - ``'territory'``, ``'region'``: taken from the first row of the group
    - ``'rent_count'``: sum of the group's rent counts
    - ``'rent_mean'``: mean rent weighted by rent count
    - ``'rent_geo_mean'``: geometric mean rent weighted by rent count

    A group with no non-null rent count gets NaN for all three rent columns,
    so that such groups can be told apart from groups with data.

    NOTES:
        - ``'quarter'`` and ``date`` are compared as given; no date parsing
          is done here
    """
    # A tuple passed to ``groupby`` is treated as ONE key by current pandas,
    # so always hand it a list of column names
    if isinstance(groupby_cols, str):
        keys = [groupby_cols]
    else:
        keys = list(groupby_cols)

    f = f[f['quarter'] >= date].copy()

    def my_agg(group):
        # min_count=1 keeps an all-null group as NaN instead of 0 (sum)
        # or 1 (product)
        total = group['rent_count'].sum(min_count=1)
        weights = group['rent_count']/total
        return pd.Series({
            'territory': group['territory'].iat[0],
            'region': group['region'].iat[0],
            'rent_count': total,
            'rent_mean': (group['rent_mean']*group['rent_count']).sum()/total,
            'rent_geo_mean': (group['rent_geo_mean']**weights).prod(min_count=1),
        })

    return f.groupby(keys).apply(my_agg).reset_index()

# Aggregate every quarter from 2016-12-01 onward, using the default grouping
# by rental area and number of bedrooms
agg_rents = aggregate_rents(f, '2016-12-01')
agg_rents

In [None]:
# Restrict the aggregated rents to the Canterbury region
cond = agg_rents['region'] == 'Canterbury'
a = agg_rents[cond].copy()

def hits(group):
    """
    Return a one-entry Series labelled ``'hit_frac'`` holding the fraction of
    rows in ``group`` whose ``'rent_count'`` is not null.
    """
    counts = group['rent_count']
    num_present = int(counts.notnull().sum())
    return pd.Series({'hit_frac': num_present/len(counts)})

# Fraction of rows with a non-null rent count, per number of bedrooms
a.groupby('#bedrooms').apply(hits).reset_index()

# Choose representative points for rental areas using property titles

In [None]:
# Rental area polygons written by the dissolve cell
path = DATA_DIR/'rental_areas.geojson'
ra = gpd.read_file(str(path))

# Property title geodata; this file is not produced by any visible cell
path = DATA_DIR/'property_titles.geojson'
t = gpd.read_file(str(path))
t.head()

In [None]:
# Spatially join each property title to the rental areas it intersects,
# timing the call with %time.
# NOTE(review): newer geopandas releases reportedly rename the ``op`` keyword
# to ``predicate`` -- confirm against the installed version.
%time f = gpd.sjoin(t[['geometry', 'fid']], ra, op='intersects')
f.head()

In [None]:
def pt(group):
    """
    Summarise one group of joined property titles as a single record.

    Return a Series with

    - ``'geometry'``: a representative point of the union of the group's
      geometries
    - ``'territory'``, ``'region'``: taken from the first row of the group
    """
    merged = so.unary_union(group['geometry'])
    return pd.Series({
        'geometry': merged.representative_point(),
        'territory': group['territory'].iat[0],
        'region': group['region'].iat[0],
    })

# One representative point per rental area, built from the property titles
# that were joined to it.
# NOTE(review): no CRS is passed to the GeoDataFrame constructor here --
# confirm downstream readers do not need one.
g = gpd.GeoDataFrame(f.groupby('rental_area').apply(pt).reset_index())

# Serialise the points as GeoJSON text
path = DATA_DIR/'rental_area_points.geojson'
with path.open('w') as tgt:
    tgt.write(g.to_json())

g.head()

In [None]:
# Inspect the representative points that fall in the Auckland region
g[g['region'] == 'Auckland']

# Use Mapzen to calculate time-distance matrices

In [35]:
# Base URLs of the hosted Mapzen routing and matrix services
MAPZEN_ROUTE_URL = 'https://valhalla.mapzen.com'
MAPZEN_MATRIX_URL = 'https://matrix.mapzen.com'
# API key: read from the MAPZEN_KEY environment variable when it is set, so
# that a credential does not have to be committed with the notebook. The
# original key remains the fallback so existing runs behave as before.
MAPZEN_KEY = os.environ.get('MAPZEN_KEY', 'valhalla-Mc6zgDA')
# Base URL of a server on this machine; not referenced by the visible cells
LOCAL_URL = 'http://localhost:8002'

def get_route(origin, destination, mode, datetime=None, url=MAPZEN_ROUTE_URL, key=MAPZEN_KEY):
    """
    Issue a GET request to the ``/route`` endpoint under the given URL using the given API key.
    Request a route from the origin to the destination, each given as a longitude-latitude pair.
    Return the (decoded) JSON response.

    Use the given mode of travel as the ``'costing'`` option and the given date-time string
    of the form ``'%Y-%m-%dT%H:%M'``, which defaults to the current local time.

    NOTES:
        - Raise an HTTP error if the request fails
    """
    when = datetime if datetime is not None else dt.datetime.now().strftime('%Y-%m-%dT%H:%M')
    waypoints = [{'lon': point[0], 'lat': point[1]} for point in (origin, destination)]

    # The request options travel in the JSON body; only the key is a query parameter
    response = requests.get(
        url + '/route',
        json={
            'locations': waypoints,
            'costing': mode,
            'date_time': {'type': 1, 'value': when},
        },
        params={'api_key': key},
    )

    # Raise an error if bad request
    response.raise_for_status()

    return response.json()

def get_matrix(origins, destinations, mode, datetime=None, url=MAPZEN_MATRIX_URL, key=MAPZEN_KEY):
    """
    Issue a GET request to the ``/sources_to_targets`` endpoint under the given URL using the given API key.
    Send the given origins (list of longitude-latitude pairs) as sources
    and the given destinations (list of longitude-latitude pairs) as targets.
    Return the (decoded) JSON response.

    Use the given mode of travel as the ``'costing'`` option and the given date-time string
    of the form ``'%Y-%m-%dT%H:%M'``, which defaults to the current local time.

    NOTES:
        - Raise an HTTP error if the request fails
    """
    def as_locations(pairs):
        # Convert longitude-latitude pairs to the dictionaries sent in the request
        return [{'lon': lon, 'lat': lat} for lon, lat in pairs]

    when = datetime if datetime is not None else dt.datetime.now().strftime('%Y-%m-%dT%H:%M')

    # The request options travel in the JSON body; only the key is a query parameter
    response = requests.get(
        url + '/sources_to_targets',
        json={
            'sources': as_locations(origins),
            'targets': as_locations(destinations),
            'costing': mode,
            'date_time': {'type': 1, 'value': when},
        },
        params={'api_key': key},
    )

    # Raise an error if bad request
    response.raise_for_status()

    return response.json()

def matrix_to_df(matrix, orig_names=None, dest_names=None):
    """
    Given a (decoded) JSON time-distance matrix of the form output by :func:``get_matrix``,
    a list of origin names (defaults to [0, 1, 2, etc.]),
    and a list of destination names (defaults to [0, 1, 2, etc.]), convert the matrix to a DataFrame with
    the columns:

    - ``'origin'``: one of ``orig_names``
    - ``'destination'``: one of ``dest_names``
    - ``'time'``: time from origin to destination
    - ``'distance'``: distance from origin to destination

    The result has one row for every source-target pair in the matrix, listed source by source.
    The origin and destination names should be listed in the same order as the 'sources' and 'targets'
    attributes of ``matrix``, respectively.
    """
    # Build DataFrame. ``matrix['sources_to_targets']`` holds one list per
    # source with one entry per target; flatten all of them, not just the
    # first entry of each list.
    columns = ['from_index', 'to_index', 'distance', 'time']
    rows = [
        [entry[c] for c in columns]
        for source_entries in matrix['sources_to_targets']
        for entry in source_entries
    ]
    f = pd.DataFrame(rows, columns=columns)

    def to_names(indices, names):
        # Map positional indices to names, or keep the indices if no names given
        if names is None:
            return indices
        return indices.map(dict(enumerate(names)))

    f['origin'] = to_names(f['from_index'], orig_names)
    f['destination'] = to_names(f['to_index'], dest_names)

    return f[['origin', 'destination', 'time', 'distance']].copy()
    
def collect_matrices(points, mode, datetime=None, chunk_size=49, url=MAPZEN_MATRIX_URL, key=MAPZEN_KEY):
    """
    Call :func:`get_matrix` repeatedly to build a time-distance table between all pairs of the given points.

    ``points`` is a (Geo)DataFrame with the columns ``'rental_area'`` (used as the origin and
    destination names) and ``'geometry'`` (geometries whose first coordinate pair is used).
    Every point is used as the single destination of a series of requests whose origins are
    all the points, split into chunks of at most ``chunk_size`` points each.
    Pass ``mode``, ``datetime``, ``url``, and ``key`` on to :func:`get_matrix`.

    Aggregate the results into one DataFrame of the form output by :func:``matrix_to_df``,
    sorted by origin and destination.
    Return an empty DataFrame with those columns if ``points`` is empty.

    NOTES:
        - If a request or its conversion fails, then the affected origin-destination pairs
          get NaN time and distance, and one summary warning is issued at the end
        - Keyboard interrupts are not caught, so a long run can be stopped
    """
    columns = ['origin', 'destination', 'time', 'distance']
    num_points = points.shape[0]
    if num_points == 0:
        return pd.DataFrame(columns=columns)

    # Chunk the origins once; the chunks are the same for every destination
    num_chunks = math.ceil(num_points/chunk_size)
    chunks = []
    for positions in np.array_split(np.arange(num_points), num_chunks):
        g = points.iloc[positions]
        origs = [geo.coords[0] for geo in g['geometry']]
        orig_names = g['rental_area'].values
        chunks.append((origs, orig_names))

    frames = []
    num_failed = 0
    last_error = None
    for __, row in points.iterrows():
        dests = [row['geometry'].coords[0]]
        ra = row['rental_area']
        dest_names = [ra]

        # Get matrix for each chunk to dests
        for origs, orig_names in chunks:
            try:
                j = get_matrix(origs, dests, mode=mode, datetime=datetime, url=url, key=key)
                df = matrix_to_df(j, orig_names, dest_names)
            except Exception as error:
                # Best effort: keep going and mark these pairs as unknown
                num_failed += 1
                last_error = error
                df = pd.DataFrame({
                    'origin': orig_names,
                    'destination': ra,
                    'time': np.nan,
                    'distance': np.nan,
                })
            frames.append(df)

    if num_failed:
        warnings.warn(
            '{} of {} matrix requests failed and were filled with NaN; last error: {!r}'.format(
                num_failed, len(frames), last_error))

    return pd.concat(frames).sort_values(['origin', 'destination'])


In [43]:
# Test some
# Longitude-latitude pairs for a single origin and destination
orig = [ 174.7799, -36.8719]
dest = [174.7868, -36.9560]
# Date-time string sent with the request. This binds the notebook-level name
# ``datetime`` to a string; the module itself is imported as ``dt``.
datetime = '2017-05-12T08:00'
#get_route(orig, dest, mode='multimodal', datetime=datetime)
# One-origin, one-destination matrix request
get_matrix([orig], [dest], mode='auto', datetime=datetime)

{'sources': [[{'lat': -36.871899, 'lon': 174.779907}]],
 'sources_to_targets': [[{'distance': 11.714,
    'from_index': 0,
    'time': 805,
    'to_index': 0}]],
 'targets': [[{'lat': -36.956001, 'lon': 174.786804}]],
 'units': 'km'}

In [28]:
# Load the representative points and keep only those in the Auckland region
path = DATA_DIR/'rental_area_points.geojson'
f = gpd.read_file(str(path))
f = f[f['region'] == 'Auckland'].copy()
f.head()

Unnamed: 0,geometry,id,region,rental_area,territory
2,POINT (174.7136142163085 -36.71740768433499),2,Auckland,Albany,North Shore City
6,POINT (174.689014050193 -36.89612845095292),6,Auckland,Avondale,Auckland City
10,POINT (174.7513445789724 -36.89353312125105),10,Auckland,Balmoral,Auckland City
13,POINT (174.6988893987124 -36.79717233861077),13,Auckland,Beachhaven/Birkdale,North Shore City
18,POINT (174.7059674167557 -36.91481011710743),18,Auckland,Blockhouse Bay/New Windsor,Auckland City


In [44]:
# Multimodal time-distance matrices not implemented by Mapzen

datetime = '2017-05-11T08:00'
for mode in ['pedestrian', 'bicycle', 'auto']:
    %time m = collect_matrices(f, mode=mode, datetime=datetime)
    print(m.head())
    path = DATA_DIR/'auckland'/'{!s}_matrix.csv'.format(mode)
    m.to_csv(str(path), index=False)

CPU times: user 3.77 s, sys: 144 ms, total: 3.92 s
Wall time: 8min 27s
   origin                 destination     time  distance
0  Albany                      Albany      0.0     0.000
0  Albany                    Avondale  19269.0    28.841
0  Albany                    Balmoral  16611.0    25.077
0  Albany         Beachhaven/Birkdale   8438.0    11.914
0  Albany  Blockhouse Bay/New Windsor  20089.0    29.993
CPU times: user 3.89 s, sys: 228 ms, total: 4.12 s
Wall time: 7min 58s
   origin                 destination    time  distance
0  Albany                      Albany     0.0     0.000
0  Albany                    Avondale  4778.0    30.902
0  Albany                    Balmoral  5616.0    36.069
0  Albany         Beachhaven/Birkdale  1975.0    12.630
0  Albany  Blockhouse Bay/New Windsor  5222.0    33.881
CPU times: user 4.12 s, sys: 192 ms, total: 4.32 s
Wall time: 5min
   origin                 destination    time  distance
0  Albany                      Albany     0.0     0.000
0