In [123]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from geopandas import GeoDataFrame
from shapely.geometry import shape, mapping, Point
import copy
import json
import math

# Import data

In [124]:
# Import Borough json
with open('../data/raw/misc/london_boroughs.json') as json_file:
    borough_coordinates = json.load(json_file)
# Import Ward json
with open('../data/raw/misc/london_wards_new.json') as json_file:
    ward_coordinates = json.load(json_file)

In [125]:
# Collect borough names
boroughs = []
for f in tqdm(borough_coordinates['features']):
    boroughs.append(f['properties']['name'])
    
# Collect ward names
wards = []
for f in tqdm(ward_coordinates['features']):
    wards.append(f['properties']['cmwd11nm'])

100%|██████████| 33/33 [00:00<00:00, 63725.61it/s]
100%|██████████| 628/628 [00:00<00:00, 99180.02it/s]


# Construct geopandas dataframe

## Boroughs

In [126]:
# Convert dictionary to dataframe
boroughs_df = pd.DataFrame.from_dict(borough_coordinates['features'])
# Remove unnecessary columns to avoid overlap with unpacked columns (see next line)
boroughs_df = boroughs_df.drop(columns=['type','id'])
# Expand dictionary-like columns
boroughs_df = boroughs_df.join(boroughs_df['properties'].apply(pd.Series))
# Drop unpacked column and unnecessary columns
boroughs_df = boroughs_df.drop(columns=['properties','id','code','inner_statistical'])
# Rename borough column
boroughs_df = boroughs_df.rename(columns={'name':'borough'})
# Remove spaces from Borough names
boroughs_df['borough_new'] = boroughs_df['borough'].apply(lambda x: x.replace(' ',''))
# Convert geometry to shapely multipolygon
boroughs_df['geometry'] = boroughs_df['geometry'].apply(lambda x: shape(x))
# Convert dataframe to geopandas
crs = {'init': 'epsg:4326'}
boroughs_gdf = GeoDataFrame(boroughs_df, crs=crs, geometry=boroughs_df['geometry'])

  return _prepare_from_string(" ".join(pjargs))


## Wards

In [127]:
# Convert dictionary to dataframe
wards_df = pd.DataFrame.from_dict(ward_coordinates['features'])
# Remove unnecessary columns to avoid overlap with unpacked columns (see next line)
wards_df = wards_df.drop(columns=['type'])
# Expand dictionary-like columns
wards_df = wards_df.join(wards_df['properties'].apply(pd.Series))
# Drop unpacked column and unnecessary columns
wards_df = wards_df.drop(columns=['properties','objectid','cmwd11nmw','lad11nmw','st_areasha','st_lengths'])
# Rename some columns
wards_df = wards_df.rename(columns={'cmwd11nm':'ward','lad11nm':'borough','cmwd11cd':'ward_code','lad11cd':'borough_code'})
# Homogenise borough names
wards_df = wards_df.replace('City and County of the City of London','City of London')
wards_df = wards_df.replace('City of Westminster','Westminster')

# Create new ward name columns to fix pathological cases
wards_df.loc[:,'ward_fixed'] = wards_df['ward']

# Fix ward names that are separated by a comma
comma_sep_ward_borough_pairs = wards_df[wards_df.ward.str.contains(',')][['ward','borough']].values
for w,b in comma_sep_ward_borough_pairs:
    wards_df.loc[(wards_df.ward==w) & (wards_df.borough==b),'ward_fixed'] = str(w).split(',')[0]
    
# Duplicate ward-borough pairs
duplicate_ward_borough = wards_df[wards_df['ward_fixed'].isin(wards_df['ward_fixed'][wards_df['ward_fixed'].duplicated()])][['ward_fixed','borough']].values

# Create new ward and borough name columns without spaces
wards_df.loc[:,'ward_new'] = wards_df['ward_fixed'].apply(lambda x: x.replace(' ',''))
wards_df.loc[:,'borough_new'] = wards_df['borough'].apply(lambda x: x.replace(' ',''))


# Loop over duplicates and rename them so they are unique across all boroughs
for w, b in duplicate_ward_borough:
    wards_df.loc[(wards_df.ward_fixed==w) & (wards_df.borough==b),'ward_new'] = str(w.replace(' ',''))+'_'+str(b.replace(' ',''))
    
# Convert geometry to shapely multipolygon
wards_df['geometry'] = wards_df['geometry'].apply(lambda x: shape(x))


# Convert dataframe to geopandas
crs = {'init': 'epsg:4326'}
wards_gdf = GeoDataFrame(wards_df, crs=crs, geometry=wards_df['geometry'])

In [128]:
# wards_df[wards_df.ward.str.contains(',')][['ward','borough','ward_fixed']].values

# Extract centroids

In [129]:
boroughs_gdf['lon'] = boroughs_gdf['geometry'].apply(lambda p: p.centroid.x)
boroughs_gdf['lat'] = boroughs_gdf['geometry'].apply(lambda p: p.centroid.y)

wards_gdf['lon'] = wards_gdf['geometry'].apply(lambda p: p.centroid.x)
wards_gdf['lat'] = wards_gdf['geometry'].apply(lambda p: p.centroid.y)

# Export data to file

In [130]:
boroughs_gdf = boroughs_gdf.sort_values(by='borough')
boroughs_gdf.to_file("../data/input/misc/borough_coordinates.json", driver='GeoJSON')

wards_gdf = wards_gdf.sort_values(by=['borough','ward'])
wards_gdf.to_file("../data/input/misc/ward_coordinates.json", driver='GeoJSON')

# Extract conceptual radii of Boroughs

This applies only when Boroughs are the origins and destinations

# Construct cost matrix

In [133]:
# Get new ward,borough names
wards_no_spaces = np.sort(wards_gdf['ward_new'].unique())
boroughs_no_spaces = np.sort(wards_gdf['borough_new'].unique())

# Initialise empty cost dataframe
cost_matrix = pd.DataFrame(0, index=wards_no_spaces, columns=np.sort(boroughs_no_spaces))
# Sort dataframe by index
cost_matrix = cost_matrix.sort_index(axis=1).sort_index(axis=0)

## Case 1: Origins != Destinations

In [134]:
# Loop over cost matrix rows
for origin,row in tqdm(cost_matrix.iterrows(),total=cost_matrix.shape[0]):
    # Compute distance between centroids for entries off the diagonal
    for destination in boroughs_no_spaces:
        # Store origin and destination geometry centroids
        origin_centroid = wards_gdf[wards_gdf['ward_new']==origin].geometry.centroid.values[0]
        destination_centroid = boroughs_gdf[boroughs_gdf['borough_new']==destination].geometry.centroid.values[0]
        # Store distance between origin and destination centroids
        cost_matrix.loc[origin,destination] = math.sqrt(origin_centroid.distance(destination_centroid))
        

100%|██████████| 628/628 [01:26<00:00,  7.25it/s]


## Case 2: Origins == Destinations

In [101]:
# # Loop over Multipolygons
# for i,row in tqdm(boroughs_gdf.iterrows(),total=boroughs_gdf.shape[0]):
#     # Initialise maximum,minimum distance
#     max_dist = 0
#     min_dist = 0
#     # Loop over polygons in Multipolygon
#     for polygon in row.geometry:
#         # Loop over points in polygon (-1 prevents duplicating the first vertex)
#         for point in polygon.exterior.coords[:-1]:
#             # Compute distance between centroid and point in polygon
#             dist = math.sqrt(row.geometry.centroid.distance(Point(point)))
#             # Update maximum distance if current distance is larger
#             if dist > max_dist:
#                 max_dist = dist
#             if dist < max_dist:
#                 min_dist = dist
#     # Append maximum distance between centroid and polygon and divide by sqrt(2) 
#     # to approximate radius of cirle surrounding polygon
#     boroughs_gdf.at[i,'max_radius'] = max_dist/math.sqrt(2)
#     boroughs_gdf.at[i,'min_radius'] = min_dist/math.sqrt(2)
#     boroughs_gdf.at[i,'avg_radius'] = (max_dist+min_dist)/2
# 
# # Loop over cost matrix rows
# for origin,row in tqdm(cost_matrix.iterrows(),total=cost_matrix.shape[0]):
#     # Compute distance between centroids for entries off the diagonal
#     for destination in boroughs_no_spaces:
#         # Computations in the case of origin = destination have been completed - so avoid them
#         if destination != origin:
#             # Store origin and destination geometry centroids
#             origin_centroid = boroughs_gdf[boroughs_gdf['name']==origin].geometry.centroid.values[0]
#             destination_centroid = boroughs_gdf[boroughs_gdf['name']==destination].geometry.centroid.values[0]
#             # Store distance between origin and destination centroids
#             cost_matrix.loc[origin,destination] = math.sqrt(origin_centroid.distance(destination_centroid))
#         else:
#             # Fill in diagonal with radii values
# #             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['max_radius'].values[0]
# #             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['min_radius'].values[0]
#             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['avg_radius'].values[0]
        

# Export cost matrix and Borough names as dataframe and numpy array

In [135]:
cost_matrix/np.sum(cost_matrix)

Unnamed: 0,BarkingandDagenham,Barnet,Bexley,Brent,Bromley,Camden,CityofLondon,Croydon,Ealing,Enfield,...,Merton,Newham,Redbridge,RichmonduponThames,Southwark,Sutton,TowerHamlets,WalthamForest,Wandsworth,Westminster
AbbeyRoad,0.001749,0.001069,0.001759,0.001114,0.001708,0.000636,0.001251,0.001578,0.001331,0.001358,...,0.001350,0.001691,0.001695,0.001382,0.001428,0.001478,0.001523,0.001548,0.001166,0.000689
AbbeyWood,0.000770,0.002136,0.000613,0.002278,0.001169,0.002173,0.001917,0.001726,0.002270,0.001798,...,0.002150,0.001077,0.001076,0.002239,0.001787,0.002003,0.001566,0.001498,0.002214,0.002206
Abbey_BarkingandDagenham,0.000740,0.001977,0.000983,0.002152,0.001326,0.002003,0.001745,0.001750,0.002175,0.001578,...,0.002095,0.000758,0.000714,0.002170,0.001677,0.001984,0.001367,0.001200,0.002125,0.002060
Abbey_Merton,0.001853,0.001631,0.001778,0.001479,0.001598,0.001525,0.001582,0.001216,0.001430,0.001798,...,0.000327,0.001841,0.001875,0.001209,0.001486,0.000824,0.001735,0.001859,0.000780,0.001355
Abingdon,0.001812,0.001256,0.001792,0.001120,0.001699,0.001045,0.001377,0.001489,0.001254,0.001540,...,0.001117,0.001773,0.001785,0.001221,0.001454,0.001310,0.001623,0.001688,0.000867,0.000842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WoolwichRiverside,0.000975,0.001951,0.000943,0.002092,0.001117,0.001922,0.001610,0.001556,0.002108,0.001625,...,0.001947,0.000725,0.001033,0.002076,0.001474,0.001828,0.001215,0.001283,0.001982,0.001946
WorcesterPark,0.001984,0.001763,0.001897,0.001560,0.001715,0.001763,0.001856,0.001350,0.001415,0.001959,...,0.000814,0.002022,0.002026,0.001090,0.001759,0.000854,0.001967,0.002044,0.001171,0.001634
WormholtandWhiteCity,0.001915,0.001195,0.001901,0.000864,0.001826,0.001229,0.001612,0.001640,0.001037,0.001605,...,0.001255,0.001916,0.001893,0.001087,0.001687,0.001427,0.001811,0.001812,0.001133,0.001169
Yeading,0.002266,0.001627,0.002245,0.001300,0.002197,0.001992,0.002294,0.002068,0.000819,0.002024,...,0.001792,0.002381,0.002280,0.001140,0.002320,0.001834,0.002389,0.002285,0.001867,0.002005


In [136]:
# Export to csv
cost_matrix.to_csv('../data/input/commuter/cost_matrix.csv')
# Export to txt
np.savetxt('../data/input/commuter/cost_matrix.txt',cost_matrix.to_numpy())

# Sanity check with retail cost matrix

In [27]:
retail_cost_matrix = np.loadtxt('../data/input/retail/cost_matrix.txt')

In [None]:
pd.DataFrame(retail_cost_matrix)