In [196]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from geopandas import GeoDataFrame
from shapely.geometry import shape, mapping, Point
import copy
import json
import math

# Import data

In [244]:
# Import Borough json
with open('../data/raw/misc/london_boroughs.json') as json_file:
    borough_coordinates = json.load(json_file)
# Import Ward json
with open('../data/raw/misc/london_wards.json') as json_file:
    ward_coordinates = json.load(json_file)
with open('../data/input/misc/ward_coordinates_new.json') as json_file:
    ward_coordinates2 = json.load(json_file)

In [245]:
# Collect borough names
boroughs = []
for f in tqdm(borough_coordinates['features']):
    boroughs.append(f['properties']['name'])
    
# Collect ward names
wards = []
wards2 = []
for f in tqdm(ward_coordinates['features']):
    wards.append(f['properties']['NAME'])
for f in tqdm(ward_coordinates2['features']):
    wards2.append(f['properties']['ward'])

100%|██████████| 33/33 [00:00<00:00, 17647.84it/s]
100%|██████████| 657/657 [00:00<00:00, 220382.10it/s]
100%|██████████| 628/628 [00:00<00:00, 231237.20it/s]


# Construct geopandas dataframe

## Boroughs

In [246]:
# Convert dictionary to dataframe
boroughs_df = pd.DataFrame.from_dict(borough_coordinates['features'])
# Remove unnecessary columns to avoid overlap with unpacked columns (see next line)
boroughs_df = boroughs_df.drop(columns=['type','id'])
# Expand dictionary-like columns
boroughs_df = boroughs_df.join(boroughs_df['properties'].apply(pd.Series))
# Drop unpacked column and unnecessary columns
boroughs_df = boroughs_df.drop(columns=['properties','id','inner_statistical'])
# Rename borough column
boroughs_df = boroughs_df.rename(columns={'name':'borough','code':'borough_code'})
# Remove spaces from Borough names
boroughs_df['borough_new'] = boroughs_df['borough'].apply(lambda x: x.replace(' ',''))
# Convert geometry to shapely multipolygon
boroughs_df['geometry'] = boroughs_df['geometry'].apply(lambda x: shape(x))
# Convert dataframe to geopandas
crs = {'init': 'epsg:4326'}
boroughs_gdf = GeoDataFrame(boroughs_df, crs=crs, geometry=boroughs_df['geometry'])

## Wards

In [232]:
# Convert dictionary to dataframe
wards_df = pd.DataFrame.from_dict(ward_coordinates['features'])
# Remove unnecessary columns to avoid overlap with unpacked columns (see next line)
wards_df = wards_df.drop(columns=['type'])
# Expand dictionary-like columns
wards_df = wards_df.join(wards_df['properties'].apply(pd.Series))
# Drop unpacked column and unnecessary columns
wards_df = wards_df.drop(columns=['properties','NONLD_AREA','HECTARES'])
# Rename some columns
wards_df = wards_df.rename(columns={'NAME':'ward','DISTRICT':'borough','GSS_CODE':'new_ward_code','LAGSSCODE':'borough_code'})
# Homogenise borough names
wards_df = wards_df.replace('City and County of the City of London','City of London')
wards_df = wards_df.replace('City of Westminster','Westminster')

# Create new ward name columns to fix pathological cases
wards_df.loc[:,'ward_fixed'] = wards_df['ward']

# Fix ward names that are separated by a comma
comma_sep_ward_borough_pairs = wards_df[wards_df.ward.str.contains(',')][['ward','borough']].values
for w,b in comma_sep_ward_borough_pairs:
    wards_df.loc[(wards_df.ward==w) & (wards_df.borough==b),'ward_fixed'] = str(w).split(',')[0]
    
# Duplicate ward-borough pairs
duplicate_ward_borough = wards_df[wards_df['ward_fixed'].isin(wards_df['ward_fixed'][wards_df['ward_fixed'].duplicated()])][['ward_fixed','borough']].values

# Create new ward and borough name columns without spaces
wards_df.loc[:,'ward_new'] = wards_df['ward_fixed'].apply(lambda x: x.replace(' ',''))
wards_df.loc[:,'borough_new'] = wards_df['borough'].apply(lambda x: x.replace(' ',''))


# Loop over duplicates and rename them so they are unique across all boroughs
for w, b in duplicate_ward_borough:
    wards_df.loc[(wards_df.ward_fixed==w) & (wards_df.borough==b),'ward_new'] = str(w.replace(' ',''))+'_'+str(b.replace(' ',''))
    
# Convert geometry to shapely multipolygon
wards_df['geometry'] = wards_df['geometry'].apply(lambda x: shape(x))


# Convert dataframe to geopandas
crs = {'init': 'epsg:4326'}
wards_gdf = GeoDataFrame(wards_df, crs=crs, geometry=wards_df['geometry'])

In [233]:
# Convert dictionary to dataframe
wards_df2 = pd.DataFrame.from_dict(ward_coordinates2['features'])

# Expand dictionary-like columns
wards_df2 = wards_df2.join(wards_df2['properties'].apply(pd.Series))

# Drop unpacked column and unnecessary columns
wards_df2 = wards_df2.drop(columns=['properties','type'])

# Convert geometry to shapely multipolygon
wards_df2['geometry'] = wards_df2['geometry'].apply(lambda x: shape(x))

# Convert dataframe to geopandas
crs = {'init': 'epsg:4326'}
wards_gdf2 = GeoDataFrame(wards_df2, crs=crs, geometry=wards_df2['geometry'])

# Extract centroids

In [234]:
boroughs_gdf['lon'] = boroughs_gdf['geometry'].apply(lambda p: p.centroid.x)
boroughs_gdf['lat'] = boroughs_gdf['geometry'].apply(lambda p: p.centroid.y)

wards_gdf['lon'] = wards_gdf['geometry'].apply(lambda p: p.centroid.x)
wards_gdf['lat'] = wards_gdf['geometry'].apply(lambda p: p.centroid.y)

wards_gdf2['lon'] = wards_gdf2['geometry'].apply(lambda p: p.centroid.x)
wards_gdf2['lat'] = wards_gdf2['geometry'].apply(lambda p: p.centroid.y)

In [285]:
# # Merge two dataframes to get new ids
# wards_gdf_new = pd.merge(wards_gdf2, wards_gdf[['ward_new','new_ward_code']], on=['ward_new'],  how='left')

In [299]:
wards_gdf[wards_gdf['ward'].isin(null_wards2)].sort_values('ward')

Unnamed: 0,geometry,ward,new_ward_code,borough,borough_code,ward_fixed,ward_new,borough_new,lon,lat
354,"MULTIPOLYGON (((0.06252 51.60708, 0.06254 51.6...",Bridge,E05011236,Redbridge,E09000026,Bridge,Bridge_Redbridge,Redbridge,0.052084,51.606039
647,"MULTIPOLYGON (((-0.08776 51.50788, -0.08810 51...",Bridge,E05009294,City of London,E09000001,Bridge,Bridge_CityofLondon,CityofLondon,-0.086107,51.509662
207,"MULTIPOLYGON (((-0.15166 51.62278, -0.15182 51...",Brunswick Park,E05000043,Barnet,E09000003,Brunswick Park,BrunswickPark,Barnet,-0.14488,51.628592
316,"MULTIPOLYGON (((-0.07496 51.64829, -0.07582 51...",Grange,E05000200,Enfield,E09000010,Grange,Grange,Enfield,-0.091438,51.644188
392,"MULTIPOLYGON (((-0.35303 51.44882, -0.35378 51...",Heathfield,E05000523,Richmond upon Thames,E09000027,Heathfield,Heathfield,RichmonduponThames,-0.371841,51.448335
335,"MULTIPOLYGON (((-0.04823 51.57190, -0.04622 51...",Lea Bridge,E05000603,Waltham Forest,E09000031,Lea Bridge,LeaBridge_WalthamForest,WalthamForest,-0.032471,51.568408
562,"MULTIPOLYGON (((-0.05736 51.56183, -0.05651 51...",Lea Bridge,E05009380,Hackney,E09000012,Lea Bridge,LeaBridge_Hackney,Hackney,-0.050329,51.558767
461,"MULTIPOLYGON (((-0.19645 51.48721, -0.19762 51...",North End,E05000258,Hammersmith and Fulham,E09000013,North End,NorthEnd,HammersmithandFulham,-0.205911,51.48787
409,"MULTIPOLYGON (((-0.20400 51.42862, -0.20418 51...",Village,E05000472,Merton,E09000024,Village,Village_Merton,Merton,-0.231471,51.427269
618,"MULTIPOLYGON (((0.14790 51.54160, 0.14821 51.5...",Village,E05000041,Barking and Dagenham,E09000002,Village,Village_BarkingandDagenham,BarkingandDagenham,0.166279,51.539049


# Export data to file

In [278]:
# wards_gdf_new = wards_gdf_new.drop(columns=['new_ward_code_y'])
# wards_gdf_new = wards_gdf_new.rename(columns = {"new_ward_code_x":"new_ward_code"})

In [274]:
null_wards2 = wards_gdf_new[wards_gdf_new['new_ward_code_x'].isna()]['ward_new'].values
print(len(null_wards2))

85


In [275]:
item = 0

In [277]:
wards_gdf_new[wards_gdf_new['new_ward_code'].isna()].sort_values('ward')[item:]

KeyError: 'new_ward_code'

In [226]:
wards_gdf2[wards_gdf2['ward_new'].isin(null_wards2)].sort_values('ward')[item:]

Unnamed: 0,geometry,ward_code,ward,borough_code,borough,ward_fixed,ward_new,borough_new,lon,lat
124,"MULTIPOLYGON (((-0.07092 51.38460, -0.07117 51...",E36007175,Addiscombe,E09000008,Croydon,Addiscombe,Addiscombe,Croydon,-0.081637,51.380755
125,"MULTIPOLYGON (((-0.04177 51.38929, -0.04199 51...",E36007176,Ashburton,E09000008,Croydon,Ashburton,Ashburton,Croydon,-0.058288,51.384209
551,"MULTIPOLYGON (((-0.05730 51.53443, -0.05682 51...",E36007602,Bethnal Green North,E09000030,Tower Hamlets,Bethnal Green North,BethnalGreenNorth,TowerHamlets,-0.057485,51.531230
552,"MULTIPOLYGON (((-0.04623 51.52187, -0.04671 51...",E36007603,Bethnal Green South,E09000030,Tower Hamlets,Bethnal Green South,BethnalGreenSouth,TowerHamlets,-0.056620,51.523415
40,"MULTIPOLYGON (((0.10301 51.45059, 0.10246 51.4...",E36007095,Blackfen and Lamorbey,E09000004,Bexley,Blackfen and Lamorbey,BlackfenandLamorbey,Bexley,0.091728,51.445509
...,...,...,...,...,...,...,...,...,...,...
531,"MULTIPOLYGON (((-0.06469 51.47398, -0.06388 51...",E36007582,The Lane,E09000028,Southwark,The Lane,TheLane,Southwark,-0.068914,51.468295
144,"MULTIPOLYGON (((-0.07855 51.41985, -0.07846 51...",E36007195,Upper Norwood,E09000008,Croydon,Upper Norwood,UpperNorwood,Croydon,-0.098738,51.414625
532,"MULTIPOLYGON (((-0.08295 51.45853, -0.08276 51...",E36007583,Village,E09000028,Southwark,Village,Village_Southwark,Southwark,-0.086746,51.449780
493,"MULTIPOLYGON (((0.04104 51.58427, 0.04107 51.5...",E36007544,Wanstead,E09000026,Redbridge,Wanstead,Wanstead,Redbridge,0.032815,51.567211


In [227]:
wards_gdf[wards_gdf['ward_new'].isin(null_wards)].sort_values('ward')[item:]

Unnamed: 0,geometry,ward,new_ward_code,borough,borough_code,ward_fixed,ward_new,borough_new,lon,lat
38,"MULTIPOLYGON (((-0.06379 51.37556, -0.06415 51...",Addiscombe East,E05011462,Croydon,E09000008,Addiscombe East,AddiscombeEast,Croydon,-0.069592,51.380178
37,"MULTIPOLYGON (((-0.08093 51.37511, -0.08124 51...",Addiscombe West,E05011463,Croydon,E09000008,Addiscombe West,AddiscombeWest,Croydon,-0.084309,51.380117
527,"MULTIPOLYGON (((-0.05621 51.52096, -0.05609 51...",Bethnal Green,E05009317,Tower Hamlets,E09000030,Bethnal Green,BethnalGreen,TowerHamlets,-0.047840,51.526384
298,"MULTIPOLYGON (((0.14258 51.44832, 0.14128 51.4...",Bexleyheath,E05011219,Bexley,E09000004,Bexleyheath,Bexleyheath,Bexley,0.148354,51.459035
648,"MULTIPOLYGON (((-0.08548 51.50860, -0.08548 51...",Billingsgate,E05009291,City of London,E09000001,Billingsgate,Billingsgate,CityofLondon,-0.082661,51.509180
...,...,...,...,...,...,...,...,...,...,...
649,"MULTIPOLYGON (((-0.08839 51.51279, -0.08858 51...",Walbrook,E05009312,City of London,E09000001,Walbrook,Walbrook,CityofLondon,-0.089061,51.513620
349,"MULTIPOLYGON (((0.02034 51.55627, 0.02007 51.5...",Wanstead Park,E05011254,Redbridge,E09000026,Wanstead Park,WansteadPark,Redbridge,0.035164,51.564773
364,"MULTIPOLYGON (((0.01913 51.56985, 0.01906 51.5...",Wanstead Village,E05011255,Redbridge,E09000026,Wanstead Village,WansteadVillage,Redbridge,0.026852,51.579009
305,"MULTIPOLYGON (((0.14325 51.47015, 0.14301 51.4...",West Heath,E05011233,Bexley,E09000004,West Heath,WestHeath,Bexley,0.130046,51.475755


In [228]:
# # boroughs_gdf = boroughs_gdf.sort_values(by='borough')
# # boroughs_gdf.to_file("../data/input/misc/borough_coordinates.json", driver='GeoJSON')

# wards_gdf_new = wards_gdf_new.sort_values(by=['borough','ward'])
# wards_gdf_new.to_file("../data/input/misc/ward_coordinates_new.json", driver='GeoJSON')

# Extract conceptual radii of Boroughs

This applies only when Boroughs are the origins and destinations

# Construct cost matrix

In [133]:
# Get new ward,borough names
wards_no_spaces = np.sort(wards_gdf['ward_new'].unique())
boroughs_no_spaces = np.sort(wards_gdf['borough_new'].unique())

# Initialise empty cost dataframe
cost_matrix = pd.DataFrame(0, index=wards_no_spaces, columns=np.sort(boroughs_no_spaces))
# Sort dataframe by index
cost_matrix = cost_matrix.sort_index(axis=1).sort_index(axis=0)

## Case 1: Origins != Destinations

In [134]:
# Loop over cost matrix rows
for origin,row in tqdm(cost_matrix.iterrows(),total=cost_matrix.shape[0]):
    # Compute distance between centroids for entries off the diagonal
    for destination in boroughs_no_spaces:
        # Store origin and destination geometry centroids
        origin_centroid = wards_gdf[wards_gdf['ward_new']==origin].geometry.centroid.values[0]
        destination_centroid = boroughs_gdf[boroughs_gdf['borough_new']==destination].geometry.centroid.values[0]
        # Store distance between origin and destination centroids
        cost_matrix.loc[origin,destination] = math.sqrt(origin_centroid.distance(destination_centroid))
        

100%|██████████| 628/628 [01:26<00:00,  7.25it/s]


## Case 2: Origins == Destinations

In [101]:
# # Loop over Multipolygons
# for i,row in tqdm(boroughs_gdf.iterrows(),total=boroughs_gdf.shape[0]):
#     # Initialise maximum,minimum distance
#     max_dist = 0
#     min_dist = 0
#     # Loop over polygons in Multipolygon
#     for polygon in row.geometry:
#         # Loop over points in polygon (-1 prevents duplicating the first vertex)
#         for point in polygon.exterior.coords[:-1]:
#             # Compute distance between centroid and point in polygon
#             dist = math.sqrt(row.geometry.centroid.distance(Point(point)))
#             # Update maximum distance if current distance is larger
#             if dist > max_dist:
#                 max_dist = dist
#             if dist < max_dist:
#                 min_dist = dist
#     # Append maximum distance between centroid and polygon and divide by sqrt(2) 
#     # to approximate radius of cirle surrounding polygon
#     boroughs_gdf.at[i,'max_radius'] = max_dist/math.sqrt(2)
#     boroughs_gdf.at[i,'min_radius'] = min_dist/math.sqrt(2)
#     boroughs_gdf.at[i,'avg_radius'] = (max_dist+min_dist)/2
# 
# # Loop over cost matrix rows
# for origin,row in tqdm(cost_matrix.iterrows(),total=cost_matrix.shape[0]):
#     # Compute distance between centroids for entries off the diagonal
#     for destination in boroughs_no_spaces:
#         # Computations in the case of origin = destination have been completed - so avoid them
#         if destination != origin:
#             # Store origin and destination geometry centroids
#             origin_centroid = boroughs_gdf[boroughs_gdf['name']==origin].geometry.centroid.values[0]
#             destination_centroid = boroughs_gdf[boroughs_gdf['name']==destination].geometry.centroid.values[0]
#             # Store distance between origin and destination centroids
#             cost_matrix.loc[origin,destination] = math.sqrt(origin_centroid.distance(destination_centroid))
#         else:
#             # Fill in diagonal with radii values
# #             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['max_radius'].values[0]
# #             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['min_radius'].values[0]
#             cost_matrix.loc[origin,destination] = boroughs_gdf[boroughs_gdf['name']==origin]['avg_radius'].values[0]
        

# Export cost matrix and Borough names as dataframe and numpy array

In [135]:
cost_matrix/np.sum(cost_matrix)

Unnamed: 0,BarkingandDagenham,Barnet,Bexley,Brent,Bromley,Camden,CityofLondon,Croydon,Ealing,Enfield,...,Merton,Newham,Redbridge,RichmonduponThames,Southwark,Sutton,TowerHamlets,WalthamForest,Wandsworth,Westminster
AbbeyRoad,0.001749,0.001069,0.001759,0.001114,0.001708,0.000636,0.001251,0.001578,0.001331,0.001358,...,0.001350,0.001691,0.001695,0.001382,0.001428,0.001478,0.001523,0.001548,0.001166,0.000689
AbbeyWood,0.000770,0.002136,0.000613,0.002278,0.001169,0.002173,0.001917,0.001726,0.002270,0.001798,...,0.002150,0.001077,0.001076,0.002239,0.001787,0.002003,0.001566,0.001498,0.002214,0.002206
Abbey_BarkingandDagenham,0.000740,0.001977,0.000983,0.002152,0.001326,0.002003,0.001745,0.001750,0.002175,0.001578,...,0.002095,0.000758,0.000714,0.002170,0.001677,0.001984,0.001367,0.001200,0.002125,0.002060
Abbey_Merton,0.001853,0.001631,0.001778,0.001479,0.001598,0.001525,0.001582,0.001216,0.001430,0.001798,...,0.000327,0.001841,0.001875,0.001209,0.001486,0.000824,0.001735,0.001859,0.000780,0.001355
Abingdon,0.001812,0.001256,0.001792,0.001120,0.001699,0.001045,0.001377,0.001489,0.001254,0.001540,...,0.001117,0.001773,0.001785,0.001221,0.001454,0.001310,0.001623,0.001688,0.000867,0.000842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WoolwichRiverside,0.000975,0.001951,0.000943,0.002092,0.001117,0.001922,0.001610,0.001556,0.002108,0.001625,...,0.001947,0.000725,0.001033,0.002076,0.001474,0.001828,0.001215,0.001283,0.001982,0.001946
WorcesterPark,0.001984,0.001763,0.001897,0.001560,0.001715,0.001763,0.001856,0.001350,0.001415,0.001959,...,0.000814,0.002022,0.002026,0.001090,0.001759,0.000854,0.001967,0.002044,0.001171,0.001634
WormholtandWhiteCity,0.001915,0.001195,0.001901,0.000864,0.001826,0.001229,0.001612,0.001640,0.001037,0.001605,...,0.001255,0.001916,0.001893,0.001087,0.001687,0.001427,0.001811,0.001812,0.001133,0.001169
Yeading,0.002266,0.001627,0.002245,0.001300,0.002197,0.001992,0.002294,0.002068,0.000819,0.002024,...,0.001792,0.002381,0.002280,0.001140,0.002320,0.001834,0.002389,0.002285,0.001867,0.002005


In [136]:
# Export to csv
cost_matrix.to_csv('../data/input/commuter/cost_matrix.csv')
# Export to txt
np.savetxt('../data/input/commuter/cost_matrix.txt',cost_matrix.to_numpy())

# Sanity check with retail cost matrix

In [27]:
retail_cost_matrix = np.loadtxt('../data/input/retail/cost_matrix.txt')

In [None]:
pd.DataFrame(retail_cost_matrix)