In [124]:
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import copy
import json
import matplotlib.pyplot as plt

# Import data

In [125]:
# Read the processed ward coordinates from their JSON file with geopandas
ward_coordinates_path = "../data/input/misc/ward_coordinates.json"
ward_coordinates = gpd.read_file(ward_coordinates_path)

In [126]:
# Read the two raw ward population tables (ward atlas estimates and 2001 ethnic-group counts)
atlas_csv_path = "../data/raw/misc/ward_atlas_population_estimates.csv"
ethnic_csv_path = "../data/raw/misc/ethnic-group-ward-2001.csv"
ward_atlas_pop, ward_ethnic_pop = (
    pd.read_csv(csv_path) for csv_path in (atlas_csv_path, ethnic_csv_path)
)

# Unify population and coordinate data

In [127]:
# Give both population tables the same 'borough' / 'ward' column names
# and order their rows by borough, then ward
sort_keys = ['borough', 'ward']
ward_ethnic_pop = (
    ward_ethnic_pop
    .rename(columns={'Borough':'borough','Ward':'ward'})
    .sort_values(by=sort_keys)
)
ward_atlas_pop = (
    ward_atlas_pop
    .rename(columns={'Borough':'borough','Names':'ward'})
    .sort_values(by=sort_keys)
)

# Choose dataset
dataset = 'ethnic' # 'ethnic' or 'atlas'
available_datasets = {'ethnic': ward_ethnic_pop, 'atlas': ward_atlas_pop}
if dataset not in available_datasets:
    raise ValueError(f"Dataset '{dataset}' not found.")
# Work on an independent copy so later column edits do not touch the source table
ward_pop = copy.deepcopy(available_datasets[dataset])

In [128]:
# Process ward and borough names
# Create new ward and borough name columns without spaces
def fix_str(x):
    """Return `x` with spaces and full stops removed and backticks turned into apostrophes."""
    # Single translation pass: None deletes a character, a string substitutes it
    substitutions = str.maketrans({' ': None, '.': None, '`': "'"})
    return x.translate(substitutions)

# Normalised name columns: ward names go through fix_str, borough names only lose their spaces
ward_pop.loc[:,'ward_new'] = ward_pop['ward'].apply(fix_str)
ward_pop.loc[:,'borough_new'] = ward_pop['borough'].apply(lambda name: name.replace(' ',''))

# Rows whose raw ward name occurs more than once in the table
is_duplicate_ward = ward_pop['ward'].duplicated(keep=False)
# Duplicate ward-borough pairs (not used below; kept so the pairs can be inspected)
duplicate_ward_borough = ward_pop.loc[is_duplicate_ward, ['ward','borough']].values

# Append the borough to every duplicated ward name so that the same ward name in
# different boroughs gets distinct labels. One vectorised assignment replaces the
# former per-row loop, which re-scanned the whole table for each duplicate.
ward_pop.loc[is_duplicate_ward, 'ward_new'] = (
    ward_pop.loc[is_duplicate_ward, 'ward_new']
    + '_'
    + ward_pop.loc[is_duplicate_ward, 'borough_new']
)

# Manual overrides for two ward names
# NOTE(review): presumably these align the names with those in ward_coordinates — confirm
weird_wards = ['HamPetershamandRichmondRiverside','StMargarets&NorthTwickenham']
weird_ward_replacements = ["Ham", "StMargaretsandNorthTwickenham"]
for old_name, new_name in zip(weird_wards, weird_ward_replacements):
    ward_pop.loc[ward_pop['ward_new'] == old_name, 'ward_new'] = new_name

In [129]:
# The normalised ward names in the json coordinates and the csv population table
# must be identical sets; report the offending names if they are not.
coordinate_wards = set(ward_coordinates.ward_new)
population_wards = set(ward_pop.ward_new)

# Set difference json wards - csv wards: for consistency this has to be empty
# (key=str keeps the sort well-defined even if a name is missing/NaN)
missing_from_population = sorted(coordinate_wards - population_wards, key=str)
assert len(missing_from_population) == 0, (
    f"Wards in coordinates but not in population data: {missing_from_population}"
)
# Set difference csv wards - json wards: for consistency this has to be empty
missing_from_coordinates = sorted(population_wards - coordinate_wards, key=str)
assert len(missing_from_coordinates) == 0, (
    f"Wards in population data but not in coordinates: {missing_from_coordinates}"
)

In [130]:
# Merge population with coordinate dataframes on the normalised ward and borough names
ward_population = pd.merge(ward_coordinates, ward_pop, on=['ward_new', 'borough_new']).sort_values('ward_new')

# pd.merge defaults to an inner join, so a row whose (ward_new, borough_new) pair has no
# partner in the other table is dropped without any error. Make such a loss visible.
wards_lost_in_merge = sorted(
    (set(ward_coordinates.ward_new) | set(ward_pop.ward_new)) - set(ward_population.ward_new),
    key=str,
)
assert len(wards_lost_in_merge) == 0, (
    f"Wards dropped by the merge (check 'borough_new' for mismatches): {wards_lost_in_merge}"
)

In [131]:
# Build the origin supply table: the normalised ward name becomes the 'origin' index,
# 'pop2001' becomes 'supply', and the coordinates are carried along unchanged
origin_supply = (
    copy.deepcopy(ward_population[['ward_new','pop2001','lon','lat']])
    .rename(columns={'ward_new': 'origin', 'pop2001': 'supply'})
    .set_index('origin')
    .sort_index()
)

# Export origin supply data to CSV and text files

In [134]:
# Show the table dimensions as (rows, columns); 'origin' is the index, so it is not counted as a column
origin_supply.shape

(628, 3)

In [132]:
# Write the full table (index 'origin' plus supply, lon, lat) to csv
origin_supply.to_csv('../data/input/commuter/origin_supply.csv')

# Pull out the individual arrays, then write each to its own txt file
supply_values = origin_supply['supply'].to_numpy()
location_values = origin_supply[['lon','lat']].to_numpy()
origin_names = origin_supply.index.to_numpy()

np.savetxt('../data/input/commuter/origin_supply.txt', supply_values)
np.savetxt('../data/input/commuter/origin_locations.txt', location_values)
# Origin names are strings, hence the explicit string format
np.savetxt('../data/input/commuter/origins.txt', origin_names, fmt="%s")