In [1]:
from timeit import default_timer as timer
import pandas as pd
import geopandas as gpd
import json

# Import Official Population Data

In [2]:
# Load the census-block geometries from the INEGI shapefile.
# The raw archive was downloaded from:
# http://internet.contenidos.inegi.org.mx/contenidos/Productos/prod_serv/contenidos/espanol/bvinegi/productos/geografia/marcogeo/889463674658_s.zip
blocks_path = '../data/Geoestadistica_2018/09_ciudaddemexico/conjunto de datos/09m.shp'

print('Import Census Blocks from Inegi')
start = timer()
blocks = gpd.read_file(blocks_path, encoding="utf-8")
end = timer()

# Report the wall-clock cost of the read, rounded to whole seconds.
print('Computing Time:', round(end - start), 'sec')

Import Census Blocks from Inegi
Computing Time: 4 sec


In [3]:
# Load the population table (a DBF file read through geopandas).
# The raw archive was downloaded from:
# https://www.inegi.org.mx/contenidos/programas/ccpv/2010/microdatos/iter/ageb_manzana/09_distrito_federal_2010_ageb_manzana_urbana_dbf.zip
pop_path = '../data/Poblacion/RESAGEBURB_09DBF10.dbf'

print('Import Population Counts from Inegi')
start = timer()
pop = gpd.read_file(pop_path)
end = timer()

# Report the wall-clock cost of the read, rounded to whole seconds.
print('Computing Time:', round(end - start), 'sec')

Import Population Counts from Inegi
Computing Time: 19 sec


In [4]:
start = timer()

# Attach the population count of every census block to its geometry.
# The two tables are joined on their five shared key columns; pd.merge's
# default inner join keeps only blocks that appear in both tables.
print('Merge')
census = pd.merge(
    blocks[['CVE_ENT', 'CVE_MUN', 'CVE_LOC', 'CVE_AGEB', 'CVE_MZA', 'CVEGEO', 'geometry']],
    pop[['ENTIDAD', 'MUN', 'LOC', 'AGEB', 'MZA', 'POBTOT']],
    left_on=['CVE_ENT', 'CVE_MUN', 'CVE_LOC', 'CVE_AGEB', 'CVE_MZA'],
    right_on=['ENTIDAD', 'MUN', 'LOC', 'AGEB', 'MZA'],
)[['CVEGEO', 'POBTOT', 'geometry']]

print('Sort blocks by population count and area')
# AREA is a temporary tie-breaker for the sort and is dropped right after.
# It is computed in the CRS the shapefile was read in, i.e. before the
# reprojection below. Keyword arguments (`columns=`) are used instead of a
# positional axis, which pandas >= 2.0 no longer accepts.
census = (
    census
    .assign(POBTOT=census['POBTOT'].astype(int), AREA=census.geometry.area)
    .sort_values(by=['POBTOT', 'AREA'])
    .reset_index(drop=True)
    .drop(columns='AREA')
    .rename(columns={'POBTOT': 'population'})
)

print('Project to wgs84')
# `epsg=4326` replaces the deprecated {'init': 'epsg:4326'} dict syntax.
census = census.to_crs(epsg=4326)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Merge
Sort blocks by population count and area
Project to wgs84
Computing Time: 6 sec


# Import Clusters

In [5]:
start = timer()

# Cluster table: one row per cluster with its id, population and geometry.
clusters = gpd.read_file('../data/clusters-cdmx.geojson')

# Lookup from a cluster id to the ids of the census blocks assigned to it.
with open('../data/cluster2blocks-cdmx.json') as mapping_file:
    cluster2blocks = json.load(mapping_file)

end = timer()
# Report the wall-clock cost of both reads, rounded to whole seconds.
print('Computing Time:', round(end - start), 'sec')

Computing Time: 9 sec


# Check Clusters Populations

In [6]:
start = timer()

print('Create dictionary converting each census block id (CVEGEO) to a cluster id')
# Invert {cluster id: [block ids]} into {block id: cluster id}.
# A flat comprehension avoids building a NaN-padded wide DataFrame just to
# stack it back. Iteration order matches the original, so if a block id were
# listed twice the last cluster still wins. Cluster ids stay exactly as they
# are keyed in `cluster2blocks`.
block2cluster = {
    block_id: cluster_id
    for cluster_id, block_ids in cluster2blocks.items()
    for block_id in block_ids
}

print('Map each census block id to a cluster id (some empty blocks are not allocated to a cluster)')
# Blocks missing from the mapping come back as NaN, hence the float step
# before the final integer cast.
cluster_ids = census['CVEGEO'].map(block2cluster).astype(float)
allocated = cluster_ids.notna()
# Keep only blocks that were allocated to a cluster. Filtering on the cluster
# column alone (rather than a blanket dropna) cannot discard rows because of
# a missing value in some unrelated column, and rebinding `census` avoids
# in-place mutation.
census = census.loc[allocated].assign(cluster=cluster_ids[allocated].astype(int))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Create dictionary converting each census block id (CVEGEO) to a cluster id
Map each census block id to a cluster id (some empty blocks are not allocated to a cluster)
Computing Time: 9 sec


In [7]:
# Consistency check: the block populations summed per cluster must equal the
# population stored for each cluster in `clusters`.
print('Number of People in Each Census Blocks Grouped By Cluster == Number of People in Each Cluster')
population_by_cluster = census.groupby('cluster')['population'].sum()
# Left as the cell's last expression so the notebook displays the result.
population_by_cluster.equals(clusters['population'])

Number of People in Each Census Blocks Grouped By Cluster == Number of People in Each Cluster


True