- The median block in the CDMX census has a population of 97 households.

- To preserve the privacy of Cuebiq users, we aggregate blocks that are too small by clustering them with their nearest neighbors, using a gravity-type assumption (**TODO**: state the assumption).

- Prior to doing so, we apply the preprocessing steps listed below (**TODO**: complete this sentence).


- We might want to check that income gaps are not too high

- Drop empty blocks for the clustering

- Beyond Parent / Children relationships, shapes do not intersect


- Take out the zeros

- Merge Children

- Go to parents

https://www.earthdatascience.org/workshops/gis-open-source-python/dissolve-polygons-in-python-geopandas-shapely/

In [1]:
import geopandas as gpd
import pandas as pd
from timeit import default_timer as timer
import multiprocessing as mp
import numpy as np
from shapely.ops import cascaded_union
import matplotlib.pyplot as plt
from functools import partial 
import seaborn as sns
import pickle

# Load Data

In [2]:
# Read the INEGI shapefile for CDMX into `census_blocks` and report the load time.
print('Import Shapefile CDMX (INEGI):')
start = timer()

shapefile_path = '../data/Geoestadistica_2018/09_ciudaddemexico/conjunto de datos/09m.shp'
census_blocks = gpd.read_file(shapefile_path, encoding="utf-8")

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

Import Shapefile CDMX (INEGI):
Computing Time: 4 sec


In [3]:
# Read the INEGI census attribute table (.dbf) for CDMX into `census_data` and report the load time.
print('Import Census CDMX (INEGI):')
start = timer()

census_dbf_path = '../data/Poblacion/RESAGEBURB_09DBF10.dbf'
census_data = gpd.read_file(census_dbf_path)

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

Import Census CDMX (INEGI):
Computing Time: 20 sec


In [4]:
start = timer()

# Merge Census Blocks and Data.
# The shapefile and the census table name the same five key components differently,
# so the join columns are listed once per side and reused below.
census_fields = ['POBTOT']
block_keys = ['CVE_ENT','CVE_MUN','CVE_LOC','CVE_AGEB','CVE_MZA']
data_keys = ['ENTIDAD','MUN','LOC','AGEB','MZA']
census = pd.merge(
    census_blocks[block_keys+['CVEGEO','geometry']],
    census_data[data_keys+census_fields],
    left_on=block_keys,
    right_on=data_keys)[['CVEGEO']+census_fields+['geometry']]
census['POBTOT'] = census['POBTOT'].astype(int)

# Remove Empty Blocks
census = census[census['POBTOT']>0].copy()

# Sort By Population Size and Area.
# AREA is computed on `census`, i.e. in the shapefile's own CRS, before reprojection.
# `axis` is passed by keyword: recent pandas no longer accepts it positionally.
census = pd.concat(
    [census,census.geometry.area.rename('AREA')],axis=1).sort_values(
    by=['POBTOT','AREA']).reset_index(drop=True)

# Project to wgs84 (EPSG code instead of the deprecated {'init': ...} dictionary)
census_wgs84 = census.to_crs(epsg=4326)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Computing Time: 6 sec


In [5]:
# Preview the first rows of the merged, reprojected census table (sorted by POBTOT, then AREA).
census_wgs84.head()

Unnamed: 0,CVEGEO,POBTOT,geometry,AREA
0,900200010580007,1,POLYGON ((-99.14500959994513 19.47020226011121...,198.914771
1,900500011294026,1,POLYGON ((-99.10358525021086 19.48651605024127...,201.473993
2,900500012837004,1,POLYGON ((-99.13612875532323 19.52221415056184...,213.305924
3,901700011384015,1,POLYGON ((-99.09960187994574 19.41345100010429...,222.296886
4,901000010328049,1,POLYGON ((-99.23764940038554 19.37966822993875...,336.779882


# Check Intersecting Shapes

In [6]:
start = timer()

# Find Blocks Containing Other Blocks: self spatial join, left side = parent, right side = child.
# Recent GeoPandas releases name the spatial-relation keyword `predicate`; older
# releases only know the legacy `op` keyword, so fall back to it when needed.
sjoin_kwargs = dict(lsuffix='parent', rsuffix='child')
try:
    contains = gpd.sjoin(census_wgs84, census_wgs84, predicate='contains', **sjoin_kwargs)
except TypeError:
    contains = gpd.sjoin(census_wgs84, census_wgs84, op='contains', **sjoin_kwargs)

# Remove Those Containing Themselves
contains = contains[(contains['CVEGEO_parent'] != contains['CVEGEO_child'])].copy()

# Sort By Pop Size (plain assignment instead of inplace=True)
contains = contains.sort_values(by=['POBTOT_parent','CVEGEO_parent','POBTOT_child'])

# List Children by Parent: the index of `contains` is the parent's row label in census_wgs84
parent2children = contains.groupby(contains.index)['index_child'].apply(list)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Computing Time: 35 sec


In [7]:
def cluster_children(idx_parent, pop_threshold = 100):
    """Greedily merge the children of one parent block into clusters of at least ``pop_threshold`` people.

    Each child starts in its own cluster. While more than one cluster remains and
    the least-populated cluster is below ``pop_threshold``, that cluster is merged
    with the other cluster whose merged geometry is closest to it. If a single
    cluster remains and is still below the threshold, it is dissolved into the parent.

    Reads the module-level ``census_wgs84`` (columns ``POBTOT`` and ``geometry``)
    and ``parent2children``.

    Parameters
    ----------
    idx_parent : index label of the parent block in ``census_wgs84`` / ``parent2children``.
    pop_threshold : minimum total ``POBTOT`` a cluster must reach.

    Returns
    -------
    (parent, clusters)
        ``parent`` is a list starting with ``idx_parent``, followed by any children
        dissolved into it. ``clusters`` is a list of lists of child index labels
        and can be empty.
    """

    def cluster_pop(cluster):
        # Total population of the blocks in one cluster
        return census_wgs84.loc[cluster,'POBTOT'].sum()

    # Initialize Clusters By Putting Each Child Into Its Own Cluster
    clusters = [[x] for x in parent2children[idx_parent]]

    # Initialize Parent
    parent = [idx_parent]

    # A merge needs at least two clusters
    while len(clusters) > 1:

        # Find Cluster Position With the Smallest Pop
        pops = [cluster_pop(cluster) for cluster in clusters]
        idx_smallest = int(np.argmin(pops))

        # Stop if the smallest cluster is above the threshold
        if pops[idx_smallest] >= pop_threshold:
            break

        # The geometry of the smallest cluster is the same for every candidate: build it once
        smallest_shape = cascaded_union(census_wgs84.loc[clusters[idx_smallest],'geometry'])

        distances = np.array([
            cascaded_union(census_wgs84.loc[cluster,'geometry']).distance(smallest_shape)
            for cluster in clusters], dtype=float)

        # Find Cluster Position Of Its Closest Neighbor.
        # The smallest cluster is excluded explicitly: taking "the second entry of the
        # sorted distances" can select the smallest cluster itself when another
        # cluster is also at distance 0, which would duplicate its members.
        distances[idx_smallest] = np.inf
        idx_closest = int(np.argmin(distances))

        # Create a new cluster merging of the two
        new_cluster = clusters[idx_smallest]+clusters[idx_closest]

        # Remove smallest cluster and its closest neighbor, then append the merged cluster
        clusters = [cluster for idx_cluster,cluster in enumerate(clusters)
                    if idx_cluster not in (idx_smallest,idx_closest)]
        clusters.append(new_cluster)

    # If only one cluster is left and it is below the threshold, dissolve it into its parent
    if len(clusters) == 1 and cluster_pop(clusters[0]) < pop_threshold:
        parent += clusters.pop()

    # List of cluster can be empty
    return parent, clusters

In [8]:
start = timer()
print('Cluster Children')

# First stage: run cluster_children once per parent block and keep the
# (parent, clusters) result under the parent's index label.
parent2clusters = dict()

for i, idx_parent in enumerate(contains.index.unique()):

    # Progress message every 100 parents
    if i % 100 == 0:
        print('# Parent', i)

    parent2clusters[idx_parent] = cluster_children(idx_parent, pop_threshold=100)

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

Cluster Children
# Parent 0
# Parent 100
# Parent 200
# Parent 300
# Parent 400
# Parent 500
# Parent 600
Computing Time: 97 sec


In [9]:
print("Check Parents in Which Children Were Dissolved:")

# Share of parents whose children sum to fewer than 100 people, from the spatial join
children_pop_by_parent = contains.groupby(contains.index)['POBTOT_child'].sum()
print((children_pop_by_parent<100).mean())

# Share of parents whose list grew beyond the parent itself, from the clustering output
n_parents_with_dissolved = sum(1 for (parent, _) in parent2clusters.values() if len(parent) > 1)
print(n_parents_with_dissolved/len(parent2clusters))

Check Parents in Which Children Were Dissolved:
0.10132890365448505
0.10132890365448505


In [10]:
print("Check Population Parents+Children:")

# Total population of parents and children according to the spatial join
pop_from_join = contains.groupby(contains.index).agg(
    {'POBTOT_parent':'first','POBTOT_child':'sum'}).sum().sum()
print(pop_from_join)

# Same total recomputed from the clustering output: parent lists plus child clusters
pop_in_parents = sum(
    census_wgs84.loc[parent,'POBTOT'].sum()
    for (parent, _) in parent2clusters.values())
pop_in_child_clusters = sum(
    census_wgs84.loc[cluster,'POBTOT'].sum()
    for (_, child_clusters) in parent2clusters.values()
    for cluster in child_clusters)
print(pop_in_parents + pop_in_child_clusters)

Check Population Parents+Children:
616579
616579


In [11]:
def plot_parent2children(idx_parent):
    """Plot one parent block with the blocks it contains and save the figure as a PDF.

    The parent is drawn in translucent black and labelled with its population in
    large bold text. Its children are drawn in a single palette colour, each
    labelled with its own population at its centroid.

    Reads the module-level ``census_wgs84`` and ``parent2children``.

    Parameters
    ----------
    idx_parent : index label of the parent block in ``census_wgs84``. If it has no
        entry in ``parent2children``, a message is printed and nothing is drawn.

    Returns
    -------
    None. The figure is written to ``../figures/parent2children-<CVEGEO>.pdf``.
    """

    colors = sns.color_palette("Set3", 10)

    if idx_parent not in parent2children:
        print('No Children :(')
        return

    fig, ax = plt.subplots(figsize = (15,15))

    data_parent = census_wgs84.loc[[idx_parent]].copy()

    data_parent.plot(
        ax=ax,
        linewidth=0,
        color='k',
        alpha=0.25,
        edgecolor='k')

    # Use the single parent geometry so the annotation receives scalar coordinates
    parent_centroid = data_parent['geometry'].iloc[0].centroid

    # The label is passed positionally: its keyword name differs across
    # Matplotlib versions (`s` in older releases, `text` in newer ones).
    ax.annotate(
        str(data_parent['POBTOT'].values[0]),
        xy=(parent_centroid.x, parent_centroid.y),
        horizontalalignment='center',
        color='k',
        fontsize=30,
        alpha=1,
        fontweight='bold')

    data_children = census_wgs84.loc[parent2children[idx_parent]].copy()

    data_children.plot(
        ax=ax,
        color=colors[-1],
        linewidth=0,
        edgecolor=colors[-1])

    for idx, row in data_children.iterrows():
        child_centroid = row['geometry'].centroid
        ax.annotate(
            str(row['POBTOT']),
            xy=(child_centroid.x, child_centroid.y),
            horizontalalignment='center',
            color='k')

    ax.axis('off')
    plt.savefig('../figures/parent2children-'+census_wgs84.loc[idx_parent,'CVEGEO']+'.pdf',bbox_inches='tight')

In [12]:
def plot_childrenclusters(idx_parent,pop_threshold = 100):
    """Plot the first-stage clustering of one parent block and save the figure as a PDF.

    ``cluster_children`` is re-run for ``idx_parent``. The parent (with any children
    dissolved into it) is drawn in translucent black and labelled with its total
    population. Each remaining child cluster is dissolved into one shape, coloured
    with the ``YlOrRd`` colormap and labelled with its summed population.

    Reads the module-level ``census_wgs84`` and ``parent2children``.

    Parameters
    ----------
    idx_parent : index label of the parent block in ``census_wgs84``. If it has no
        entry in ``parent2children``, a message is printed and nothing is drawn.
    pop_threshold : population threshold forwarded to ``cluster_children``.

    Returns
    -------
    None. The figure is written to ``../figures/childrenclusters-<CVEGEO>.pdf``.
    """

    colors = sns.color_palette("Set3", 10)

    if idx_parent not in parent2children:
        print('No Children :(')
        return

    parent, clusters = cluster_children(idx_parent,pop_threshold)

    fig, ax = plt.subplots(figsize = (15,15))

    # `columns=` replaces the positional axis argument, which recent pandas rejects
    data_parent = census_wgs84.loc[parent].drop(columns='CVEGEO').reset_index(drop=True)

    data_parent.plot(ax=ax,color='k',alpha=0.25)

    # Merge the parent geometries once and reuse the centroid for both coordinates
    parent_centroid = cascaded_union(data_parent['geometry']).centroid

    # The label is passed positionally: its keyword name differs across
    # Matplotlib versions (`s` in older releases, `text` in newer ones).
    ax.annotate(
        str(data_parent['POBTOT'].sum()),
        xy=(parent_centroid.x, parent_centroid.y),
        horizontalalignment='center',
        color='k',
        fontsize=30,
        fontweight='bold',
        alpha=1)

    if len(clusters):

        # Stack the clusters under keys 0..n-1, then dissolve on that key so that
        # each cluster becomes one shape with summed attributes.
        data_clusters = pd.concat([
            census_wgs84.loc[cluster].drop(columns='CVEGEO') for cluster in clusters],
            keys=range(len(clusters))).reset_index(
            level=1,drop=True).reset_index().dissolve(by='index', aggfunc='sum')

        data_clusters.plot(ax=ax,cmap='YlOrRd')

        for idx, row in data_clusters.iterrows():
            cluster_centroid = row['geometry'].centroid
            ax.annotate(
                str(row['POBTOT']),
                xy=(cluster_centroid.x, cluster_centroid.y),
                horizontalalignment='center',
                color='k')

    ax.axis('off')
    plt.savefig('../figures/childrenclusters-'+census_wgs84.loc[idx_parent,'CVEGEO']+'.pdf',bbox_inches='tight')

In [13]:
# plot_parent2children(9)
# plot_childrenclusters(9)

In [14]:
# plot_parent2children(52)
# plot_childrenclusters(52)

In [15]:
# plot_parent2children(294)
# plot_childrenclusters(294)

# 2nd Stage

In [16]:
# Compute Distance Between 2 Clusters
def get_distance(smallest, cluster):
    """Return the distance between the merged geometries of two clusters.

    Both arguments are lists of index labels into the module-level ``census_wgs84``.
    The geometries of each list are merged with ``cascaded_union`` before the
    distance is taken.
    """
    cluster_shape = cascaded_union(census_wgs84.loc[cluster,'geometry'])
    smallest_shape = cascaded_union(census_wgs84.loc[smallest,'geometry'])
    return cluster_shape.distance(smallest_shape)

In [17]:
def cluster_blocks(pop_threshold=100):
    """Second-stage clustering: merge small clusters with their nearest cluster.

    The starting clusters are (a) every block of ``census_wgs84`` that is neither a
    parent nor a child in ``contains``, each on its own, and (b) the parent lists
    produced by the first stage (``parent2clusters``). The child clusters of the
    first stage are not part of this stage.

    While the least-populated cluster is below ``pop_threshold``, it is merged with
    the other cluster at the smallest ``get_distance``. Distances are computed in
    parallel with a ``multiprocessing`` pool. Progress and elapsed time are printed
    every 100 iterations.

    Parameters
    ----------
    pop_threshold : minimum total ``POBTOT`` a cluster must reach.

    Returns
    -------
    list of lists of index labels into ``census_wgs84``. If everything collapses
    into a single cluster, that cluster is returned even if it is below the threshold.
    """

    # Take all the parents' neighbors
    neighbors=sorted(set(census_wgs84.index).difference(set(contains.index)).difference(set(contains.index_child)))

    # Initialize Clusters By Putting Each Neighbor In Its Own Cluster and Allow Merges With Parents
    clusters=[[neighbor] for neighbor in neighbors]+[parent for (parent,_) in parent2clusters.values()]

    # Initialize Clusters Populations
    pops=[census_wgs84.loc[cluster,'POBTOT'].sum() for cluster in clusters]

    i = 0
    start = timer()

    # One pool for the whole run: creating a new pool on every iteration only adds start-up cost
    with mp.Pool() as pool:

        while True:

            if not i % 100:
                print('Iteration', i)
                end = timer()
                print('Computing Time:', round(end - start), 'Sec')
                start = timer()

            # A merge needs at least two clusters
            if len(clusters) < 2:
                break

            # Find Cluster Position With the Smallest Pop
            idx_smallest=int(np.argmin(pops))

            # Stop if the smallest cluster is above the threshold
            if pops[idx_smallest]>=pop_threshold:
                break

            # First Dimension Gets Partialized
            partial_distance = partial(get_distance, clusters[idx_smallest])

            # Compute Distances to Smallest Cluster
            distances = np.asarray(pool.map(partial_distance, clusters), dtype=float)

            # Find Cluster Position Of Its Closest Neighbor.
            # The smallest cluster is excluded explicitly: taking "the second entry of
            # the sorted distances" can select the smallest cluster itself when
            # another cluster is also at distance 0, which would duplicate its members.
            distances[idx_smallest] = np.inf
            idx_closest = int(np.argmin(distances))

            # Create a new cluster merging the smallest cluster and its closest neighbor
            new_cluster = clusters[idx_smallest]+clusters[idx_closest]
            new_pop = census_wgs84.loc[new_cluster,'POBTOT'].sum()

            # Remove smallest cluster and its closest neighbor
            merged = (idx_smallest,idx_closest)
            clusters = [cluster for idx_cluster,cluster in enumerate(clusters) if idx_cluster not in merged]
            pops = [pop for idx_pop,pop in enumerate(pops) if idx_pop not in merged]

            # Append the new cluster and its pop
            clusters.append(new_cluster)
            pops.append(new_pop)

            i+=1

    return clusters

In [18]:
start = timer()
print('Cluster Blocks')

# Second stage: merge the remaining small clusters until each reaches 100 people
clusters = cluster_blocks(pop_threshold=100)

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

Cluster Blocks
Iteration 0
Computing Time: 0 Sec
Iteration 10
Computing Time: 1 Sec
Iteration 20
Computing Time: 1 Sec
Iteration 30
Computing Time: 1 Sec
Iteration 40
Computing Time: 1 Sec
Computing Time: 6 sec


In [19]:
# Hand-entered measurements; presumably (problem size, runtime in seconds) pairs
# from trial runs of the clustering -- TODO confirm what the two columns are.
measured_sizes = [50, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
measured_values = [3, 7, 12, 20, 32, 57, 80, 94, 128, 193, 224, 259]
df = pd.DataFrame(list(zip(measured_sizes, measured_values)))

# Fit a degree-2 polynomial to column 1 as a function of column 0
z = np.polyfit(df[0].values, df[1].values, 2)
p = np.poly1d(z)

# Evaluate the fit at 53830 and convert the result from seconds to days
print('# Days', p(53830)/(3600*24))

# Days 7.651657732689291


In [22]:
print('SAVE')
start = timer()

# Three objects are pickled back-to-back into one file, in this order:
# index -> CVEGEO mapping, first-stage result, second-stage clusters.
with open("../data/clusters-cdmx.pkl","wb") as f:
    for payload in (census_wgs84['CVEGEO'].to_dict(), parent2clusters, clusters):
        pickle.dump(payload, f)

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

SAVE
Computing Time: 0 sec


# Inspect Results

In [37]:
print('LOAD')
start = timer()

# Read the three pickled objects back in the order they were written
with open("../data/clusters-cdmx.pkl","rb") as f:
    idx2CVEGEO, parent2clusters, clusters = (pickle.load(f) for _ in range(3))

# Put the first-stage child clusters in front of the second-stage clusters.
# `clusters` is reloaded from disk just above, so re-running this cell does not
# prepend the child clusters twice.
first_stage_clusters = [
    cluster
    for (_, child_clusters) in parent2clusters.values()
    for cluster in child_clusters]
clusters = first_stage_clusters + clusters

end = timer()
elapsed = round(end - start)
print('Computing Time:', elapsed, 'sec')

LOAD
Computing Time: 0 sec


In [38]:
# Number of clusters in the combined list (first-stage child clusters + second-stage clusters)
print('# Clusters:', len(clusters))

# Clusters: 35843


In [44]:
# Total population (POBTOT) of each cluster, in the same order as `clusters`
cluster_totals = [census_wgs84.loc[members,'POBTOT'].sum() for members in clusters]
pops = pd.Series(cluster_totals)

In [46]:
# Summary statistics of the cluster populations
pops.describe()

count    35843.000000
mean       244.569288
std        214.690052
min        100.000000
25%        141.000000
50%        191.000000
75%        272.000000
max      12964.000000
dtype: float64