# Description

- This code computes clusters of census blocks, each resulting cluster having a population above a predetermined threshold (e.g. 100)

- It allows matching privacy-sensitive geocoded information with census data at any acceptable level of granularity

- It takes a given country's shapefiles as input (e.g. Mexico shapefiles from INEGI)

- It iteratively merges each cluster of blocks with a population below the threshold with its closest neighbor

    - When census blocks are nested, we first cluster children before clustering parents
    
    - When several admin levels are present, we iteratively cluster at each admin level
    
    - Rural areas that are not partitioned into blocks are clustered separately
    
    - One extension would be to only allow merges between urban clusters if their distance is below a cutoff
    
    - Another extension would be to only allow merges between urban clusters if the difference in average wealth is below a cutoff

In [1]:
from timeit import default_timer as timer
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.ops import unary_union
from shapely.geometry import MultiPolygon, Polygon
import multiprocessing as mp
from itertools import chain
from functools import partial

# Import Data

In [2]:
# Root directory that every input and output path below is built from.
# NOTE(review): hardcoded absolute path; adjust to the local environment before running.
path_to_data = '/scratch/spf248/cuebiq/data/'

In [3]:
# Read the three census layers (blocks, urban localities, municipalities)
# from GeoJSON files under `path_to_data` and report their row counts.
print('Import Census Shapes')
start = timer()

blocks = gpd.read_file(path_to_data+'census/blocks-mexico.geojson')
urban  = gpd.read_file(path_to_data+'census/urban-mexico.geojson')
munic  = gpd.read_file(path_to_data+'census/munic-mexico.geojson')
print('# Blocks:', blocks.shape[0])
print('# Urban:', urban.shape[0])
print('# Munic:', munic.shape[0])

end = timer()
print('Computing Time:', round(end - start), 'sec')

Import Census Shapes
# Blocks: 1376969
# Urban: 4525
# Munic: 2456
Computing Time: 270 sec


In [4]:
# Index blocks by their BLOCK identifier so they can be looked up with `.loc`.
# The guard keeps this cell idempotent: once BLOCK has been moved into the
# index, re-running an unguarded set_index would raise a KeyError.
if blocks.index.name != 'BLOCK':
    blocks.set_index('BLOCK',inplace=True)

In [5]:
# Preview the first rows of the block table.
blocks.head()

Unnamed: 0_level_0,AGEB,URBAN,MUNIC,POB1,geometry
BLOCK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100100010229001,100100010229,10010001,1001,65,(POLYGON ((-102.295873492189 21.92998714417593...
100100010229002,100100010229,10010001,1001,0,(POLYGON ((-102.2919195976506 21.9231902834303...
100100010229003,100100010229,10010001,1001,0,(POLYGON ((-102.2916304870713 21.9189053905925...
100100010229004,100100010229,10010001,1001,0,(POLYGON ((-102.2948584083044 21.9171921055255...
100100010229006,100100010229,10010001,1001,25,(POLYGON ((-102.2962004535445 21.9312597033061...


In [6]:
# Index labels of the blocks whose POB1 value is zero.
empty_blocks = blocks.loc[blocks['POB1']==0].index.tolist()
print('# empty blocks:', len(empty_blocks))

# empty blocks: 155073


# Library of Functions

In [7]:
# Minimum distance between two geometries
def dist_geoms(geom1,geom2):
    """Return ``geom1.distance(geom2)``.

    Defined as a plain two-argument function so that its first argument can
    be fixed with ``functools.partial`` and the result mapped over a list of
    geometries by a multiprocessing pool.
    """
    distance = geom1.distance(geom2)
    return distance

In [8]:
# Union of the geometries that make up one cluster
def union_geoms(cluster, data=blocks):
    """Return the unary union of ``data.loc[cluster, 'geometry']``.

    cluster : list of index labels to select from ``data``
    data    : frame with a 'geometry' column; the default is the ``blocks``
              object that exists at the time this function is defined
    """
    cluster_geoms = data.loc[cluster,'geometry']
    return unary_union(cluster_geoms)

In [9]:
# Flatten nested lists of any depth
def flatten_list(nested_list):
    """Return a flat list with the non-list items of ``nested_list`` in order.

    ``nested_list`` may be any iterable; every element that is a ``list``
    (including list subclasses) is expanded in place, however deeply nested.
    Other containers such as tuples are kept as single items.

    The traversal uses an explicit stack of iterators rather than recursion,
    so very deep nesting cannot hit Python's recursion limit.
    """
    flat_list = []
    # Each stack entry is a partially consumed iterator over one nesting level.
    stack = [iter(nested_list)]
    while stack:
        for item in stack[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator keeps its position for later.
                stack.append(iter(item))
                break
            flat_list.append(item)
        else:
            # Current level exhausted: resume the enclosing level.
            stack.pop()
    return flat_list

In [10]:
# Select nested shapes from geodataframe
def nested_shapes(df,index_col='BLOCK'):
    """Return the (child, parent) pairs of shapes in ``df`` nested in one another.

    ``df`` is spatially joined with itself using ``op='within'``, with the
    left side suffixed ``_child`` and the right side ``_parent``. Rows where a
    shape is matched with itself are dropped, and the result is sorted by
    parent id then child id with a fresh integer index.

    df        : GeoDataFrame whose index becomes a column named ``index_col``
                after ``reset_index()``
    index_col : name of that identifier column

    NOTE(review): newer geopandas releases spell the ``op`` keyword
    ``predicate`` - confirm against the installed version.
    """
    child_col  = index_col+'_child'
    parent_col = index_col+'_parent'

    # Self-join: left rows (children) lying within right rows (parents)
    left  = df.reset_index()
    right = df.reset_index()
    pairs = gpd.sjoin(left, right, op='within', lsuffix='child', rsuffix='parent')

    # Every shape is trivially within itself; discard those matches
    is_distinct = pairs[parent_col]!=pairs[child_col]
    pairs = pairs[is_distinct]

    pairs = pairs.sort_values(by=[parent_col,child_col])
    return pairs.reset_index(drop=True)

In [11]:
def cluster_shapes(clusters, pops, geoms, mp_dists=False, pop_threshold=100):
    """Greedily merge clusters until each one reaches ``pop_threshold``.

    At every step the cluster with the smallest population is merged with the
    cluster whose geometry is closest to it. Ties in population or distance
    are broken by a fixed pseudo-random vector (seed 0), so results are
    reproducible. Merging stops as soon as the smallest population is at
    least ``pop_threshold`` or only one cluster is left.

    Parameters
    ----------
    clusters : list of lists
        One list of member identifiers per cluster.
    pops : list of numbers
        Population of each cluster, aligned with ``clusters``.
    geoms : list of geometries
        Geometry of each cluster, aligned with ``clusters``; each must expose
        ``.distance(other)``.
    mp_dists : bool
        If True, distances are computed with a multiprocessing pool.
    pop_threshold : number
        Minimum population a cluster needs to stop being merged.

    Returns
    -------
    (clusters, pops, geoms) : the three updated lists. The input lists are
    never modified in place; they are returned as-is when nothing is merged.

    Notes
    -----
    Tie-break vectors are drawn from a private ``RandomState(0)``, which
    yields the same values as ``np.random.seed(0); np.random.random(n)``
    without resetting the caller's global NumPy random state. When
    ``mp_dists`` is set, a single worker pool is created on first use and
    reused for all merge steps rather than being re-spawned every iteration.
    """
    pool = None
    try:
        # Stop if there is only one cluster left
        while len(clusters) > 1:

            # Random vector to break ties between equal populations
            pops_tiebreak = np.random.RandomState(0).random_sample(len(pops))

            # Position of the cluster with the smallest population
            pos_smallest = int(np.lexsort((pops_tiebreak, pops))[0])

            # Stop if the smallest cluster is above the threshold
            if pops[pos_smallest] >= pop_threshold:
                break

            # Distances from every cluster to the smallest cluster
            if mp_dists:
                if pool is None:
                    pool = mp.Pool()
                dist_smallest = partial(dist_geoms, geoms[pos_smallest])
                dists = pool.map(dist_smallest, geoms)
            else:
                dists = [geom.distance(geoms[pos_smallest]) for geom in geoms]

            # Random vector to break ties between equal distances
            dists_tiebreak = np.random.RandomState(0).random_sample(len(dists))

            # Closest neighbour, skipping the smallest cluster itself
            pos_dists   = np.lexsort((dists_tiebreak, dists))
            pos_closest = next(int(pos) for pos in pos_dists if pos != pos_smallest)

            # New cluster merging the smallest cluster and its closest neighbour
            new_cluster = clusters[pos_smallest] + clusters[pos_closest]
            new_pop     = pops[pos_smallest] + pops[pos_closest]
            new_geom    = unary_union([geoms[pos_smallest], geoms[pos_closest]])

            # Rebuild the lists without the two merged entries (inputs stay intact)
            merged = (pos_smallest, pos_closest)
            clusters = [cluster for pos, cluster in enumerate(clusters) if pos not in merged]
            pops     = [pop for pos, pop in enumerate(pops) if pos not in merged]
            geoms    = [geom for pos, geom in enumerate(geoms) if pos not in merged]

            # Append the merged cluster at the end
            clusters.append(new_cluster)
            pops.append(new_pop)
            geoms.append(new_geom)
    finally:
        # Same shutdown a `with mp.Pool()` block performs on exit
        if pool is not None:
            pool.terminate()

    return clusters, pops, geoms

# Agglomerative Clustering of Children Blocks

In [12]:
# Find every block lying within another block, then keep for each
# (parent, child) pair the parent's admin codes together with the child's
# identifier (renamed CLUSTER), population and the joined geometry column.
print('Extract nested blocks')
start = timer()

nested_blocks = nested_shapes(blocks)
print('# nested blocks:', nested_blocks.shape[0])
    
nested_blocks = nested_blocks[[
'MUNIC_parent',
'URBAN_parent',
'AGEB_parent',
'BLOCK_parent',
'BLOCK_child',
'POB1_child',
'geometry',
]].rename(columns={
'MUNIC_parent':'MUNIC',
'URBAN_parent':'URBAN',
'AGEB_parent':'AGEB',
'BLOCK_parent':'BLOCK',
'BLOCK_child':'CLUSTER',
'POB1_child':'POB1',
}).copy()
    
end = timer()
print('Computing Time:', round(end - start), 'sec')

Extract nested blocks
# nested blocks: 10005
Computing Time: 564 sec


In [13]:
print('Cluster non-empty children at the parent level')
start = timer()

# For each parent with at least one non-empty child, 
# initialize children to clusters, along with their pops, and geometries
# Note: a parent can be empty even if its child is not
parent2children = \
nested_blocks[nested_blocks['POB1']>0].groupby(
by=['MUNIC','URBAN','AGEB','BLOCK']).agg({
'CLUSTER':lambda x:[[y] for y in x],'POB1':list,'geometry':list})

print('# parent blocks containing at least one non-empty child:', len(parent2children))
print('# clusters before clustering:', parent2children['CLUSTER'].apply(len).sum())
print('# blocks before clustering:', len(flatten_list(parent2children['CLUSTER'])))
print('Population before clustering:', sum(parent2children['POB1'].sum()))

# Randomize index
idx_parent = parent2children.sample(frac=1,random_state=0).index

# Initialize input
# One (clusters, pops, geoms) triple per parent, in the shuffled order
x_parent = zip(
parent2children.loc[idx_parent,'CLUSTER'],
parent2children.loc[idx_parent,'POB1'],
parent2children.loc[idx_parent,'geometry'],
)

# Each parent is clustered independently in a worker process;
# cluster_shapes runs with its default mp_dists and pop_threshold
with mp.Pool() as pool:
    x_parent = pool.starmap(cluster_shapes, x_parent)

# Unravel output
# zip(*...) turns the list of per-parent result triples into three parallel lists
clusters_parent, pops_parent, geoms_parent = map(list,zip(*x_parent))

# Write the results back in the same shuffled order they were read in
parent2children.loc[idx_parent,'CLUSTER'] = clusters_parent.copy()
parent2children.loc[idx_parent,'POB1'] = pops_parent.copy()
parent2children.loc[idx_parent,'geometry'] = geoms_parent.copy()

print('# clusters after clustering:', parent2children['CLUSTER'].apply(len).sum())
print('# blocks after clustering:', len(flatten_list(parent2children['CLUSTER'])))
print('Population after clustering:', sum(parent2children['POB1'].sum()))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Cluster non-empty children at the parent level
# parent blocks containing at least one non-empty child: 1068
# clusters before clustering: 9621
# blocks before clustering: 9621
Population before clustering: 593532
# clusters after clustering: 3269
# blocks after clustering: 9621
Population after clustering: 593532
Computing Time: 4 sec


# Agglomerative Clustering of Blocks By Admin Level

In [14]:
# Build, for every (MUNIC, URBAN, AGEB) key, the initial lists of clusters,
# populations and geometries that are clustered at the AGEB level.
start = timer()

print('Initialize non-empty adult clusters')

# Blocks that are neither empty nor already part of a children cluster,
# each starting as its own single-block cluster
ageb2adults = blocks.drop(empty_blocks,errors='ignore').drop(
flatten_list(parent2children['CLUSTER']),errors='ignore').reset_index().rename(
columns={'BLOCK':'CLUSTER'}).groupby(['MUNIC','URBAN','AGEB']).agg({
'CLUSTER':lambda x:[[y] for y in x],'POB1':list,'geometry':list})

print('Initialize children clusters')

# Summing list-valued cells per AGEB; presumably this concatenates the
# per-parent lists - TODO confirm on the installed pandas version
ageb2children = parent2children.groupby(['MUNIC','URBAN','AGEB']).sum()

print('Initialize all AGEB-level clusters')

# np.empty((n, 0)).tolist() yields n separate empty lists, one per AGEB key
ageb2clusters = pd.DataFrame({
'CLUSTER':np.empty((len(ageb2adults.index.union(ageb2children.index)), 0)).tolist(),
'POB1':np.empty((len(ageb2adults.index.union(ageb2children.index)), 0)).tolist(),
'geometry':np.empty((len(ageb2adults.index.union(ageb2children.index)), 0)).tolist()},
index=ageb2adults.index.union(ageb2children.index))

# Add the adult lists, then the children lists, to the empty placeholders
ageb2clusters.loc[ageb2adults.index] = ageb2clusters.loc[ageb2adults.index].add(ageb2adults)
ageb2clusters.loc[ageb2children.index] = ageb2clusters.loc[ageb2children.index].add(ageb2children)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Initialize non-empty adult clusters
Initialize children clusters
Initialize all AGEB-level clusters
Computing Time: 25 sec


In [15]:
# Run cluster_shapes independently on every AGEB row, in parallel.
print('Cluster at the AGEB level')
start = timer()

print('# clusters before clustering:', ageb2clusters['CLUSTER'].apply(len).sum())
print('# blocks before clustering:', len(flatten_list(ageb2clusters['CLUSTER'])))
print('Population before clustering:', sum(ageb2clusters['POB1'].sum()))

# Randomize index
idx_ageb = ageb2clusters.sample(frac=1,random_state=0).index

# Initialize input
# One (clusters, pops, geoms) triple per AGEB, in the shuffled order
x_ageb = zip(
ageb2clusters.loc[idx_ageb,'CLUSTER'],
ageb2clusters.loc[idx_ageb,'POB1'],
ageb2clusters.loc[idx_ageb,'geometry'],
)

with mp.Pool() as pool:
    x_ageb = pool.starmap(cluster_shapes, x_ageb)
    
# Unravel output
# zip(*...) turns the list of per-AGEB result triples into three parallel lists
clusters_ageb, pops_ageb, geoms_ageb = map(list,zip(*x_ageb))

# Write the results back in the same shuffled order they were read in
ageb2clusters.loc[idx_ageb,'CLUSTER'] = clusters_ageb.copy()
ageb2clusters.loc[idx_ageb,'POB1'] = pops_ageb.copy()
ageb2clusters.loc[idx_ageb,'geometry'] = geoms_ageb.copy()

print('# clusters after clustering:', ageb2clusters['CLUSTER'].apply(len).sum())
print('# blocks after clustering:', len(flatten_list(ageb2clusters['CLUSTER'])))
print('population after clustering:', sum(ageb2clusters['POB1'].sum()))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Cluster at the AGEB level
# clusters before clustering: 1215544
# blocks before clustering: 1221896
Population before clustering: 86984906
# clusters after clustering: 438841
# blocks after clustering: 1221896
population after clustering: 86984906
Computing Time: 296 sec


In [16]:
# Pool the AGEB-level results per urban locality and run cluster_shapes
# again, so clusters still below the threshold can merge across AGEBs.
print('Cluster at the urban locality level')
start = timer()

#Initialize all urban locality level clusters
# Summing list-valued cells per (MUNIC, URBAN); presumably this concatenates
# the per-AGEB lists - TODO confirm on the installed pandas version
urban2clusters = ageb2clusters.groupby(['MUNIC','URBAN']).sum()

print('# clusters before clustering:', urban2clusters['CLUSTER'].apply(len).sum())
print('# blocks before clustering:', len(flatten_list(urban2clusters['CLUSTER'])))
print('Population before clustering:', sum(urban2clusters['POB1'].sum()))

# Randomize index
idx_urban = urban2clusters.sample(frac=1,random_state=0).index

# Initialize input
# One (clusters, pops, geoms) triple per urban locality, in the shuffled order
x_urban = zip(
urban2clusters.loc[idx_urban,'CLUSTER'],
urban2clusters.loc[idx_urban,'POB1'],
urban2clusters.loc[idx_urban,'geometry'],
)

with mp.Pool() as pool:
    x_urban = pool.starmap(cluster_shapes, x_urban)
    
# Unravel output
# zip(*...) turns the list of per-locality result triples into three parallel lists
clusters_urban, pops_urban, geoms_urban = map(list,zip(*x_urban))

# Write the results back in the same shuffled order they were read in
urban2clusters.loc[idx_urban,'CLUSTER'] = clusters_urban.copy()
urban2clusters.loc[idx_urban,'POB1'] = pops_urban.copy()
urban2clusters.loc[idx_urban,'geometry'] = geoms_urban.copy()

print('# clusters after clustering:', urban2clusters['CLUSTER'].apply(len).sum())
print('# blocks after clustering:', len(flatten_list(urban2clusters['CLUSTER'])))
print('Population after clustering:', sum(urban2clusters['POB1'].sum()))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Cluster at the urban locality level
Initialize all urban locality level clusters
# clusters before clustering: 438841
# blocks before clustering: 1221896
Population before clustering: 86984906
# clusters after clustering: 428498
# blocks after clustering: 1221896
Population after clustering: 86984906
Computing Time: 147 sec


In [17]:
# Preview: each row holds list-valued CLUSTER, POB1 and geometry cells for one urban locality.
urban2clusters.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CLUSTER,POB1,geometry
MUNIC,URBAN,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,10010001,"[[0100100010229006, 0100100010229008], [010010...","[103, 307, 194, 246, 202, 154, 101, 173, 146, ...",[(POLYGON ((-102.2972162668094 21.931158026454...
1001,10010239,"[[0100102394185033, 0100102394185034, 01001023...","[186, 104, 164, 142, 360, 107, 102, 204, 107, ...",[(POLYGON ((-102.2091563725392 21.996354505718...
1001,10010293,"[[0100102933666004], [0100102933666005], [0100...","[156, 110, 124, 204, 303, 179, 237, 193, 232, ...",[(POLYGON ((-102.216099018286 21.8884090045941...
1001,10010357,"[[0100103574166010], [0100103574166001, 010010...","[139, 288, 140, 242, 206, 209, 117, 145, 142, ...",[(POLYGON ((-102.2061994330045 21.862280433716...
1001,10010479,"[[0100104792742006], [0100104792742017], [0100...","[111, 119, 114, 175, 166, 324, 142, 163, 265, ...",[(POLYGON ((-102.1880557592149 21.822819361578...


In [18]:
print('Flatten list of clusters')
start = timer()

# Expand every list-valued cell into one row per cluster: apply(pd.Series)
# spreads a row's list across columns, stack() turns those columns into rows,
# and dropping index level 2 removes the within-row position so the three
# results align on (MUNIC, URBAN).
# `axis=1` is passed by keyword: pandas deprecated and later removed
# positional arguments other than `objs` in pd.concat.
clusters = gpd.GeoDataFrame(pd.concat([
urban2clusters['CLUSTER'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('CLUSTER'),
urban2clusters['POB1'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('POB1'),],axis=1),
geometry=urban2clusters['geometry'].apply(pd.Series).stack().reset_index(level=2, drop=True))

end = timer()
print('Computing Time:', round(end - start), 'sec')

Flatten list of clusters
Computing Time: 35 sec


In [19]:
# Preview: one row per cluster, indexed by (MUNIC, URBAN).
clusters.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,CLUSTER,POB1,geometry
MUNIC,URBAN,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,10010001,"[0100100010229006, 0100100010229008]",103.0,(POLYGON ((-102.2972162668094 21.9311580264542...
1001,10010001,"[0100100010229007, 0100100010229019, 010010001...",307.0,(POLYGON ((-102.2960545266846 21.9172431949010...
1001,10010001,[0100100010233001],194.0,(POLYGON ((-102.3172373846377 21.9077531607741...
1001,10010001,[0100100010233005],246.0,(POLYGON ((-102.319460545915 21.90867287089753...
1001,10010001,[0100100010233012],202.0,(POLYGON ((-102.3088704296389 21.9029877168594...


# Remove Non-dissolved Children Shapes

In [20]:
# Build the lookups needed to tell, for every cluster, which of its member
# blocks are parents and which children those parents contain.
print('Retrieve children of non-empty parents each cluster')
start = timer()

# Non-empty parents containing non-empty children 
parents_list = list(set(parent2children.index.get_level_values('BLOCK').unique()).difference(empty_blocks))

# Non-empty parents to list of non-empty children
# Result is indexed by parent BLOCK only; each value is a flat list of child ids
parent2childrenlist=parent2children.reset_index(
drop=True,level=['MUNIC','URBAN','AGEB'])['CLUSTER'].apply(flatten_list).drop(empty_blocks,errors='ignore')

def get_children(parents):
    """Return the children of every parent in ``parents`` as one flat list.

    Each parent is looked up in the module-level ``parent2childrenlist`` and
    the resulting child lists are concatenated in the order of ``parents``.
    """
    per_parent = (parent2childrenlist[parent] for parent in parents)
    return list(chain.from_iterable(per_parent))

# PARENTS: members of the cluster that are non-empty parent blocks
clusters['PARENTS'] = clusters['CLUSTER'].apply(lambda x:list(set(x).intersection(parents_list)))
# CHILDREN: all non-empty children of those parents
clusters['CHILDREN'] = clusters['PARENTS'].apply(get_children)
# Children that belong to the same cluster as their parent ...
clusters['DISSOLVED_CHILDREN'] = clusters.apply(lambda x:list(set(x['CHILDREN']).intersection(x['CLUSTER'])),1)
# ... versus children that ended up in a different cluster
clusters['NON_DISSOLVED_CHILDREN'] = clusters.apply(lambda x:list(set(x['CHILDREN']).difference(x['CLUSTER'])),1)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Retrieve children of non-empty parents each cluster
Computing Time: 115 sec


In [21]:
# Cut out of each cluster geometry the area of children that belong to a
# different cluster than their parent.
print('Remove non-dissolved children from each non-empty parents geometry')
start = timer()

# union_geoms is called with its default `data`, i.e. the block table
with mp.Pool() as pool:
    clusters['GEOM_DISSOLVED_CHILDREN'] = pool.map(union_geoms, clusters['DISSOLVED_CHILDREN'])

with mp.Pool() as pool:
    clusters['GEOM_NON_DISSOLVED_CHILDREN'] = pool.map(union_geoms, clusters['NON_DISSOLVED_CHILDREN'])

# Overwrites the cluster geometry with the part not covered by non-dissolved children
clusters['geometry'] = clusters.apply(lambda x:x['geometry'].difference(x['GEOM_NON_DISSOLVED_CHILDREN']),1)

end = timer()
print('Computing Time:', round(end - start), 'sec')

Remove non-dissolved children from each non-empty parents geometry
Computing Time: 119 sec


In [22]:
# Sanity checks: the counts printed in each pair below are meant to agree.
print('# non-empty parents', len(set(parents_list)))
print('# non-empty parents clustered', clusters['PARENTS'].apply(len).sum())

print('# non-empty children in non-empty parent',parent2childrenlist.apply(len).sum())
print('# non-empty children in non-empty parent clustered', clusters['CHILDREN'].apply(len).sum())

print('# non-empty dissolved children of non-empty parents', clusters['DISSOLVED_CHILDREN'].apply(len).sum())
print('# non-empty non-dissolved children of non-empty parents', clusters['NON_DISSOLVED_CHILDREN'].apply(len).sum())

# Area comparison: the second figure sums, per cluster, the areas of the
# trimmed geometry, the dissolved-children union and the non-dissolved-children
# union, then adds the area of the empty blocks.
print('Area of all blocks:', blocks.geometry.area.sum())
print('Area of all clusters:', clusters.apply(
lambda x:\
x['geometry'].area+\
x['GEOM_DISSOLVED_CHILDREN'].area+\
x['GEOM_NON_DISSOLVED_CHILDREN'].area,1).sum()+\
blocks.loc[empty_blocks].geometry.area.sum())

# non-empty parents 844
# non-empty parents clustered 844
# non-empty children in non-empty parent 7176
# non-empty children in non-empty parent clustered 7176
# non-empty dissolved children of non-empty parents 1153
# non-empty non-dissolved children of non-empty parents 6023
Area of all blocks: 1.641465654837783
Area of all clusters: 1.6414656548377828


In [23]:
# Persist the urban clusters, including the helper columns, as a pickle.
# NOTE(review): this path has no 'clusters/' sub-directory, unlike the rural
# and final outputs saved later in the notebook - confirm this is intended.
print('Save')
start = timer()

clusters.to_pickle(path_to_data+'clusters-urban-mexico.pkl')

end = timer()
print('Computing Time:', round(end - start), 'sec')

Save
Computing Time: 24 sec


# Extract Rural Areas

In [24]:
# Urban areas that are below the privacy threshold will be dissolved
def check_validity(data=clusters, pop_col='POB1', pop_threshold=100):
    """Split ``data`` by comparing ``pop_col`` against ``pop_threshold``.

    Returns a tuple ``(valid, invalid)`` of independent copies: rows with
    ``data[pop_col] >= pop_threshold`` and rows with
    ``data[pop_col] < pop_threshold`` respectively. ``data`` defaults to the
    ``clusters`` object that exists when this function is defined.
    """
    meets_threshold = data[pop_col]>=pop_threshold
    below_threshold = data[pop_col]<pop_threshold

    return data.loc[meets_threshold].copy(), data.loc[below_threshold].copy()

# Split the urban clusters using the function defaults (POB1, threshold 100)
clusters_valid, clusters_invalid = check_validity()

print('# invalid clusters:', clusters_invalid.shape[0])

# invalid clusters: 14


In [25]:
print('Compute rural areas populations')
start = timer()

# Subtracting the population of valid urban areas from the populations of municipalities
# fill_value=0 treats a municipality missing from the valid urban clusters as having none
pops_rural = munic.set_index('MUNIC')[['POB1']].subtract(
clusters_valid.groupby('MUNIC')[['POB1']].sum(),fill_value=0).sort_index().reset_index()

# Only keep non empty rural areas
pops_rural = pops_rural[pops_rural['POB1']>0].copy()
pops_rural.POB1 = pops_rural.POB1.astype(int)
print('# non-empty rural areas:', pops_rural.shape[0])

end = timer()
print('Computing Time:', round(end - start), 'sec')

Compute rural areas populations
# non-empty rural areas: 2398
Computing Time: 0 sec


In [26]:
print('Compute rural areas geometries')
start = timer()

# Remove the geometries of urban areas that were above the privacy threshold from municipality areas geometries
# Only urban localities whose URBAN code appears in clusters_valid are subtracted
geoms_rural = gpd.overlay(
munic[['MUNIC','geometry']], 
urban.loc[urban['URBAN'].isin(clusters_valid.index.get_level_values('URBAN').unique()),['geometry']], 
how='difference')

print('# rural areas:', geoms_rural.shape[0])
print('# municipality areas fully covered by valid urban areas:', munic.shape[0]-geoms_rural.shape[0])

end = timer()
print('Computing Time:', round(end - start), 'sec')

Compute rural areas geometries
# rural areas: 2452
# municipality areas fully covered by valid urban areas: 4
Computing Time: 160 sec


# Cluster Rural Areas

In [27]:
print('Cluster Rural Areas:')
start = timer()

# Some rural areas are below the privacy threshold
# The inner merge keeps only municipalities with both a rural geometry and a
# positive rural population
rural = gpd.GeoDataFrame(pd.merge(geoms_rural,pops_rural,on='MUNIC')[
['MUNIC','POB1','geometry']]).sort_values(by='MUNIC').reset_index(drop=True)

print('# rural shapes before clustering:', rural['MUNIC'].shape[0])
print('Rural population before clustering:', sum(rural['POB1']))
print('Rural area before clustering:', rural.geometry.area.sum())

# Cluster rural areas computing distances in parallel
# All rural areas form one pool of candidates; each starts as its own cluster
x_rural = cluster_shapes(
rural['MUNIC'].apply(lambda x:[x]).tolist(),
rural['POB1'].tolist(),
rural['geometry'].tolist(),
mp_dists=True,
)

# Unravel output
# zip(*x_rural) yields one (cluster, pop, geom) row per resulting cluster
clusters_rural = gpd.GeoDataFrame(zip(*x_rural),columns=['CLUSTER','POB1','geometry'])

# Each row is one cluster, so the cluster count is the number of rows.
# Summing len() over CLUSTER would count member municipalities instead, which
# never changes when clusters merge.
print('# rural clusters after clustering:', clusters_rural.shape[0])
print('Rural population after clustering:', sum(clusters_rural['POB1']))
print('Rural area after clustering:', clusters_rural['geometry'].area.sum())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Cluster Rural Areas:
# rural shapes before clustering: 2398
Rural population before clustering: 25352508
Rural area before clustering: 172.61292698777646
# rural clusters after clustering: 2398
Rural population after clustering: 25352508
Rural area after clustering: 172.61292698777646
Computing Time: 2024 sec


In [28]:
# Preview: one row per rural cluster; CLUSTER lists the member municipality codes.
clusters_rural.head()

Unnamed: 0,CLUSTER,POB1,geometry
0,[01001],56330,(POLYGON ((-102.1064122399267 22.0603544130303...
1,[01002],32739,"POLYGON ((-102.051893439036 22.29143529350414,..."
2,[01003],27480,POLYGON ((-102.6856884472506 22.09962730886251...
3,[01004],10144,"POLYGON ((-102.287865181776 22.41649003941679,..."
4,[01005],31515,POLYGON ((-102.3356775711372 22.05066521496391...


In [29]:
# Persist the rural clusters as a pickle under the 'clusters/' sub-directory.
print('Save')
start = timer()

clusters_rural.to_pickle(path_to_data+'clusters/clusters-rural-mexico.pkl')

end = timer()
print('Computing Time:', round(end - start), 'sec')

Save
Computing Time: 1 sec


# Combine All Clusters

In [30]:
print('Finalize Clusters:')
start = timer()

# Combine Urban and Rural Clusters
# Only the valid urban clusters are kept; the helper columns are dropped
clusters_final = pd.concat([clusters_valid[['CLUSTER', 'POB1', 'geometry']],clusters_rural],sort=True)

# Sort by population and area
# The temporary 'area' column only serves as the secondary sort key; the new
# 0..n-1 index then becomes the 'cluster' identifier.
# `axis`/`columns` are passed by keyword: pandas deprecated and later removed
# the positional form for both pd.concat and DataFrame.drop.
clusters_final = pd.concat([
clusters_final,clusters_final.geometry.area.rename('area'),
],axis=1).sort_values(by=['POB1','area']).drop(columns='area').reset_index(
drop=True).reset_index().rename(columns={
'index':'cluster','POB1':'population','CLUSTER':'cvegeo'})

clusters_final.population = clusters_final.population.astype(int)

# Wrap plain polygons so every geometry in the column is a MultiPolygon
clusters_final['geometry']=\
[MultiPolygon([feature]) if isinstance(feature, Polygon) else feature for feature in clusters_final['geometry']]

clusters_final = clusters_final.reindex(columns=['cluster', 'population', 'cvegeo', 'geometry'])

# The first two totals should match if no population was lost along the way
print('Population:', munic.POB1.sum())
print('Population clusters:', clusters_final.population.sum())
print('Min. population clusters:', clusters_final.population.min())

end = timer()
print('Computing Time:', round(end - start), 'sec')

Finalize Clusters:
Population: 112336538
Population clusters: 112336538
Min. population clusters: 100
Computing Time: 14 sec


In [31]:
# Preview the final table, sorted by ascending population and area.
clusters_final.head()

Unnamed: 0,cluster,population,cvegeo,geometry
0,0,100,[0901600010514003],(POLYGON ((-99.22546446460659 19.4323776478605...
1,1,100,[0900300011552027],(POLYGON ((-99.17264961758036 19.3104308113713...
2,2,100,[1510900031059057],(POLYGON ((-99.16810817973173 19.6222906553012...
3,3,100,[1409800012432018],(POLYGON ((-103.3504229026755 20.6059711208537...
4,4,100,[0900600011179011],(POLYGON ((-99.06789319473418 19.4092513064329...


In [33]:
# Persist the final clusters: a pickle with all columns, and a GeoJSON
# without the list-valued 'cvegeo' column.
print('Save')
start = timer()

clusters_final.to_pickle(path_to_data+'clusters/clusters-mexico.pkl')
# `columns=` replaces the positional axis argument, which pandas deprecated
# and later removed from DataFrame.drop.
clusters_final.drop(columns='cvegeo').to_file(path_to_data+'clusters/clusters-mexico.geojson', driver='GeoJSON')

end = timer()
print('Computing Time:', round(end - start), 'sec')

Save
Computing Time: 287 sec
