# Create_Clusters

In [None]:
import rasterio
from rasterio.mask import mask
from rasterio.features import shapes
from rasterstats import zonal_stats
import geopandas as gpd
import json

# 1. Download

- Population raster: GHS layer from https://ghsl.jrc.ec.europa.eu/data.php
- Admin boundaries: GADM
- Grid lines: from Alex

In [None]:
# File locations used throughout the notebook.

# Inputs: the global GHS GeoTiff, the shapefile used as the clipping
# boundary, and the MV grid lines shapefile.
ghs_in = "GHS_POP_250/GHS_POP_GPW42015_GLOBE_R2015A_54009_250_v1_0.tif"
clip_boundary = "gadm_mwi.shp"
grid_in = "malawi_mv.shp"

# Output: where the clipped copy of the GHS raster is written.
ghs_clip = "GHS_clipped.tif"

# 2. Clip

https://automating-gis-processes.github.io/CSC18/lessons/L6/clipping-raster.html

In [None]:
# Open the global GHS raster. The handle is deliberately left open (no `with`)
# because later cells read its CRS and metadata from it.
ghs = rasterio.open(ghs_in)

In [None]:
# Load the clipping boundary and reproject it into the raster's CRS, then take
# the geometry of its first feature as a GeoJSON-style dict. It is wrapped in
# a list because it is later passed as the `shapes` argument of mask().
# NOTE(review): only features[0] is used; any further features in the boundary
# file are ignored -- confirm the file holds a single feature.
adm = gpd.read_file(clip_boundary).to_crs(crs=ghs.crs)
coords = [json.loads(adm.to_json())['features'][0]['geometry']]

In [None]:
# Clip the open raster to the boundary geometry with rasterio.mask.mask.
# The call returns the clipped pixel array and the transform that goes with
# it; crop=True requests cropping to the shapes rather than masking only.
out_img, out_transform = mask(ghs, coords, crop=True)

In [None]:
# Write the clipped array to disk. The metadata starts as a copy of the source
# raster's and is then overridden with the clipped array's dimensions and
# transform so the output file is georeferenced correctly.
out_meta = ghs.meta.copy()
out_meta.update(
    driver="GTiff",
    height=out_img.shape[1],
    width=out_img.shape[2],
    transform=out_transform,
)

with rasterio.open(ghs_clip, "w", **out_meta) as dest:
    dest.write(out_img)

# 3. Vectorize

https://gis.stackexchange.com/questions/187877/how-to-polygonize-raster-to-shapely-polygons

In [None]:
# Polygonize the first band of the clipped raster. Each (geometry, value)
# pair produced by shapes() becomes a feature dict carrying the pixel value
# in a 'raster_val' property.
with rasterio.open(ghs_clip) as src:
    image = src.read(1)  # first band
    results = (
        {'properties': {'raster_val': value}, 'geometry': shape}
        for shape, value in shapes(image, mask=None, transform=src.transform)
    )

# The generator is lazy, so the polygons are actually built here; the band
# array and transform it needs were already captured above.
geoms = list(results)
ghs_poly = gpd.GeoDataFrame.from_features(geoms)
ghs_poly.crs = ghs.crs.data  # the polygons share the source raster's CRS

# 4. Filter on population and size, buffer and dissolve

In [None]:
# Thresholds for filtering the vectorized blocks, and the buffer distance used
# to merge neighbouring blocks.
# NOTE(review): areas and distances are in CRS units -- presumably metres,
# given the 54009 code in the raster filename; confirm.
max_block_size = 100000  # blocks with an area at or above this are dropped
min_block_pop = 50       # blocks with a raster value at or below this are dropped
buffer_amount = 150      # outward buffer applied to the remaining blocks

# Compute each block's area first: the size filter below needs this column,
# and nothing earlier in the workflow creates it.
ghs_poly['area_m2'] = ghs_poly.geometry.area

# filter to ignore oversized blocks and blocks with basically no people
ghs_poly = ghs_poly[ghs_poly['area_m2'] < max_block_size]  # remove blocks that are too big (basically artifacts)
ghs_poly = ghs_poly[ghs_poly['raster_val'] > min_block_pop]  # remove blocks with min_block_pop or fewer people

# buffer outwards so that nearby blocks will overlap
ghs_poly['geometry'] = ghs_poly.geometry.buffer(buffer_amount)

# and dissolve the thousands of blocks into a single geometry: every row gets
# the same key, so dissolve() merges them all (per-block attributes are lost)
ghs_poly['same'] = 1
ghs_poly = ghs_poly.dissolve(by='same')

# 7. To singleparts

https://gis.stackexchange.com/a/271735

In [None]:
# To get our attributes, we convert the dissolved polygon into singleparts
# This means each contiguous bubble becomes its own polygon and can store its own attributes

# NOTE(review): the steps below assume explode() returns an unnamed series of
# geometries with a two-level index, so that reset_index() yields the columns
# 'level_0', 'level_1' and 0 -- confirm against the installed geopandas
# version, as the return type of explode() is version-dependent.
ghs_poly = ghs_poly.explode()
ghs_poly = ghs_poly.reset_index()
# Copy the exploded geometries (column 0) into a properly named column.
ghs_poly['geometry'] = ghs_poly[0]
ghs_poly = ghs_poly.drop(columns=['level_0', 'level_1', 0]) # shapefile doesn't like integer column name
# Re-wrap as a GeoDataFrame and restore the CRS from the source raster.
ghs_poly = gpd.GeoDataFrame(ghs_poly)
ghs_poly.crs = ghs.crs.data

In [None]:
# Save the singlepart clusters to disk; the zonal statistics step reads its
# zones from this same shapefile path.
ghs_poly.to_file('GHS_vec_fil_buf_diss_exp.shp')

# 8. Raster zonal statistics

https://automating-gis-processes.github.io/CSC18/lessons/L6/zonal-statistics.html

In [None]:
# The clusters lost their population values along the way, so recover them
# from the original (global) raster: for every polygon in the saved shapefile,
# sum the raster values that fall underneath it.
pop_sums = [
    zone['sum']
    for zone in zonal_stats('GHS_vec_fil_buf_diss_exp.shp', ghs_in, stats='sum')
]
ghs_poly['pop_sum'] = pop_sums

# Store each cluster's area alongside its population.
ghs_poly["area_m2"] = ghs_poly['geometry'].area

# 10. Buffer and join with grid to see if 'connected'

In [None]:
# Load the MV grid lines and reproject them into the clusters' CRS so the two
# layers can be compared spatially.
grid = gpd.read_file(grid_in).to_crs(crs=ghs_poly.crs)

In [None]:
# Maximum distance at which a cluster still counts as grid connected.
# NOTE(review): expressed in CRS units, presumably metres -- confirm.
grid_distance_for_connected = 1000

# Keep the row index as an ordinary column so the buffered copy can be matched
# back to the original clusters later on.
ghs_poly['index'] = ghs_poly.index

# Work on a copy whose geometries are grown by the connection distance; any
# overlap with a grid line then means the line is within that distance.
ghs_poly_bigbuffer = ghs_poly.copy()
ghs_poly_bigbuffer['geometry'] = ghs_poly.geometry.buffer(grid_distance_for_connected)

In [None]:
# Spatially join the big-buffered clusters to the grid lines. Being a left
# join, clusters that intersect no grid line keep a row with no index_right.
ghs_poly_joined = gpd.sjoin(ghs_poly_bigbuffer, grid, how="left", op='intersects')

# Tag those unmatched rows with a sentinel, keep only them (and only their
# 'index' column), and flag them as not grid connected.
ghs_poly_joined = ghs_poly_joined.fillna({'index_right': -999})
ghs_poly_joined = ghs_poly_joined.loc[ghs_poly_joined['index_right'] == -999, ['index']]
ghs_poly_joined['connected'] = 0

# Attach the flag to the original (un-bigbuffered) clusters. Clusters absent
# from the unmatched set come back without a value, i.e. not 0, and are
# therefore marked as connected.
ghs_poly_joined = ghs_poly.merge(ghs_poly_joined, how='left', on='index')
ghs_poly_joined.loc[ghs_poly_joined['connected'] != 0, 'connected'] = 1

# Store the flag, population and area as integers.
for column in ('connected', 'pop_sum', 'area_m2'):
    ghs_poly_joined[column] = ghs_poly_joined[column].astype(int)

# The helper column has served its purpose.
ghs_poly_joined = ghs_poly_joined.drop(columns=['index'])

In [None]:
# Write the final clusters (geometry plus pop_sum, area_m2 and the connected
# flag) to a shapefile.
ghs_poly_joined.to_file('GHS_clusters_joined.shp')