## Bootstrapping

We're going to create random circles of fixed radius (1000 inside each sample)
and count the number of buildings inside them, both real and predicted.

For the circles to be fully contained inside the sample, their centers must lie
inside a concentric square of side length `x - 2r`, where `x` is the side length
of the sample and `r` is the circle radius.

Then we'll compare the resulting distributions of real and predicted counts.

In [1]:
import os

import numpy as np
import pandas as pd
import geopandas as gpd

from multiprocessing import Pool

from shapely.wkt import loads
from shapely.geometry import Point

In [2]:
def gen_circles(sample, n, radius, crs, rng=None):
    """
    Generate a GeoDataFrame of `n` random circles of radius `radius`
    that lie completely inside the bounding box of `sample`.

    Circle centers are drawn uniformly from the bounding box of
    ``sample['geometry']`` shrunk by `radius` on every side, so each
    circle stays within that bounding box. The circles are only
    guaranteed to be inside the sample itself when the sample is an
    axis-aligned rectangle.

    Parameters
    ----------
    sample : mapping-like
        Must provide ``sample['geometry'].bounds`` as
        ``(min_x, min_y, max_x, max_y)``.
    n : int
        Number of circles to generate.
    radius : float
        Circle radius, in the same units as the geometry coordinates.
    crs : object
        Passed through as the `crs` of the returned GeoDataFrame.
    rng : object, optional
        Random source exposing ``uniform(low, high, size=...)`` (for
        example ``np.random.RandomState`` or ``np.random.Generator``).
        Defaults to the global ``np.random`` state, so existing callers
        behave as before.

    Returns
    -------
    GeoDataFrame
        One row per circle with a `geometry` column and an `index`
        column (0..n-1) created by `reset_index`.

    Raises
    ------
    ValueError
        If the bounding box is narrower than ``2 * radius`` along
        either axis, in which case no circle can fit.
    """
    min_x, min_y, max_x, max_y = (
        np.array(sample['geometry'].bounds) +
        np.array([radius, radius, -radius, -radius])
    )

    # An inverted interval would make the uniform draw meaningless and
    # the circles would stick out of the sample, so fail loudly instead.
    if min_x > max_x or min_y > max_y:
        raise ValueError(
            f'sample bounding box is smaller than 2*radius ({2 * radius}) '
            'on at least one side; no circle of that radius fits inside it'
        )

    if rng is None:
        rng = np.random

    rand_x = rng.uniform(min_x, max_x, size=n)
    rand_y = rng.uniform(min_y, max_y, size=n)

    circles = [Point(x, y).buffer(radius) for x, y in zip(rand_x, rand_y)]

    # reset_index exposes the positional id as an `index` column, which
    # identifies each circle after it has been spatially joined.
    return gpd.GeoDataFrame({'geometry': circles}, crs=crs).reset_index()

def intersect_count(circles_df, points_df, groupby_cols=['index', 'type_short']):
    """
    Count the points that fall inside each polygon, split by category.

    We'll use this to count the number of Ground Truth points or
    building centroids that fall inside each bootstrap circle.

    Parameters
    ----------
    circles_df : GeoDataFrame
        Polygons to count in. Expected to carry the identifier column
        named first in `groupby_cols` (`index` by default).
    points_df : GeoDataFrame
        Point geometries plus the category column named last in
        `groupby_cols`.
    groupby_cols : list of str
        Columns of the joined table to group by; the last one is
        unstacked into the output columns. The default list is never
        mutated here.

    Returns
    -------
    DataFrame
        Counts with one row per polygon identifier and one column per
        category. Polygons that contain no points are included as
        all-zero rows whenever the row key is a column of `circles_df`.
    """
    # NOTE(review): newer geopandas releases rename `op` to `predicate`
    # -- confirm against the installed version before changing it.
    sjoin = gpd.sjoin(circles_df, points_df, op='contains')

    counts = sjoin.groupby(groupby_cols).size().unstack(fill_value=0)

    # The spatial join only yields (polygon, point) pairs, so a polygon
    # without any point has no row at all. Re-add those polygons with a
    # count of 0; otherwise empty circles silently drop out of the
    # bootstrap distribution.
    polygon_key = counts.index.name
    if polygon_key is not None and polygon_key in circles_df.columns:
        all_polygons = pd.Index(
            circles_df[polygon_key].drop_duplicates().sort_values(),
            name=polygon_key,
        )
        counts = counts.reindex(all_polygons, fill_value=0)

    return counts

def process_sample(idx, row):
    """
    Wrapper for multiprocessing: bootstrap a single sample row.

    Builds an identifier from the row's `FID` and lower-cased `Class`,
    generates circles for the row and counts the ground-truth points
    inside them.

    `idx` is accepted but not used. The function reads the module-level
    names `n`, `radius`, `samples` and `gt`, which must be defined
    before it is called.

    Returns a `(fid, counts)` tuple.
    """
    fid_value = row["FID"]
    stratum_name = row["Class"].lower()
    fid = f'FID_{fid_value}_{stratum_name}'
    print(f'Working on {fid}')

    circles = gen_circles(row, n, radius, samples.crs)

    # Only the geometry and the category column are needed for counting.
    gt_points = gt.loc[:, ['geometry', 'type_short']]
    counts = intersect_count(circles, gt_points)

    return fid, counts

Load data (Ground Truth, digitized and segmentized) and convert to local UTM

In [3]:
# Coordinate reference systems: geographic lat/lon (EPSG:4326) for the raw
# sample geometries and UTM zone 18 (EPSG:32618) as the working projection.
latlon = {'init': 'epsg:4326'}
utm_18 = {'init': 'epsg:32618'}

strata = ['residential', 'rural', 'shanty', 'urbanirreg', 'urbanreg']

class_path = '../data/results/classified/'
gt_path = '../data/ground_truth/within_samples/'

# Samples: parse the WKT column into geometries, then reproject to UTM.
samples = pd.read_csv("../data/samples/samples_reference.csv")
samples['geometry'] = samples['geometry_wkt'].apply(loads)
samples = gpd.GeoDataFrame(samples, crs=latlon).to_crs(crs=utm_18)

# Per-stratum layers, keyed by stratum name.
# Digitized and segmentized results are read as stored on disk.
digis = {
    stratum: gpd.read_file(os.path.join(class_path, f'{stratum}_digitized.gpkg'))
    for stratum in strata
}
segms = {
    stratum: gpd.read_file(os.path.join(class_path, f'{stratum}_segmentized.gpkg'))
    for stratum in strata
}
# Ground truth is the only per-stratum layer reprojected to UTM here.
gts = {
    stratum: gpd.read_file(os.path.join(gt_path, f'gt_within_{stratum}')).to_crs(utm_18)
    for stratum in strata
}

In [4]:
bootstrap_path = '../data/results/bootstrap'
circles_path = os.path.join(bootstrap_path, 'circles')
os.makedirs(circles_path, exist_ok=True)

# Bootstrap parameters, named so they can be tuned in one place.
# NOTE(review): the introduction mentions 1000 circles per sample while
# this cell has always drawn 200 -- confirm which one is intended.
N_CIRCLES = 200      # random circles drawn inside each sample
CIRCLE_RADIUS = 40   # circle radius in units of the working CRS (utm_18)
SEED = 42            # fixes the circle draws so a re-run gives the same output

# gen_circles draws from the global NumPy RNG; seed it for reproducibility.
np.random.seed(SEED)

bootstrap_res = {}
for stratum in strata:
    samples_stratum = samples.loc[samples['Class'].str.lower() == stratum]
    digi = digis[stratum].copy()
    segm = segms[stratum].copy()
    gt = gts[stratum].copy()

    # Predicted buildings are counted by centroid, so each building
    # belongs to at most one location.
    # NOTE(review): digi/segm are used in the CRS they were stored in
    # (only the ground truth is reprojected when loaded) -- confirm they
    # are already in utm_18, the CRS of the circles.
    digi.geometry = digi.geometry.centroid
    segm.geometry = segm.geometry.centroid

    for idx, sample in samples_stratum.iterrows():
        circles = gen_circles(sample, N_CIRCLES, CIRCLE_RADIUS, utm_18)

        # Suffix every predicted column explicitly. Relying on the
        # lsuffix/rsuffix of join() only renames columns present in both
        # tables, leaving classes found in just one source unlabelled.
        digi_res = intersect_count(
            circles, digi, groupby_cols=['index', 'class_svm_puk']
        ).add_suffix('_digi')
        segm_res = intersect_count(
            circles, segm, groupby_cols=['index', 'class_svm_puk']
        ).add_suffix('_segm')
        gt_res = intersect_count(circles, gt)

        # Outer joins plus a reindex over all circle ids keep every
        # circle in the table; a circle absent from a source means that
        # source has zero points in it, so missing counts become 0.
        sample_res = (
            digi_res
            .join(segm_res, how='outer')
            .join(gt_res, how='outer')
            .reindex(circles['index'])
            .fillna(0)
            .astype(int)
        )

        bootstrap_res[f'{stratum}_{idx}'] = sample_res
        sample_res.to_csv(os.path.join(bootstrap_path, f'bootstrap_{stratum}_{idx}.csv'))

        circles.to_file(os.path.join(circles_path, f'bootstrap_circles_{stratum}_{idx}.gpkg'), driver='GPKG')