In [8]:
import numpy as np
import pandas as pd
import random
import csv
import os
import matplotlib.pyplot as plt
from collections import defaultdict

import shapefile
import json
from json import dumps
import fiona
from pyproj import Proj#, transform
import pyproj

import shapely
from shapely.geometry import Polygon
from shapely.geometry import shape
from functools import partial
from shapely.ops import transform
from shapely.strtree import STRtree

In [9]:
from tqdm import tqdm

Originally shp2geo.py

# Helper functions

In [3]:
def read_csv(shape_file, readCSV):
    """Read the coordinate of the bounding boxes and constructs and R-Tree data structure

    Args:
      shape_file : polygons
      readCSV: pandas dataframe containing bounding boxes

    Returns:
    dict, r-tree: dict of bounding boxes for each image id and r-tree
    """
    shapes = fiona.open(shape_file)
    if len(shapes.crs) != 0:
        destination = Proj(shapes.crs)
    else:
        destination = Proj('+init=EPSG:4326')
    original = Proj('+init=EPSG:4326')

    grid = dict()
    keys = ['max_lat', 'max_lon', 'min_lat', 'min_lon']
    poly_list = []
    
    for index, row in readCSV.iterrows():
        if index not in grid:
            grid[index] = dict()
        grid[index]['image_id'] = row['image_id']
        grid[index]['max_lat'] = float(row['max_lat'])
        grid[index]['max_lon'] = float(row['max_lon'])
        grid[index]['min_lat'] = float(row['min_lat'])
        grid[index]['min_lon'] = float(row['min_lon'])

        grid[index]['poly'] = shapely.geometry.box(
            grid[index]['min_lon'], grid[index]['min_lat'], grid[index]['max_lon'], grid[index]['max_lat'])
        
        # project boxes from WSG 84 to parcel projection
        project = partial(pyproj.transform, original, destination)
        grid[index]['poly'] = transform(project, grid[index]['poly'])

        # populating r-tree
        poly_obj = grid[index]['poly']
        poly_obj.name = grid[index]['image_id'] # useful for retrival in search phase
        poly_list.append(poly_obj)
        
    tree = STRtree(poly_list) # constructing R-Tree
    return grid, tree

def listit(t):
    # convert to appropriate list type 
    return list(map(listit, t)) if isinstance(t, (list, tuple)) else t


def check_polygon_in_bounds(poly, tree):
    """
    find image corrspinding to the existance of a field in the list of 
    image bounding boxes

    Args:
      poly (polygon): field
      tree (r-tree): r-tree of images

    Returns:
      List: List of intersecting images with a field
    """
    results = tree.query(poly)
    return results


def field_imageId_list(polys, count_parcels):
    """
    extract name of the intersecting polygons

    Args:
      polys (polygons): intersecting fields
      count_parcels (dict): the sanity check summary of # of fields in image ids  
    Returns:
      list: list of the image ids
    """
    list_image_ids = []
    for element in polys:
        list_image_ids.append(element.name)
        count_parcels[element.name] += 1
    return list_image_ids

# Find intersecting polygons

### France

In [4]:
def dump_shp_to_json(shape_file, grid, tree, output_json='../data/planet/france/sherrie10k/test_json'):
    """
    find intersecting polygons in the list of available images and save the GeoJSON

    Args:
      shape_file (polygons): fields
      grid (dict): image bounding boxes 
      tree (r-tre): r-tree of images
      output_json (str): output path of json file
    """
    # coordinate transformation
    reader = shapefile.Reader(shape_file)
    shapes = fiona.open(shape_file)
    if len(shapes.crs) != 0:
        original = Proj(shapes.crs)
    else:
        original = Proj('+init=EPSG:4326')
#     print(fiona.open(shape_file).crs)

    # list of properties of features
    fields = reader.fields[1:]
    field_names = [field[0] for field in fields]
    field_names.append('image_id')

    buffer = []
    # sanity check counters
    count_parcels = defaultdict(int)
#     index = 0
    counter_method1 = 0
    counter_method2 = 0
    num_matched = 0
    failed_projection = 0
  
    # loop through the polygon fields
    for sr in tqdm(reader.iterShapeRecords(), total=9517878):
#         if index % 100000 == 0:
#             print('Parsed ', index)
#         index += 1
        geom = sr.shape.__geo_interface__
        shp_geom = shape(geom)
        intersect = check_polygon_in_bounds(shp_geom, tree)
#         print(intersect)
        if len(intersect) != 0:
            num_matched += len(intersect)
#             print("Matched:", str(index))
#             print("Number matched:", num_matched)
      
            id_list = field_imageId_list(intersect, count_parcels)
            sr.record.append(id_list)
            atr = dict(zip(field_names, sr.record))
            
            geom['coordinates'] = listit(geom['coordinates'])
            try: # protection at polygons that fail at projection
                if len(geom['coordinates']) == 1: # for single polygon
                    counter_method1 += 1
                    x, y = zip(*geom['coordinates'][0])
                    lat, long = original(x, y, inverse=True) # coordinate transformation
                    geom['coordinates'] = [listit(list(zip(lat, long)))]
                else: # for multipolygons
                    counter_method2 += 1
                    for index_coord in range(0, len(geom['coordinates'])):
                        for counter in range(0,len(geom['coordinates'][index_coord])):
                            x, y = geom['coordinates'][index_coord][counter]
                            lat, long = original(x, y, inverse=True) # coordinate transformation
                            geom['coordinates'][index_coord][counter] = [lat, long] #(long, lat)
            except:
                failed_projection =+ 1
#                 print(geom['coordinates'])
            buffer.append(dict(type="Feature", geometry=geom, properties=atr))
            
#             if num_matched > 10:
#                 break
      
      
    # write the GeoJSON file
    output_json_interval = output_json + str(num_matched) + '.json'
    print("saving json")
    with open(output_json_interval, 'w') as geojson:
        geojson.write(dumps({"type": "FeatureCollection", "features": buffer}, indent=2) + "\n")
        geojson.close()
        print('saved', output_json_interval)
    
    # print summary
    print('method one count:', counter_method1)
    print('method two count:', counter_method2)
    print("Number matched:", num_matched)
    print('failed count', failed_projection)

In [7]:
test = fiona.open(shape_file)

In [18]:
list(test.schema['properties'].keys())

['ID_PARCEL',
 'SURF_PARC',
 'CODE_CULTU',
 'CODE_GROUP',
 'CULTURE_D1',
 'CULTURE_D2']

In [12]:
for t in test:
    print(t['geometry'])
    break

{'type': 'MultiPolygon', 'coordinates': [[[(701200.320100002, 6883238.830900002), (700676.8660000041, 6882785.608600002), (700657.6099999994, 6882831.337000001), (700652.1803000048, 6882844.3105), (700646.9530000016, 6882853.198800001), (700627.3748000041, 6882878.875500001), (700617.5866999999, 6882893.216300003), (700613.4576999992, 6882901.178100001), (700585.1190000027, 6882964.341000002), (700568.450000003, 6883002.176000003), (700553.0282000005, 6883039.002), (700539.0053000003, 6883078.2927), (700540.7251000032, 6883084.113500003), (700639.8117000014, 6883181.4804), (700742.3322999999, 6883282.026700001), (701020.0553000048, 6883553.359200001), (701148.2096000016, 6883518.824800003), (701147.2836000025, 6883498.9810000025), (701143.9200000018, 6883446.650000002), (701143.1825000048, 6883398.5715), (701146.2990000024, 6883375.501900002), (701154.2087000012, 6883351.177100003), (701166.5234000012, 6883320.895800002), (701164.6138000041, 6883310.729600001), (701174.5357000008, 6883

In [None]:
base_dir = '../data/planet/france/sherrie10k/'
# csv_file = os.path.join(base_dir, 'bbox10k.csv')
# csv_file = os.path.join(base_dir, 'bbox10k_1250px.csv')
csv_file = os.path.join(base_dir, 'bbox10k_2500px.csv')

shape_file = '../data/parcels/france/RPG_2-0__SHP_LAMB93_FR-2018_2018-01-15/RPG/1_DONNEES_LIVRAISON_2018/RPG_2-0_SHP_LAMB93_FR-2018/PARCELLES_GRAPHIQUES.shp'
# TODO: update shape file to 2019

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

for start in np.arange(0, 1500, 250): # np.arange(1500, 10000, 250):
    end = start + 250
    images_df = pd.read_csv(csv_file).iloc[start:end]
    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)
    
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/france/sherrie10k/json_polys/bbox10k_2500px_{}_'.format(int(start/250)))

### 2019 Geopackage

In [7]:
test = fiona.open(shape_file)

In [27]:
test_proj = Proj(test.crs)

In [54]:
for t in test:
    if len(t['geometry']['coordinates']) > 1:
        break

In [55]:
geom = t['geometry']

In [48]:
geom['coordinates'][index_coord][counter]

[(871071.9519999996, 6895244.015000001),
 (871071.9827999994, 6895244.0273),
 (871072.9772000015, 6895241.6697),
 (871072.9785000011, 6895241.669200003),
 (871548.8317000046, 6895434.682700001),
 (871725.5075000003, 6895505.657300003),
 (871892.441399999, 6895569.984700002),
 (871904.3320000023, 6895558.030000001),
 (871910.7366999984, 6895547.213300001),
 (871917.0866999999, 6895528.4278),
 (871918.2650000006, 6895513.297000002),
 (871916.9420000017, 6895504.0370000005),
 (871907.4294000044, 6895482.125600003),
 (871902.5249000043, 6895477.249300003),
 (871884.4105999991, 6895459.239100002),
 (871802.3350000009, 6895392.929000001),
 (871766.6579999998, 6895360.632000003),
 (871729.6621000022, 6895339.944900002),
 (871421.6810000017, 6895157.777000003),
 (871371.2897000015, 6895130.869100001),
 (871371.2390000001, 6895130.842100002),
 (871370.7206000015, 6895131.712300003),
 (871156.5509000048, 6895047.279900003),
 (871156.5323000029, 6895047.323700003),
 (871155.5854000002, 6895046.99

In [61]:
geom['coordinates'][1]

[[(871071.9827999994, 6895244.0273),
  (871857.5275000036, 6895558.577600002),
  (871890.6110000014, 6895571.825000003),
  (871892.2334000021, 6895570.193800002),
  (871889.9606000036, 6895571.5645),
  (871071.9827999994, 6895244.0273)]]

In [62]:
for index_coord in range(0, len(geom['coordinates'])):
    for counter in range(0,len(geom['coordinates'][index_coord][0])):
        x, y = geom['coordinates'][index_coord][0][counter]
        lat, long = test_proj(x, y, inverse=True) # coordinate transformation

In [24]:
test_x, test_y = zip(*geom['coordinates'][0][0])

In [4]:
def dump_shp_to_json(shape_file, grid, tree, output_json='../data/planet/france/sherrie10k/test_json'):
    """
    find intersecting polygons in the list of available images and save the GeoJSON

    Args:
      shape_file (polygons): fields
      grid (dict): image bounding boxes 
      tree (r-tre): r-tree of images
      output_json (str): output path of json file
    """
    # coordinate transformation
    shapes = fiona.open(shape_file)
    if len(shapes.crs) != 0:
        original = Proj(shapes.crs)
    else:
        original = Proj('+init=EPSG:4326')

    # list of properties of features
#     field_names = shapes.schema['properties'].keys()
#     field_names.append('image_id')
    
    # sanity check counters
    buffer = []
    count_parcels = defaultdict(int)
    index = 0
    counter_method1 = 0
    counter_method2 = 0
    num_matched = 0
    failed_projection = 0
  
    # loop through the polygon fields
    for sr in tqdm(shapes, total=9517878):
#         if index % 100000 == 0:
#             print('Parsed ', index)
#         index += 1
        geom = sr['geometry']
        shp_geom = shape(geom)
        intersect = check_polygon_in_bounds(shp_geom, tree)
#         print(intersect)
        if len(intersect) != 0:
            num_matched += len(intersect)
#             print("Matched:", str(index))
#             print("Number matched:", num_matched)
      
            id_list = field_imageId_list(intersect, count_parcels)
            atr = dict(sr['properties'])
            atr['image_id'] = id_list
#             sr.record.append(id_list)
#             atr = dict(zip(field_names, sr.record))
            
            geom['coordinates'] = listit(geom['coordinates'])
            try: # protection at polygons that fail at projection
                if len(geom['coordinates']) == 1: # for single polygon
                    counter_method1 += 1
                    x, y = zip(*geom['coordinates'][0][0])
                    lat, long = original(x, y, inverse=True) # coordinate transformation
                    geom['coordinates'] = [listit(list(zip(lat, long)))]
                else: # for multipolygons
                    counter_method2 += 1
                    for index_coord in range(0, len(geom['coordinates'])):
                        for counter in range(0,len(geom['coordinates'][index_coord][0])):
                            x, y = geom['coordinates'][index_coord][0][counter]
                            lat, long = original(x, y, inverse=True) # coordinate transformation
                            geom['coordinates'][index_coord][counter] = [lat, long] #(long, lat)
            except:
                failed_projection += 1
#                 print(geom['coordinates'])
            buffer.append(dict(type="Feature", geometry=geom, properties=atr))
            
            # for debugging
#             if num_matched > 10:
#                 break
      
      
    # write the GeoJSON file
    output_json_interval = output_json + str(num_matched) + '.json'
    print("saving json")
    with open(output_json_interval, 'w') as geojson:
        geojson.write(dumps({"type": "FeatureCollection", "features": buffer}, indent=2) + "\n")
        geojson.close()
        print('saved', output_json_interval)
    
    # print summary
    print('method one count:', counter_method1)
    print('method two count:', counter_method2)
    print("Number matched:", num_matched)
    print('failed count', failed_projection)

In [None]:
base_dir = '../data/planet/france/sherrie10k/'
csv_file = os.path.join(base_dir, 'bbox10k.csv')
# csv_file = os.path.join(base_dir, 'bbox10k_1250px.csv')
# csv_file = os.path.join(base_dir, 'bbox10k_2500px.csv')

# shape_file = '../data/parcels/france/RPG_2-0__SHP_LAMB93_FR-2018_2018-01-15/RPG/1_DONNEES_LIVRAISON_2018/RPG_2-0_SHP_LAMB93_FR-2018/PARCELLES_GRAPHIQUES.shp'
# TODO: update shape file to 2019
shape_file = '../data/parcels/france/RPG_2-0_GPKG_LAMB93_FR-2019/RPG/1_DONNEES_LIVRAISON_2019/RPG_2-0_GPKG_LAMB93_FR-2019/PARCELLES_GRAPHIQUES.gpkg'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

# 300px and 1250px images
images_per_file = 1000
for start in np.arange(0, 10000, images_per_file):
    end = start + images_per_file
    images_df = pd.read_csv(csv_file).iloc[start:end]
    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)

    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_{}_'.format(int(start/images_per_file)))

# 2500px images
# images_per_file = 250 
# for start in np.arange(250, 10000, images_per_file):
#     end = start + images_per_file
#     images_df = pd.read_csv(csv_file).iloc[start:end]
#     images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
#     grid, tree = read_csv(shape_file, images_df)
    
#     dump_shp_to_json(shape_file, grid, tree, 
#                      '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_{}_'.format(int(start/images_per_file)))

9604463it [16:18, 9816.81it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_0_23365.json
method one count: 23327
method two count: 0
Number matched: 23365
failed count 0


9604463it [16:31, 9690.12it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_1_24138.json
method one count: 24127
method two count: 0
Number matched: 24138
failed count 0


9604463it [16:26, 9734.33it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2_24287.json
method one count: 24264
method two count: 0
Number matched: 24287
failed count 0


9604463it [16:24, 9754.87it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_3_23178.json
method one count: 23169
method two count: 0
Number matched: 23178
failed count 0


  5%|▌         | 477805/9517878 [00:47<18:24, 8185.56it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 63%|██████▎   | 5983555/9517878 [10:07<05:27, 10788.11it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

9604463it [16:20, 9796.65it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_4_23120.json
method one count: 23072
method two count: 0
Number matched: 23120
failed count 0


 19%|█▉        | 1785552/9517878 [03:01<17:05, 7537.50it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 50%|████▉     | 4729537/9517878 [07:52<07:46, 10274.39it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 82%|████████▏ | 7769850/9517878 [13:01<02:47, 10407.57it/s]

In [None]:
base_dir = '../data/planet/france/sherrie10k/'
# csv_file = os.path.join(base_dir, 'bbox10k.csv')
csv_file = os.path.join(base_dir, 'bbox10k_1250px.csv')
# csv_file = os.path.join(base_dir, 'bbox10k_2500px.csv')

# shape_file = '../data/parcels/france/RPG_2-0__SHP_LAMB93_FR-2018_2018-01-15/RPG/1_DONNEES_LIVRAISON_2018/RPG_2-0_SHP_LAMB93_FR-2018/PARCELLES_GRAPHIQUES.shp'
# TODO: update shape file to 2019
shape_file = '../data/parcels/france/RPG_2-0_GPKG_LAMB93_FR-2019/RPG/1_DONNEES_LIVRAISON_2019/RPG_2-0_GPKG_LAMB93_FR-2019/PARCELLES_GRAPHIQUES.gpkg'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

# 300px and 1250px images
images_per_file = 1000
for start in np.arange(0, 10000, images_per_file):
    end = start + images_per_file
    images_df = pd.read_csv(csv_file).iloc[start:end]
    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)

#     dump_shp_to_json(shape_file, grid, tree, 
#                      '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_{}_'.format(int(start/images_per_file)))
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_1250px_{}_'.format(int(start/images_per_file)))

# 2500px images
# images_per_file = 250 
# for start in np.arange(250, 10000, images_per_file):
#     end = start + images_per_file
#     images_df = pd.read_csv(csv_file).iloc[start:end]
#     images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
#     grid, tree = read_csv(shape_file, images_df)
    
#     dump_shp_to_json(shape_file, grid, tree, 
#                      '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_{}_'.format(int(start/images_per_file)))

 71%|███████   | 6741942/9517878 [12:46<05:03, 9149.60it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

9604463it [18:27, 8672.22it/s]                              


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_1250px_2_292629.json
method one count: 286678
method two count: 0
Number matched: 292629
failed count 0


 25%|██▍       | 2349602/9517878 [04:19<13:51, 8617.18it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 82%|████████▏ | 7767145/9517878 [14:19<03:23, 8599.82it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 42%|████▏     | 4006520/9517878 [07:28<09:33, 9603.30it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config va

In [7]:
base_dir = '../data/planet/france/sherrie10k/'
# csv_file = os.path.join(base_dir, 'bbox10k.csv')
# csv_file = os.path.join(base_dir, 'bbox10k_1250px.csv')
csv_file = os.path.join(base_dir, 'bbox10k_2500px.csv')

# shape_file = '../data/parcels/france/RPG_2-0__SHP_LAMB93_FR-2018_2018-01-15/RPG/1_DONNEES_LIVRAISON_2018/RPG_2-0_SHP_LAMB93_FR-2018/PARCELLES_GRAPHIQUES.shp'
# TODO: update shape file to 2019
shape_file = '../data/parcels/france/RPG_2-0_GPKG_LAMB93_FR-2019/RPG/1_DONNEES_LIVRAISON_2019/RPG_2-0_GPKG_LAMB93_FR-2019/PARCELLES_GRAPHIQUES.gpkg'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

# 300px and 1250px images
# images_per_file = 1000
# for start in np.arange(0, 10000, images_per_file):
#     end = start + images_per_file
#     images_df = pd.read_csv(csv_file).iloc[start:end]
#     images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
#     grid, tree = read_csv(shape_file, images_df)

#     dump_shp_to_json(shape_file, grid, tree, 
#                      '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_{}_'.format(int(start/images_per_file)))
#     dump_shp_to_json(shape_file, grid, tree, 
#                      '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_1250px_{}_'.format(int(start/images_per_file)))

# 2500px images
images_per_file = 250 
for start in np.arange(0, 10000, images_per_file):
    end = start + images_per_file
    images_df = pd.read_csv(csv_file).iloc[start:end]
    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)
    
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_{}_'.format(int(start/images_per_file)))

9604463it [12:46, 12526.03it/s]                             


saving json


  0%|          | 2172/9517878 [00:00<07:18, 21711.63it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_6_288684.json
method one count: 284965
method two count: 0
Number matched: 288684
failed count 0


 31%|███       | 2945285/9517878 [03:38<09:41, 11301.72it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 68%|██████▊   | 6484918/9517878 [08:00<03:07, 16198.39it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

9604463it [10:33, 15169.94it/s]                             


saving json


  0%|          | 1492/9517878 [00:00<10:37, 14917.92it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_9_272310.json
method one count: 267356
method two count: 0
Number matched: 272310
failed count 0


 33%|███▎      | 3096410/9517878 [03:10<05:51, 18273.33it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  0%|          | 0/9517878 [00:00<?, ?it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_10_271939.json
method one count: 269159
method two count: 1
Number matched: 271939
failed count 1


9604463it [09:44, 16430.59it/s]                             


saving json


  0%|          | 2348/9517878 [00:00<06:45, 23479.74it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_11_261721.json
method one count: 255460
method two count: 0
Number matched: 261721
failed count 0


9604463it [10:30, 15223.88it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_12_269393.json
method one count: 266073
method two count: 0
Number matched: 269393
failed count 0


9604463it [09:49, 16299.33it/s]                             


saving json


  0%|          | 2321/9517878 [00:00<06:50, 23203.66it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_13_253779.json
method one count: 249295
method two count: 0
Number matched: 253779
failed count 0


9604463it [09:51, 16225.55it/s]                             


saving json


  0%|          | 2276/9517878 [00:00<06:58, 22758.45it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_14_292750.json
method one count: 286249
method two count: 1
Number matched: 292750
failed count 1


9604463it [10:29, 15249.54it/s]                             


saving json


  0%|          | 2125/9517878 [00:00<07:27, 21249.11it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_15_259118.json
method one count: 257527
method two count: 1
Number matched: 259118
failed count 1


9604463it [10:00, 15999.45it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_16_295360.json
method one count: 291977
method two count: 1
Number matched: 295360
failed count 1


9604463it [09:53, 16174.41it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_17_255654.json
method one count: 251875
method two count: 0
Number matched: 255654
failed count 0


9604463it [10:23, 15408.76it/s]                             


saving json


  0%|          | 0/9517878 [00:00<?, ?it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_18_257334.json
method one count: 256236
method two count: 1
Number matched: 257334
failed count 1


9604463it [09:52, 16222.31it/s]                             


saving json


  0%|          | 2303/9517878 [00:00<06:53, 23020.80it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_19_276848.json
method one count: 272195
method two count: 2
Number matched: 276848
failed count 2


9604463it [09:50, 16269.99it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_20_262002.json
method one count: 256788
method two count: 0
Number matched: 262002
failed count 0


9604463it [10:28, 15270.75it/s]                             


saving json


  0%|          | 2061/9517878 [00:00<07:41, 20607.22it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_21_272577.json
method one count: 269022
method two count: 0
Number matched: 272577
failed count 0


9604463it [09:56, 16096.87it/s]                             


saving json


  0%|          | 2158/9517878 [00:00<07:21, 21563.11it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_22_284713.json
method one count: 282092
method two count: 2
Number matched: 284713
failed count 2


9604463it [09:52, 16218.90it/s]                             


saving json


  0%|          | 2286/9517878 [00:00<06:56, 22857.79it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_23_275841.json
method one count: 271285
method two count: 1
Number matched: 275841
failed count 1


9604463it [10:33, 15160.30it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_24_274699.json
method one count: 271857
method two count: 1
Number matched: 274699
failed count 1


9604463it [09:51, 16245.19it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_25_268769.json
method one count: 262144
method two count: 0
Number matched: 268769
failed count 0


9604463it [09:47, 16358.23it/s]                             


saving json


  0%|          | 0/9517878 [00:00<?, ?it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_26_278306.json
method one count: 274252
method two count: 0
Number matched: 278306
failed count 0


9604463it [10:33, 15149.22it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_27_270542.json
method one count: 268005
method two count: 0
Number matched: 270542
failed count 0


9604463it [09:53, 16189.66it/s]                             


saving json


  0%|          | 0/9517878 [00:00<?, ?it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_28_267738.json
method one count: 261177
method two count: 0
Number matched: 267738
failed count 0


9604463it [09:52, 16218.67it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_29_285515.json
method one count: 284850
method two count: 0
Number matched: 285515
failed count 0


9604463it [10:33, 15170.82it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_30_264797.json
method one count: 257894
method two count: 0
Number matched: 264797
failed count 0


9604463it [10:07, 15810.22it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_31_272505.json
method one count: 271017
method two count: 0
Number matched: 272505
failed count 0


9604463it [09:58, 16039.08it/s]                             


saving json


  0%|          | 1561/9517878 [00:00<10:09, 15603.69it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_32_279244.json
method one count: 277374
method two count: 0
Number matched: 279244
failed count 0


9604463it [10:28, 15272.13it/s]                             


saving json


  0%|          | 2346/9517878 [00:00<06:45, 23441.30it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_33_264697.json
method one count: 260328
method two count: 0
Number matched: 264697
failed count 0


9604463it [09:54, 16168.58it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_34_264940.json
method one count: 261742
method two count: 0
Number matched: 264940
failed count 0


9604463it [09:49, 16291.53it/s]                             


saving json


  0%|          | 2329/9517878 [00:00<06:48, 23281.70it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_35_273013.json
method one count: 270895
method two count: 1
Number matched: 273013
failed count 1


9604463it [10:36, 15078.71it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_36_282654.json
method one count: 281245
method two count: 0
Number matched: 282654
failed count 0


9604463it [09:49, 16295.75it/s]                             


saving json


  0%|          | 0/9517878 [00:00<?, ?it/s]

saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_37_258864.json
method one count: 256744
method two count: 0
Number matched: 258864
failed count 0


9604463it [09:49, 16294.13it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_38_269585.json
method one count: 265951
method two count: 0
Number matched: 269585
failed count 0


9604463it [10:30, 15241.59it/s]                             


saving json
saved ../data/planet/france/sherrie10k/json_polys_2019/bbox10k_2500px_39_256261.json
method one count: 251184
method two count: 0
Number matched: 256261
failed count 0


### India

In [19]:
# defined a new dump_shp_to_json function for india because the projection wasn't working...
# the parcels should already be in LAT, LON and don't need to be reprojected
# but somehow the inverse projection was messing things up
# TODO: fix this in a general way

In [10]:
def dump_shp_to_json(shape_file, grid, tree, output_json='../data/planet/france/sherrie10k/test_json'):
    """
    find intersecting polygons in the list of available images and save the GeoJSON

    Args:
      shape_file (polygons): fields
      grid (dict): image bounding boxes 
      tree (r-tre): r-tree of images
      output_json (str): output path of json file
    """
    # coordinate transformation
    reader = shapefile.Reader(shape_file)
    shapes = fiona.open(shape_file)
    if len(shapes.crs) != 0:
        original = Proj(shapes.crs)
    else:
        original = Proj('+init=EPSG:4326')
#     print(fiona.open(shape_file).crs)

    # list of properties of features
    fields = reader.fields[1:]
    field_names = [field[0] for field in fields]
    field_names.append('image_id')

    buffer = []
    # sanity check counters
    count_parcels = defaultdict(int)
#     index = 0
    counter_method1 = 0
    counter_method2 = 0
    num_matched = 0
    failed_projection = 0
  
    # loop through the polygon fields
    for sr in tqdm(reader.iterShapeRecords(), total=9517878):
#         if index % 100000 == 0:
#             print('Parsed ', index)
#         index += 1
        geom = sr.shape.__geo_interface__
        shp_geom = shape(geom)
        intersect = check_polygon_in_bounds(shp_geom, tree)
#         print(intersect)
        if len(intersect) != 0:
            num_matched += len(intersect)
#             print("Matched:", str(index))
#             print("Number matched:", num_matched)
      
            id_list = field_imageId_list(intersect, count_parcels)
            sr.record.append(id_list)
            atr = dict(zip(field_names, sr.record))
            
            geom['coordinates'] = listit(geom['coordinates'])
#             print(geom)
            try: # protection at polygons that fail at projection
                if len(geom['coordinates']) == 1: # for single polygon
                    counter_method1 += 1
                    x, y = zip(*geom['coordinates'][0])
                    lat,long = x, y
#                     lat, long = original(x, y, inverse=True) # coordinate transformation
#                     print(x,y)
#                     print(lat,long)
                    geom['coordinates'] = [listit(list(zip(lat, long)))]
                else: # for multipolygons
                    counter_method2 += 1
                    for index_coord in range(0, len(geom['coordinates'])):
                        for counter in range(0,len(geom['coordinates'][index_coord])):
                            x, y = geom['coordinates'][index_coord][counter]
                            lat, long = original(x, y, inverse=True) # coordinate transformation
                            geom['coordinates'][index_coord][counter] = [lat, long] #(long, lat)
            except:
                failed_projection =+ 1
#                 print(geom['coordinates'])
            buffer.append(dict(type="Feature", geometry=geom, properties=atr))
            
#             if num_matched > 10:
#                 break
      
      
    # write the GeoJSON file
    output_json_interval = output_json + str(num_matched) + '.json'
    print("saving json")
    with open(output_json_interval, 'w') as geojson:
        geojson.write(dumps({"type": "FeatureCollection", "features": buffer}, indent=2) + "\n")
        geojson.close()
        print('saved', output_json_interval)
    
    # print summary
    print('method one count:', counter_method1)
    print('method two count:', counter_method2)
    print("Number matched:", num_matched)
    print('failed count', failed_projection)

In [18]:
base_dir = '../data/planet/india/'
csv_file = os.path.join(base_dir, 'bbox1000.csv')

shape_file = '../mount/data/india_parcels/india_parcels_with_area.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

# for start in np.arange(0, 1500, 250): # np.arange(1500, 10000, 250):
#     end = start + 250
#     images_df = pd.read_csv(csv_file).iloc[start:end]

images_df = pd.read_csv(csv_file)
images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
images_df = images_df[images_df['image_id'].isin(['00064', '00126'])]

grid, tree = read_csv(shape_file, images_df)

dump_shp_to_json(shape_file, grid, tree, 
                 '../data/planet/india/json_polys/bbox1000_labeled')

  0%|          | 1638/9517878 [00:00<16:47, 9447.31it/s]


saving json
saved ../data/planet/india/json_polys/bbox1000_labeled1600.json
method one count: 1600
method two count: 0
Number matched: 1600
failed count 0


### GeoWiki

In [24]:
base_dir = '../data/planet/india/geowiki/'
csv_file = os.path.join(base_dir, 'geowiki_maharashtra.csv')

shape_file = '../mount/data/india_parcels/india_geowiki_parcels_with_area.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

images_df = pd.read_csv(csv_file)
images_df = images_df[images_df['image_id'].isin([960228])]

grid, tree = read_csv(shape_file, images_df)

dump_shp_to_json(shape_file, grid, tree, 
                 '../data/planet/india/geowiki/json_polys/geowiki_labeled')

  0%|          | 266/9517878 [00:00<19:01, 8339.57it/s]

saving json
saved ../data/planet/india/geowiki/json_polys/geowiki_labeled259.json
method one count: 259
method two count: 0
Number matched: 259
failed count 0





### General Blockchain

In [11]:
base_dir = '../data/planet/india/GeneralBlockchain/'
csv_file = os.path.join(base_dir, 'bbox_india_GB_v1.csv')

shape_file = '../mount/data/GeneralBlockchain/campaign_results/india_fields_with_area.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

df = pd.read_csv(csv_file)
grid, tree = read_csv(shape_file, df)

dump_shp_to_json(shape_file, grid, tree, 
                 '../data/planet/india/GeneralBlockchain/json_polys/bbox_images')

  0%|          | 10013/9517878 [00:09<2:22:33, 1111.64it/s]


saving json
saved ../data/planet/india/GeneralBlockchain/json_polys/bbox_images26405.json
method one count: 8788
method two count: 15
Number matched: 26405
failed count 0


### Large Airbus images

In [8]:
base_dir = '../data/general_blockchain/'
csv_file = os.path.join(base_dir, 'bbox_india_GB_large_Airbus.csv')

shape_file = '../mount/data/GeneralBlockchain/campaign_results/india_fields_with_area.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

df = pd.read_csv(csv_file)
grid, tree = read_csv(shape_file, df)

dump_shp_to_json(shape_file, grid, tree, 
                 '../data/general_blockchain/json_polys/bbox_Airbus_large')

  0%|          | 9966/9517878 [00:04<1:09:28, 2280.77it/s]


saving json
saved ../data/general_blockchain/json_polys/bbox_Airbus_large9975.json
method one count: 9898
method two count: 68
Number matched: 9975
failed count 0


In [9]:
df.head()

Unnamed: 0,image_id,min_lat,min_lon,max_lat,max_lon
0,0,26.291001,73.093048,26.311001,73.115357
1,1,25.707699,77.426401,25.727699,77.448599
2,2,27.182688,76.592924,27.202688,76.615409
3,3,27.466,80.584531,27.486,80.607074
4,4,21.107701,78.076782,21.127701,78.098221


## Africa

### Senegal

In [5]:
base_dir = '../data/planet/senegal/'
csv_file = os.path.join(base_dir, 'bbox_tiles_all.csv')

shape_file = '../mount/data/senegal_parcels/SenegalFields_03_26.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

df = pd.read_csv(csv_file)
df.shape

(14543, 5)

In [8]:
increment = 20000
    
for start in np.arange(0, df.shape[0], increment):
    end = start + increment
    images_df = pd.read_csv(csv_file).iloc[start:end]

    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)
    
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/senegal/json_polys/bbox_tiles_{}_'.format(int(start/increment)))

  0%|          | 2590/9517878 [00:01<1:17:12, 2053.89it/s]


saving json
saved ../data/planet/senegal/json_polys/bbox_tiles_0_3293.json
method one count: 2579
method two count: 10
Number matched: 3293
failed count 1


### Ghana

In [5]:
base_dir = '../data/planet/ghana/udry/'
csv_file = os.path.join(base_dir, 'bbox_tiles_all.csv')

shape_file = '../mount/data/udry_parcels/udry_fields_2017.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

df = pd.read_csv(csv_file)
df.shape

(18023, 5)

In [10]:
increment = 20000
    
for start in np.arange(0, df.shape[0], increment):
    end = start + increment
    images_df = pd.read_csv(csv_file).iloc[start:end]

    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)
    
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/ghana/udry/json_polys/bbox_tiles_{}_1'.format(
                         int(start/increment)))

  0%|          | 8938/9517878 [00:04<1:23:23, 1900.41it/s]


saving json
saved ../data/planet/ghana/udry/json_polys/bbox_tiles_0_111334.json
method one count: 8938
method two count: 0
Number matched: 11334
failed count 0


### Malawi

In [12]:
base_dir = '../data/planet/malawi/'
csv_file = os.path.join(base_dir, 'bbox_tiles_all.csv')

shape_file = '../mount/data/malawi_parcels/malawi_WFP_fields_2018.shp'

if os.path.exists(os.path.join(base_dir, 'json_polys')) == False:
    os.makedirs(os.path.join(base_dir, 'json_polys'))

df = pd.read_csv(csv_file)
df.shape

(2372, 5)

In [13]:
increment = 20000
    
for start in np.arange(0, df.shape[0], increment):
    end = start + increment
    images_df = pd.read_csv(csv_file).iloc[start:end]

    images_df['image_id'] = images_df['image_id'].astype(str).str.zfill(5)
    grid, tree = read_csv(shape_file, images_df)
    
    dump_shp_to_json(shape_file, grid, tree, 
                     '../data/planet/malawi/json_polys/bbox_tiles_{}_'.format(int(start/increment)))

  0%|          | 423/9517878 [00:00<1:16:26, 2074.92it/s]

saving json
saved ../data/planet/malawi/json_polys/bbox_tiles_0_499.json
method one count: 423
method two count: 0
Number matched: 499
failed count 0



