##### <center>Downloading Meta's canopy height model (CHM) data from Google Earth Engine</center>

###### See example here: https://code.earthengine.google.com/?scriptPath=users%2Fsat-io%2Fawesome-gee-catalog-examples%3Aagriculture-vegetation-forestry%2FGLOBAL-1m-CANOPY-HEIGHT

In [1]:
import ee
import sys
import os
import requests
import logging
import multiprocessing
from retry import retry
import time

In [2]:
# Initialize the Earth Engine client against the high-volume endpoint,
# which should be used whenever requests are made by automated code.
ee.Initialize(
    project='ee-zack-loken',
    url='https://earthengine-highvolume.googleapis.com',
)

In [4]:
# Send INFO-and-above log records to the console, tagging each record with
# its timestamp and the name of the process that emitted it.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s [%(processName)s] %(levelname)s: %(message)s',
    level=logging.INFO,
)

In [21]:
# Run configuration
out_dir = 'C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data'  # root folder for downloaded tiles
state_name = 'Illinois'  # state whose counties are downloaded
num_workers = 24  # size of the multiprocessing pool

In [5]:
def check_and_split_county(county_geometry, max_dim, scale, max_request_size):
    """Split a geometry's bounding box into tiles small enough to download.

    The bounding box is divided into a ``num_splits x num_splits`` grid, where
    ``num_splits`` is doubled until each tile satisfies both limits:

    * neither side is longer than ``max_dim`` pixels at ``scale``, and
    * the estimated tile size does not exceed ``max_request_size``.

    Args:
        county_geometry: Object exposing ``bounds().getInfo()`` that returns a
            GeoJSON-style polygon (an ``ee.Geometry`` in this notebook).
        max_dim: Maximum tile width/height in pixels.
        scale: Pixel size in metres.
        max_request_size: Maximum estimated size of a single request.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax]`` lists (in the units of the
        bounding box) when splitting is needed, ordered column by column
        (x index outer, y index inner). When no split is needed the result is
        ``[county_geometry]`` -- the original geometry object, not a
        coordinate list -- so callers must handle both element types.
    """
    # One conversion factor is applied to both axes (approximate: longitude
    # degrees are not shortened with latitude).
    meters_per_degree = 111319.5
    # Multiplier used by the size estimate; presumably bytes per pixel
    # (e.g. one 32-bit band) -- confirm against the exported image.
    bytes_per_pixel = 4

    # Corners 0 and 2 of the bounding ring are (min_x, min_y) and (max_x, max_y)
    bounds = county_geometry.bounds().getInfo()['coordinates'][0]
    min_x, min_y = bounds[0]
    max_x, max_y = bounds[2]

    width = max_x - min_x
    height = max_y - min_y

    # Size of the whole bounding box in pixels at the requested scale
    width_px = width * meters_per_degree / scale
    height_px = height * meters_per_degree / scale

    # Double the grid until each tile fits within the pixel-dimension limit
    num_splits = 1
    while width_px / num_splits > max_dim or height_px / num_splits > max_dim:
        num_splits *= 2

    # Keep doubling until each tile's estimated size fits the request limit.
    # The estimate is based on pixel counts, so it honours `scale`; the
    # original used square metres, which is only correct when scale == 1.
    while (width_px * height_px * bytes_per_pixel / (num_splits ** 2)) > max_request_size:
        num_splits *= 2

    if num_splits == 1:
        return [county_geometry]

    x_step = width / num_splits
    y_step = height / num_splits

    # Rectangles as [xmin, ymin, xmax, ymax]
    return [
        [
            min_x + i * x_step,
            min_y + j * y_step,
            min_x + (i + 1) * x_step,
            min_y + (j + 1) * y_step,
        ]
        for i in range(num_splits)
        for j in range(num_splits)
    ]

In [9]:
def getRequests(state_name: str):
    """Build the download request list for every county in a state.

    Counties are taken from 'TIGER/2018/Counties', selected by the STATEFP of
    the 'TIGER/2018/States' feature whose NAME equals ``state_name``. Each
    county is looked up by NAME and its geometry is passed to
    ``check_and_split_county``.

    Returns:
        A list with one tuple per county: the county name followed by the
        sub-regions returned for it. If any exception is raised the error is
        logged and an empty list is returned.
    """
    try:
        # FIPS code of the requested state (resolved by Earth Engine)
        state_fips = (
            ee.FeatureCollection('TIGER/2018/States')
            .filter(ee.Filter.eq('NAME', state_name))
            .first()
            .get('STATEFP')
        )
        counties = ee.FeatureCollection('TIGER/2018/Counties').filter(
            ee.Filter.eq('STATEFP', state_fips)
        )

        county_requests = []
        for name in counties.aggregate_array('NAME').getInfo():
            geometry = counties.filter(ee.Filter.eq('NAME', name)).first().geometry()
            tiles = check_and_split_county(
                county_geometry=geometry,
                max_dim=32768,
                scale=1,
                max_request_size=50331648,
            )
            # County name first, then every sub-region as its own element
            county_requests.append((name, *tiles))

        return county_requests

    except Exception as e:
        logging.error(f'Error processing state {state_name}: {e}')
        return []

In [7]:
def getResults(index, counties_and_sub_regions, failed_sub_regions):
    """Build the argument tuples for downloading one county's sub-regions.

    Args:
        index: Position of the county in ``counties_and_sub_regions``.
        counties_and_sub_regions: Sequence of ``(county_name, *sub_regions)``.
        failed_sub_regions: List placed, by reference, in every task tuple.

    Returns:
        A list of ``(county_name, sub_region_idx, sub_region,
        failed_sub_regions)`` tuples, one per sub-region, in order.
    """
    county_name, *regions = counties_and_sub_regions[index]

    tasks = []
    for position, region in enumerate(regions):
        tasks.append((county_name, position, region, failed_sub_regions))
    return tasks

In [8]:
def process_sub_region(county_name, sub_region_idx, sub_region, failed_sub_regions):
    """Download one sub-region of the canopy-height mosaic as a GeoTIFF.

    The image is written to
    ``{out_dir}/{state_name}/{county_name}/{county_name}_{sub_region_idx}_chm.tif``
    (``out_dir`` and ``state_name`` are module-level settings). HTTP 429
    responses are retried with exponential backoff, up to five attempts in
    total; any other error stops processing immediately.

    Args:
        county_name: County the sub-region belongs to (used in the path).
        sub_region_idx: Index of the sub-region within the county.
        sub_region: ``[xmin, ymin, xmax, ymax]`` list/tuple, or an existing
            geometry object (the un-split case of ``check_and_split_county``).
        failed_sub_regions: List that receives
            ``(county_name, sub_region_idx, sub_region)`` when the download
            does not succeed. ``sub_region`` is stored exactly as passed in,
            so a failed entry can be handed straight back to this function.

    Returns:
        None.
    """
    # Remember the caller's description of the region for failure reporting;
    # this avoids a second network round-trip (getInfo) inside error handlers.
    failure_record = (county_name, sub_region_idx, sub_region)

    # Rectangle coordinates are converted; anything else is used as a geometry
    if isinstance(sub_region, (list, tuple)):
        sub_region = ee.Geometry.Rectangle(sub_region)

    # Mosaic of all canopy-height images touching the region, clipped to it
    sub_region_image = ee.ImageCollection('projects/meta-forest-monitoring-okw37/assets/CanopyHeight').filterBounds(sub_region).mosaic().clip(sub_region)

    county_dir = f"{out_dir}/{state_name}" + '/' + str(county_name)
    tif_path = f'{county_dir}/{county_name}_{sub_region_idx}_chm.tif'
    # Stream into a temporary name so an interrupted download never leaves a
    # truncated file that looks like a finished '*_chm.tif'
    tmp_path = tif_path + '.part'

    max_retries = 5
    backoff_factor = 1

    for attempt in range(max_retries):
        try:
            # Get the download URL for the image
            url = sub_region_image.getDownloadURL({
                'scale': 1,
                'format': 'GEO_TIFF'
            })

            # (connect, read) timeouts so a stalled connection cannot hang forever
            with requests.get(url, stream=True, timeout=(30, 300)) as r:
                r.raise_for_status()

                os.makedirs(county_dir, exist_ok=True)
                with open(tmp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Publish the finished file under its final name in one step
            os.replace(tmp_path, tif_path)
            logging.info(f'Saved {county_name}_{sub_region_idx}_chm.tif to {county_dir}')
            return

        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else None
            if status == 429:
                wait_time = backoff_factor * (2 ** attempt)
                logging.warning(f"Rate limit exceeded. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                continue
            logging.error(f"HTTP error occurred: {e}")
            break
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error occurred: {e}")
            break
        except ee.EEException as e:
            logging.error(f"Error processing sub_region {sub_region_idx}: {e}")
            break
        except Exception as e:
            # Last-resort boundary: record the failure instead of crashing the worker
            logging.error(f"Unexpected error: {e}")
            break
    else:
        # Every attempt was rate limited; previously this case was silently dropped
        logging.error(
            f"Rate limit retries exhausted for {county_name}_{sub_region_idx} "
            f"after {max_retries} attempts"
        )

    # Only reached on failure: discard any partial download and record it
    try:
        os.remove(tmp_path)
    except OSError:
        # Best effort; usually the temporary file was never created
        pass
    failed_sub_regions.append(failure_record)

In [25]:
# Build the request list for the state: one tuple per county, holding the
# county name followed by that county's sub-regions
counties_and_sub_regions = getRequests(state_name)

In [38]:
# Flatten every county's sub-regions into a single list of task tuples.
# failed_sub_regions is the one list shared by all of those tuples.
all_tasks = []
failed_sub_regions = []
for index in range(len(counties_and_sub_regions)):
    all_tasks.extend(getResults(index, counties_and_sub_regions, failed_sub_regions))

In [40]:
# Scan county folders for any missing subregions.
def scan_county_directories(state_name, out_dir):
    """List the tile files that are missing from each county directory.

    Every sub-directory of ``{out_dir}/{state_name}`` is treated as a county.
    Files named ``{county}_{idx}_chm.tif`` are collected, and the indices are
    compared with the expected grid: 256 tiles when every index found is
    below 256, otherwise 1024 tiles.

    NOTE: only 256- and 1024-tile grids are recognised, and a county whose
    true grid is 1024 but which so far holds only indices below 256 is
    checked against 256.

    Args:
        state_name: Name of the state sub-directory.
        out_dir: Root download directory.

    Returns:
        A list of missing file names (``"{county}_{idx}_chm.tif"``), grouped
        by county in sorted order and ascending by index within a county.

    Raises:
        FileNotFoundError: If the state directory does not exist.
    """
    failed_sub_regions = []
    state_dir = os.path.abspath(out_dir + '/' + state_name)

    # Sorted so the result is deterministic across runs and platforms
    for county_name in sorted(os.listdir(state_dir)):
        county_dir = os.path.join(state_dir, county_name)
        if not os.path.isdir(county_dir):
            continue

        existing_indices = set()
        for file_name in os.listdir(county_dir):
            if not file_name.endswith('_chm.tif'):
                continue
            try:
                # Split from the right: '<county>_<idx>_chm.tif'. The county
                # part may itself contain underscores.
                existing_indices.add(int(file_name.rsplit('_', 2)[1]))
            except ValueError:
                logging.warning(f'Skipping file with unrecognised name: {file_name}')

        # Choose the grid from the highest index seen, not just the file
        # count, so a sparsely downloaded 1024-tile county is not under-reported
        highest_index = max(existing_indices, default=-1)
        num_subregions = 256 if highest_index < 256 else 1024

        missing_indices = set(range(num_subregions)) - existing_indices
        for idx in sorted(missing_indices):
            failed_sub_regions.append(f"{county_name}_{idx}_chm.tif")

    return failed_sub_regions

In [41]:
# Cross-reference the missing tile files against all_tasks to recover the
# coordinates needed to retry each download.

# Index tasks by (county name, sub-region index) once, instead of scanning
# all_tasks for every missing file
tasks_by_key = {}
for task in all_tasks:
    tasks_by_key.setdefault((task[0], task[1]), []).append(task)

failed_tasks = []
for failed_sub_region in scan_county_directories(state_name, out_dir):
    # File names look like '<county>_<idx>_chm.tif'; split from the right so
    # an underscore inside the county name cannot shift the fields
    county_name, sub_region_idx = failed_sub_region.rsplit('_', 2)[:2]
    for task in tasks_by_key.get((county_name, int(sub_region_idx)), []):
        # Keep (county, index, sub-region); the 4th element of a task is the
        # shared failed_sub_regions list and is not needed here
        failed_tasks.append((task[0], task[1], task[2]))

print(failed_tasks)

[('Coles', 188, [-88.12012678769091, 39.607864173949636, -88.08813341526385, 39.62738279775332]), ('Coles', 189, [-88.12012678769091, 39.62738279775332, -88.08813341526385, 39.646901421557]), ('Cook', 582, [-87.61537212768404, 41.59805368856354, -87.57935609185054, 41.619447590716256]), ('Cumberland', 164, [-88.18156361268697, 39.22302363152703, -88.1525998669139, 39.23602014946147]), ('Jo Daviess', 142, [-90.28816636030814, 42.46915136720753, -90.24209927585093, 42.48881024931352]), ('Kendall', 113, [-88.44987312423467, 41.47376065259381, -88.42791117044155, 41.49048809564097]), ('Lake', 505, [-87.64671472075162, 42.419894830374325, -87.60984590438093, 42.43071450983256]), ('Lake', 506, [-87.64671472075162, 42.43071450983256, -87.60984590438093, 42.4415341892908]), ('Lake', 507, [-87.64671472075162, 42.4415341892908, -87.60984590438093, 42.452353868749036]), ('Lake', 508, [-87.64671472075162, 42.452353868749036, -87.60984590438093, 42.46317354820728]), ('Lake', 509, [-87.6467147207516

In [42]:
# Retry the missing downloads serially, one sub-region at a time; failures
# recorded by process_sub_region are appended to failed_sub_regions
for county_name, sub_region_idx, sub_region in failed_tasks:
    process_sub_region(county_name, sub_region_idx, sub_region, failed_sub_regions)

2024-10-24 17:08:56,483 [MainProcess] INFO: Saved Coles_188_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Coles
2024-10-24 17:09:02,169 [MainProcess] INFO: Saved Coles_189_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Coles
2024-10-24 17:09:06,893 [MainProcess] INFO: Saved Cook_582_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Cook
2024-10-24 17:09:09,040 [MainProcess] INFO: Saved Cumberland_164_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Cumberland
2024-10-24 17:09:17,464 [MainProcess] INFO: Saved Jo Daviess_142_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Jo Daviess
2024-10-24 17:09:20,195 [MainProcess] INFO: Saved Kendall_113_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Kendall
2024-10-24 17:09:23,824 [MainProcess] INFO: Saved Lake_505_chm.tif to C:/Users/zack/Desktop/CRP_Windbreak_ID/meta_chm_data/Illinois/Lake
2024-10

In [None]:
# if __name__ == '__main__':
#     # Global list to store failed sub-regions
#     manager = multiprocessing.Manager()
#     failed_sub_regions = manager.list()

#     program_start_time = time.time()

#     # store counties and subregions in a list
#     counties_and_sub_regions = getRequests(state_name)
    
#     # Process each county one at a time
#     for index in range(len(counties_and_sub_regions)):
#         print(f'Beginning {counties_and_sub_regions[index][0]} County')
#         county_start_time = time.time()

#         # Get tasks for the current county
#         tasks = getResults(index, counties_and_sub_regions, failed_sub_regions)
        
#         # Create a multiprocessing pool with num_workers
#         pool = multiprocessing.Pool(num_workers)
        
#         # Use starmap to parallelize the processing of sub-regions for the current county
#         results = pool.starmap_async(process_sub_region, tasks)
        
#         # Monitor the processing time
#         while not results.ready():
#             if time.time() - county_start_time > 1200:  # 20 minutes
#                 logging.warning(f"Processing county {counties_and_sub_regions[index][0]} timed out.")
#                 pool.terminate()
#                 pool.join()
#                 # Add remaining tasks to failed_sub_regions
#                 for task in tasks[results._index:]:
#                     failed_sub_regions.append((task[0], task[1], task[2]))
#                 break
#             time.sleep(1)
        
#         if results.ready():
#             pool.close()
#             pool.join()

#     # Save failed sub-regions to a text file
#     failed_sub_regions_file = f"{out_dir}/{state_name}/failed_sub_regions.txt"
#     with open(failed_sub_regions_file, 'w') as f:
#         for item in failed_sub_regions:
#             f.write(f"{item}\n")

#     program_end_time = time.time()

#     print(f"Program finished in {program_end_time - program_start_time:.2f} seconds")
#     print(f"Failed sub-regions saved to {failed_sub_regions_file}")