<a href="https://colab.research.google.com/github/boothmanrylan/canadaMSSForestDisturbances/blob/main/exportTrainingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so local edits are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Upgrade pip itself before installing the notebook's dependencies.
!pip install --quiet --upgrade pip
# Apache Beam is pinned to 2.46.0 and installed with its GCP extras.
!pip install --quiet "apache-beam[gcp]==2.46.0"
!pip install --quiet geemap

In [None]:
import os

import google
from google.colab import auth
from google.api_core import retry

import requests

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

import ee
import geemap
import geopandas

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Google Cloud project, storage bucket, and region used by this notebook.
PROJECT = 'api-project-269347469410'
BUCKET = 'gs://rylan-mssforestdisturbances/'
LOCATION = 'us-central1'

# Earth Engine endpoint passed to ee.Initialize below.
HIGH_VOLUME_ENDPOINT = 'https://earthengine-highvolume.googleapis.com'

# Authenticate the Colab user before requesting default credentials.
auth.authenticate_user()

# Point both the environment and the gcloud CLI at the same project.
os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT
!gcloud config set project {PROJECT}

# Initialize Earth Engine with the default credentials against the
# high-volume endpoint.
credentials, _ = google.auth.default()
ee.Initialize(credentials, project=PROJECT, opt_url=HIGH_VOLUME_ENDPOINT)

In [None]:
# clone and install msslib
# Clone msslib, install it from its checkout, then return to the parent
# directory.
!git clone --quiet https://github.com/boothmanrylan/msslib.git
%cd msslib
!pip install --quiet .
%cd ..

# Clone this project and stay inside its checkout so that the
# mss_forest_disturbances package can be imported from the working directory.
!git clone --quiet https://github.com/boothmanrylan/canadaMSSForestDisturbances.git
%cd canadaMSSForestDisturbances
from mss_forest_disturbances import data

In [None]:
# NOTE(review): MAX_REQUESTS is not referenced in the visible cells;
# presumably a cap on concurrent Earth Engine requests -- confirm.
MAX_REQUESTS = 20
# Root Earth Engine asset folder; cells below join "data/<asset name>" onto it.
ASSET_PATH = "projects/api-project-269347469410/assets/rylan-mssforestdisturbances/"

# Step 1. Create a Covering Grid of Forest Dominated Canada

Step 1.1

Create a grid that covers all of forest-dominated Canada, excluding cells that are more than 70% water. Export the resulting grid as an Earth Engine asset.

In [None]:
# Cell size handed to the grid builder (units are defined by
# data.build_land_covering_grid -- TODO confirm).
GRID_CELL_SIZE = 512
grid = data.build_land_covering_grid(data.ECOZONES.geometry(), GRID_CELL_SIZE)

# Pair every cell with its position in the collection so that each one
# carries a "cell_id" property.
grid_list = grid.toList(grid.size())
ids = ee.List.sequence(0, grid.size().subtract(1))


def _with_cell_id(i):
    """Return the i-th grid cell with its index stored as "cell_id"."""
    return ee.Feature(grid_list.get(i)).set('cell_id', i)


id_grid = ee.FeatureCollection(ids.map(_with_cell_id))

# Persist the numbered grid as an Earth Engine table asset.
grid_asset_id = os.path.join(ASSET_PATH, "data", "land_covering_grid")
task = ee.batch.Export.table.toAsset(
    collection=id_grid,
    description="export_land_covering_grid",
    assetId=grid_asset_id,
)
task.start()

Step 1.2

For each year for which we are generating training data, estimate the amount of harvest and fire that occurred in each cell of the grid created in Step 1.1. Export the resulting FeatureCollection as an Earth Engine asset.

In [None]:
def set_id(feature):
    """Tag a grid cell with an "id" of the form "<cell_id>_<year>".

    Args:
        feature: ee.Feature with numeric "cell_id" and "year" properties
            ("year" is presumably added by data.add_disturbance_counts --
            confirm).

    Returns:
        The feature with its "id" property set to the cell id and the year
        joined by an underscore.
    """
    cell_id = ee.String(feature.getNumber("cell_id"))
    year = ee.String(feature.getNumber("year"))
    # Every cell is exported once per year, so cell_id alone is not unique;
    # the year has to be part of the stored id.
    unique_id = cell_id.cat("_").cat(year)
    return feature.set("id", unique_id)

# The numbered land-covering grid asset from the data folder.
base_grid = ee.FeatureCollection(
    os.path.join(ASSET_PATH, "data", "land_covering_grid")
)

# One export task per year: add the disturbance counts to each cell, tag the
# cells with ids, and write the result to its own asset.
for year in range(1985, 1996):
    counted_grid = data.add_disturbance_counts(base_grid, year)
    annual_grid = counted_grid.map(set_id)

    asset_name = f"disturbance_estimate_grid_{year}"
    annual_asset_id = os.path.join(ASSET_PATH, "data", asset_name)
    task = ee.batch.Export.table.toAsset(
        collection=annual_grid,
        description=f"export_grid_with_disturbance_estimates_{year}",
        assetId=annual_asset_id,
    )
    task.start()

# Step 2. Select Cells from Grid to Create Train/Test/Val Datasets

In [None]:
# Load every per-year disturbance grid and merge them into one collection.
annual_grids_assets = [
    os.path.join(ASSET_PATH, "data", f"disturbance_estimate_grid_{year}")
    for year in range(1985, 1996)
]
annual_grids = ee.FeatureCollection(
    [ee.FeatureCollection(asset) for asset in annual_grids_assets]
).flatten()

# perform the train/test/val splitting individually within each ecozone
ecozones = annual_grids.aggregate_array("ecozone").distinct().getInfo()
ecozone_grids = [
    annual_grids.filter(ee.Filter.eq("ecozone", ecozone))
    for ecozone in ecozones
]

cell_counts = [200, 200, 200]
splits = [0.7, 0.15, 0.15]
# Each entry is indexed below as [0] = train, [1] = test, [2] = val.
selected_cells = [
    data.sample_cells(ecozone_grid, *cell_counts, *splits)
    for ecozone_grid in ecozone_grids
]


def _join_split(position):
    """Merge one split (by position) across all ecozones.

    The merged collection is sorted on "shuffle" to ensure ecozones are
    intermingled.
    """
    per_ecozone = [selection[position] for selection in selected_cells]
    return ee.FeatureCollection(per_ecozone).flatten().sort("shuffle")


train_cells = _join_split(0)
test_cells = _join_split(1)
val_cells = _join_split(2)

# export each group to Google Earth Engine
exports = (
    ("export_train_cells", "train_cells", train_cells),
    ("export_test_cells", "test_cells", test_cells),
    ("export_val_cells", "val_cells", val_cells),
)
for description, asset_name, cells in exports:
    task = ee.batch.Export.table.toAsset(
        collection=cells,
        description=description,
        assetId=os.path.join(ASSET_PATH, "data", asset_name),
    )
    task.start()

# Step 3. Export Image Patches

Based on https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/people-and-planet-ai/land-cover-classification
and https://github.com/google/earthengine-community/blob/master/guides/linked/Earth_Engine_training_patches_computePixels.ipynb

In [None]:
# Read the train cells GeoJSON from the bucket into a GeoDataFrame.
train_file = os.path.join(BUCKET, "data", "train_cells.geojson")
train_cells = geopandas.read_file(train_file)
# Reprojection to the project CRS is currently disabled:
# train_cells = train_cells.to_crs(data.PROJECTION.getInfo()['wkt'])

In [None]:
# Take the first row of the train cells (None when the table is empty) and
# display its type.
cell = next((row for _, row in train_cells.iterrows()), None)
type(cell)

In [None]:
# Rebuild the cell's exterior ring as an Earth Engine polygon and print the
# projection Earth Engine assigns to it.
exterior_coords = list(cell["geometry"].exterior.coords)
geom = ee.Geometry.Polygon(exterior_coords)
print(geom.projection().getInfo())

# Show the polygon on an interactive map centred on it.
Map = geemap.Map()
Map.centerObject(geom)
Map.addLayer(geom, {}, "Test Geometry")
Map

In [None]:
# Fetch the project projection's WKT once; getInfo() is a blocking request,
# so the value is reused instead of being requested a second time.
projection_wkt = data.PROJECTION.getInfo()['wkt']
print(projection_wkt)

# Reproject the train cells and print the CRS geopandas reports afterwards.
test_set_crs = train_cells.to_crs(projection_wkt)
print(test_set_crs.crs)

In [None]:
# Ecozone values and disturbance labels; write_tfrecord writes one output
# group for every (ecozone, disturbance type) combination.
ECOZONES = [4, 5, 6, 7, 9, 11, 12, 13, 14, 15]
DISTURBANCE_TYPES = ['fire', 'harvest', 'undisturbed']

def get_image_label_metadata(series):
    """Placeholder for fetching the image, label, and metadata of one sample.

    Not implemented yet: currently returns None for every input.

    Args:
        series: intended to be a pandas Series with keys lat, lon, year,
            ecozone, train/test/val, and fire/harvest/no disturbance.
    """
    # TODO: use lat, lon, and year as inputs to msslib.getCol()

    # TODO: must return an iterable in order for FlatMap to work: use yield
    return None


def serialize_tensor(image, label, metadata):
    """Placeholder for serializing one sample to a TFRecord entry.

    Not implemented yet: currently returns None for every input.
    """
    # TODO: create a tf.train.Example()
    # TODO: return example.SerializeToString() --> ensure we can read/parse this later on
    return None

class ProcessSampleGroup(beam.PTransform):
    """Composite transform that turns sample rows into TFRecord files.

    Each input element is expanded with get_image_label_metadata, the
    resulting tuples are serialized with serialize_tensor, and the output is
    written under the given file prefix with a ".tfrecord.gz" suffix.
    """

    def __init__(self, prefix):
        """Store the output file prefix used by the TFRecord sink."""
        super().__init__()
        self.prefix = prefix

    def expand(self, pcoll):
        """Apply the fetch, serialize, and write steps to pcoll."""
        samples = pcoll | beam.FlatMap(get_image_label_metadata)
        serialized = samples | beam.MapTuple(serialize_tensor)
        sink = beam.io.WriteToTFRecord(
            self.prefix,
            file_name_suffix=".tfrecord.gz",
        )
        return serialized | sink

def filter(x, ecozone, disturbance_type):
    """Return whether a sample belongs to the given ecozone and disturbance.

    Args:
        x: mapping-like sample with 'ecozone' and 'disturbance_type' keys.
        ecozone: ecozone value the sample must match.
        disturbance_type: disturbance label the sample must match.

    Returns:
        True when both fields match, False otherwise.
    """
    # The result must be returned: a predicate that yields None makes
    # beam.Filter discard every element.
    return x['ecozone'] == ecozone and x['disturbance_type'] == disturbance_type

def write_tfrecord(input_file, output_prefix):
    """Write one group of TFRecord files per ecozone and disturbance type.

    Args:
        input_file: path of the CSV file listing the samples; it must have
            'ecozone' and 'disturbance_type' columns.
        output_prefix: directory prefix; each group is written under
            "<output_prefix>/ecozone<N>/<disturbance_type>".
    """
    samples = pd.read_csv(input_file) # TODO: GeoJSON
    # Iterating a DataFrame yields its column labels, so hand Beam the rows
    # explicitly (one pandas Series per sample).
    rows = [row for _, row in samples.iterrows()]

    with beam.Pipeline() as pipeline:
        pcoll = pipeline | beam.Create(rows) | beam.Reshuffle()

        for ecozone in ECOZONES:
            for disturbance_type in DISTURBANCE_TYPES:
                path = os.path.join(
                    output_prefix,
                    f"ecozone{ecozone}",
                    disturbance_type
                )
                # Beam requires transform labels to be unique within a
                # pipeline, so each group gets its own label.
                group = f"ecozone{ecozone}_{disturbance_type}"

                # Pass the loop values as extra arguments rather than closing
                # over them in a lambda, which would bind them late and leave
                # every branch filtering on the final loop values.
                (
                    pcoll
                    | f"Filter_{group}" >> beam.Filter(
                        filter, ecozone, disturbance_type
                    )
                    | f"Process_{group}" >> ProcessSampleGroup(prefix=path)
                )