<a href="https://colab.research.google.com/github/boothmanrylan/canadaMSSForestDisturbances/blob/main/exportTrainingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to them are picked up automatically.
%load_ext autoreload
%autoreload 2

In [None]:
# Upgrade pip first, then install the notebook's extra dependencies.
!pip install --quiet --upgrade pip
# Apache Beam (with the GCP extras) is pinned to an exact version.
!pip install --quiet "apache-beam[gcp]==2.46.0"
# geemap is installed unpinned, so its version follows the latest release.
!pip install --quiet geemap

In [None]:
import os
import io
import itertools

import google
from google.colab import auth
from google.api_core import retry

import requests

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

import ee
import geemap
import geopandas

import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Google Cloud project id, Cloud Storage bucket and region settings.
PROJECT = 'api-project-269347469410'
BUCKET = 'gs://rylan-mssforestdisturbances/'
LOCATION = 'us-central1'

# Earth Engine endpoint handed to ee.Initialize below.
HIGH_VOLUME_ENDPOINT = 'https://earthengine-highvolume.googleapis.com'

# Authenticate the Colab user; this must happen before the default
# credentials are requested further down.
auth.authenticate_user()

# Make PROJECT the active project for the environment and the gcloud CLI.
os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT
!gcloud config set project {PROJECT}

# Initialise Earth Engine with the default credentials for PROJECT.
credentials, _ = google.auth.default()
ee.Initialize(credentials, project=PROJECT, opt_url=HIGH_VOLUME_ENDPOINT)

In [None]:
# clone and install msslib
!git clone --quiet https://github.com/boothmanrylan/msslib.git
%cd msslib
!pip install --quiet .
%cd ..

from msslib import msslib

# Clone this project's repository and change into it. The package is imported
# directly from the checkout (it is not pip-installed), and there is no
# matching `%cd ..`, so the working directory stays inside the repository.
!git clone --quiet https://github.com/boothmanrylan/canadaMSSForestDisturbances.git
%cd canadaMSSForestDisturbances
from mss_forest_disturbances import data

In [None]:
# NOTE(review): MAX_REQUESTS is not referenced in the visible part of this
# notebook; presumably a cap on concurrent Earth Engine requests - confirm.
MAX_REQUESTS = 20
# Root Earth Engine asset folder under which every table asset is exported.
ASSET_PATH = "projects/api-project-269347469410/assets/rylan-mssforestdisturbances/"

# Step 1. Create a Covering Grid of Forest Dominated Canada

Step 1.1

Create a grid that covers all of forest-dominated Canada, excluding cells that are more than 70% water. Export the resulting grid as an Earth Engine asset.

In [None]:
# Cell size handed to data.build_land_covering_grid.
GRID_CELL_SIZE = 512

grid = data.build_land_covering_grid(data.ECOZONES.geometry(), GRID_CELL_SIZE)

# Turn the grid into a list so each cell can be paired with its position in
# that list; the position is stored on the cell as its `cell_id` property.
cell_count = grid.size()
grid_list = grid.toList(cell_count)
ids = ee.List.sequence(0, cell_count.subtract(1))


def _tag_with_cell_id(index):
    """Return the grid cell at `index` with `cell_id` set to that index."""
    return ee.Feature(grid_list.get(index)).set('cell_id', index)


id_grid = ee.FeatureCollection(ids.map(_tag_with_cell_id))

# Start a batch task that stores the numbered grid as a table asset.
grid_asset_id = os.path.join(ASSET_PATH, "data", "land_covering_grid")
task = ee.batch.Export.table.toAsset(
    collection=id_grid,
    description="export_land_covering_grid",
    assetId=grid_asset_id,
)
task.start()

Step 1.2

For each year for which we are generating training data, estimate the amount of harvest and fire that occurred in each cell of the grid created in Step 1.1. Export the resulting FeatureCollection as an Earth Engine asset.

In [None]:
def set_id(feature):
    """Attach a combined ``"<cell_id>_<year>"`` identifier to a feature.

    Args:
        feature: Object exposing ``getNumber`` and ``set`` (an Earth Engine
            feature in this notebook) that carries numeric ``cell_id`` and
            ``year`` properties.

    Returns:
        The result of ``feature.set("id", ...)``, where the value is the
        ``"%d"``-formatted ``cell_id`` and ``year`` joined by an underscore.
    """
    cell_id = feature.getNumber('cell_id').format("%d")
    year = feature.getNumber('year').format("%d")
    # Named `feature_id` rather than `id` so the builtin is not shadowed.
    feature_id = cell_id.cat('_').cat(year)
    return feature.set("id", feature_id)

# Grid asset written by the Step 1.1 export; every annual grid starts from it.
base_grid = ee.FeatureCollection(
    os.path.join(ASSET_PATH, "data", "land_covering_grid")
)

for year in range(1985, 1996):
    asset_name = f"disturbance_estimate_grid_{year}"

    # Annotate the base grid through data.add_disturbance_counts for this
    # year, then give every feature its combined id via set_id.
    annual_grid = data.add_disturbance_counts(base_grid, year).map(set_id)

    # One batch export task per year.
    task = ee.batch.Export.table.toAsset(
        collection=annual_grid,
        description=f"export_grid_with_disturbance_estimates_{year}",
        assetId=os.path.join(ASSET_PATH, "data", "annual_grids", asset_name),
    )
    task.start()

# Step 2. Select Cells from Grid to Create Train/Test/Val Datasets

In [None]:
# Asset ids of the per-year grids exported in Step 1.2.
annual_grids_dir = os.path.join(ASSET_PATH, "data", "annual_grids")
annual_grids_assets = [
    os.path.join(annual_grids_dir, f"disturbance_estimate_grid_{year}")
    for year in range(1985, 1996)
]

# Load each annual grid and flatten the collection of collections into a
# single FeatureCollection.
annual_grids = ee.FeatureCollection(
    list(map(ee.FeatureCollection, annual_grids_assets))
).flatten()

# perform the train/test/val splitting individually within each ecozone
ecozone_values = annual_grids.aggregate_array("ecozone").distinct()
# Pulled client-side because the list drives the Python comprehension below.
ecozones = ecozone_values.getInfo()
ecozone_grids = [
    annual_grids.filter(ee.Filter.eq("ecozone", zone)) for zone in ecozones
]

# Both lists are forwarded positionally to data.sample_cells, counts first.
cell_counts = [200, 200, 200]
splits = [0.7, 0.15, 0.15]
sampling_args = (*cell_counts, *splits)
selected_cells = [
    data.sample_cells(zone_grid, *sampling_args) for zone_grid in ecozone_grids
]

# join the train/test/val groups from each ecozone
# shuffle to ensure ecozones are intermingled
def _merge_split(position):
    """Merge the `position`-th group of every ecozone, sorted by "shuffle"."""
    per_ecozone = [selection[position] for selection in selected_cells]
    return ee.FeatureCollection(per_ecozone).flatten().sort("shuffle")


# Groups are read from each ecozone's selection as train, test, val.
train_cells = _merge_split(0)
test_cells = _merge_split(1)
val_cells = _merge_split(2)

# export each group to Google Earth Engine
# Each entry is (task description, asset name, collection); the tasks are
# started in train, test, val order, so `task` ends up as the val export.
for export_description, export_asset_name, export_cells in (
    ("export_train_cells", "train_cells", train_cells),
    ("export_test_cells", "test_cells", test_cells),
    ("export_val_cells", "val_cells", val_cells),
):
    task = ee.batch.Export.table.toAsset(
        collection=export_cells,
        description=export_description,
        assetId=os.path.join(ASSET_PATH, "data", export_asset_name),
    )
    task.start()

# Step 3. Export Image Patches

Based on https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/people-and-planet-ai/land-cover-classification
and https://github.com/google/earthengine-community/blob/master/guides/linked/Earth_Engine_training_patches_computePixels.ipynb

In [None]:
# create default request for computePixels
proj = data.PROJECTION.getInfo()
scale_x = proj['transform'][0]
scale_y = -proj['transform'][4]

PATCH_SIZE = 512

OFFSET_X = -scale_x * PATCH_SIZE / 2
OFFSET_Y = -scale_y * PATCH_SIZE / 2

REQUEST = {
    'fileFormat': 'NPY',
    'grid': {
        'dimensions': {
            'width': PATCH_SIZE,
            'height': PATCH_SIZE,
        },
        'affineTransform': {
            'scaleX': scale_x,
            'shearX': 0,
            'shearY': 0,
            'scaleY': scale_y,
        },
        'crsCode': proj['wkt']
    }
}

In [None]:
# Error margin passed to the geometry()/centroid() calls below, expressed in
# "projected" units rather than meters.
ERROR_MARGIN = ee.ErrorMargin(0.1, "projected")

def _get_images_from_feature(feature):
    """Return the msslib image collection for a feature's location and year.

    Args:
        feature: Earth Engine feature with a numeric ``year`` property.

    Returns:
        The collection produced by ``msslib.getCol`` for a 60-unit buffer
        around the feature's centroid, restricted to the feature's year and
        ``data.DOY_RANGE``, with ``maxCloudCover=100``.
    """
    footprint = feature.geometry(ERROR_MARGIN, data.PROJECTION)
    target_year = feature.getNumber("year")

    # Search around the cell centre rather than over the whole footprint.
    search_area = footprint.centroid(1).buffer(60)

    # TODO: need to be able to handle this returning no images
    return msslib.getCol(
        aoi=search_area,
        yearRange=[target_year, target_year],
        doyRange=data.DOY_RANGE,
        maxCloudCover=100,
    )

def get_image_ids(df, col, index):
    """Pair every scene id found for one grid cell with that cell's feature.

    Args:
        df: DataFrame with a ``cell_id`` column; rows are read by position.
        col: Earth Engine FeatureCollection holding the cells listed in `df`.
        index: Positional row number into `df`.

    Returns:
        An iterator of ``(LANDSAT_SCENE_ID, feature)`` tuples, one per image
        returned by ``_get_images_from_feature``.
    """
    row = df.iloc[index]
    # NOTE(review): only `cell_id` is matched. If `col` can contain the same
    # cell for several years, `first()` will not distinguish them - confirm.
    matches_cell = ee.Filter.eq("cell_id", int(row["cell_id"]))
    feature = col.filter(matches_cell).first()

    # getInfo() brings the scene ids client-side so they can become
    # individual pipeline elements.
    scene_ids = (
        _get_images_from_feature(feature)
        .aggregate_array("LANDSAT_SCENE_ID")
        .getInfo()
    )
    return zip(scene_ids, itertools.repeat(feature))


@retry.Retry()
def get_image_label_metadata(image_id, feature):
    """Prepare the clipped image, clipped label and metadata for one scene.

    Wrapped in ``retry.Retry()`` with its default settings.

    Args:
        image_id: ``LANDSAT_SCENE_ID`` of the image to prepare.
        feature: Grid cell the scene was found for; cast to ``ee.Feature``
            once and used in that form throughout.

    Returns:
        A tuple ``(image, label, metadata)``: the image and label returned by
        ``data.prepare_image_for_export``, each clipped to the cell geometry,
        and a dict with the ``getInfo()`` result of every entry produced by
        ``data.prepare_metadata_for_export``.
    """
    feature = ee.Feature(feature)

    images = _get_images_from_feature(feature)
    image = images.filter(ee.Filter.eq("LANDSAT_SCENE_ID", image_id)).first()

    image, label = data.prepare_image_for_export(image)
    metadata = data.prepare_metadata_for_export(image, feature)
    # Each getInfo() is a blocking request that resolves one metadata value.
    metadata = {key: val.getInfo() for key, val in metadata.items()}

    geom = feature.geometry(ERROR_MARGIN, data.PROJECTION)

    return image.clip(geom), label.clip(geom), metadata

    # Disabled computePixels path, kept for reference. The centroid lookup
    # lives here because only this path needs it; running it eagerly cost an
    # extra blocking getInfo() per image for a value that was never used.
    # NOTE(review): dict(REQUEST) is a shallow copy, so the assignments to
    # request['grid']['affineTransform'] below would mutate the shared
    # REQUEST template; use copy.deepcopy when re-enabling this.

    # coords = geom.centroid(ERROR_MARGIN, data.PROJECTION).getInfo()["coordinates"]

    # request = dict(REQUEST)
    # request['grid']['affineTransform']['translateX'] = coords[0] + OFFSET_X
    # request['grid']['affineTransform']['translateY'] = coords[1] + OFFSET_Y

    # image_request = dict(request)
    # image_request['expression'] = image
    # np_image = np.load(io.BytesIO(ee.data.computePixels(image_request)))

    # label_request = dict(request)
    # label_request['expression'] = label
    # np_label = np.load(io.BytesIO(ee.data.computePixels(label_request)))

    # return np_image, np_label, metadata


def serialize_tensor(image, label, metadata):
    """Serialize an image, its label and metadata into one ``tf.train.Example``.

    Args:
        image: Tensor-like value stored under the ``image`` key.
        label: Tensor-like value stored under the ``label`` key.
        metadata: Mapping of feature name to a single value; each value is
            written as a one-element int64 list.

    Returns:
        The serialized ``tf.train.Example`` as bytes.
    """

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _tensor_feature(tensor):
        serialized = tf.io.serialize_tensor(tensor).numpy()
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[serialized])
        )

    metadata_features = {
        key: _int64_feature(value) for key, value in metadata.items()
    }
    # Merged after the metadata so `image` / `label` take precedence over any
    # metadata entry of the same name.
    features = {
        **metadata_features,
        'image': _tensor_feature(image),
        'label': _tensor_feature(label),
    }

    example = tf.train.Example(features=tf.train.Features(feature=features))
    return example.SerializeToString()


# Interactive map that add_to_map draws onto.
Map = geemap.Map()


def add_to_map(image, label, metadata):
    """Add `image` as a layer on the module-level map.

    `label` and `metadata` are accepted so the function can consume the
    three-element tuples produced upstream, but they are not used.
    """
    del label, metadata  # intentionally unused
    Map.addLayer(image)


class ProcessSampleGroup(beam.PTransform):
    """Composite transform that turns row positions into prepared samples.

    Each input element is a positional row number into `df`. It is expanded
    into ``(scene id, feature)`` pairs, reshuffled, prepared by
    ``get_image_label_metadata`` and finally passed to ``add_to_map``.
    """

    def __init__(self, df, col, prefix):
        """Store the DataFrame, FeatureCollection and output prefix."""
        super().__init__()
        self.df = df
        self.col = col
        # Only used by the TFRecord writing step, which is disabled for now.
        self.prefix = prefix

    def expand(self, pcoll):
        """Apply the processing steps to `pcoll` and return the result."""
        scene_pairs = pcoll | beam.FlatMap(
            lambda i: get_image_ids(self.df, self.col, i)
        )
        reshuffled = scene_pairs | beam.Reshuffle()
        prepared = reshuffled | beam.MapTuple(get_image_label_metadata)

        # Serialization and TFRecord output are switched off while the
        # results are being inspected on the map:
        # | beam.MapTuple(serialize_tensor)
        # | beam.io.WriteToTFRecord(self.prefix, file_name_suffix=".tfrecord.gz")
        return prepared | beam.MapTuple(add_to_map)

def write_tfrecord(col, output_prefix):
    """Run a Beam pipeline over `col`, grouped by ecozone and disturbance type.

    The cells in `col` are downloaded into a DataFrame, and one
    ``ProcessSampleGroup`` branch is built per (ecozone, disturbance type)
    combination. The pipeline runs when the ``with`` block exits.

    Args:
        col: Earth Engine FeatureCollection whose features carry
            ``disturbance_type``, ``ecozone`` and ``cell_id`` properties.
        output_prefix: Base path; each branch gets the prefix
            ``<output_prefix>/ecozone<ecozone>/<disturbance_type>``.

    Returns:
        None.
    """
    df = geemap.ee_to_df(
        col, col_names=['disturbance_type', 'ecozone', 'cell_id']
    )
    ecozones = set(df['ecozone'])
    disturbance_types = set(df['disturbance_type'])

    def filter_df(i, ecozone, disturbance_type):
        """Return whether row `i` belongs to the given ecozone and type."""
        elem = df.iloc[i]
        matches_ecozone = elem['ecozone'] == ecozone
        matches_disturbance_type = elem['disturbance_type'] == disturbance_type
        return matches_ecozone and matches_disturbance_type

    with beam.Pipeline() as pipeline:
        # Elements are row positions, because every consumer reads the
        # DataFrame through `.iloc`; index labels would only work while the
        # index happens to be the default 0..n-1 range.
        pcoll = pipeline | beam.Create(range(len(df)))

        for ecozone in ecozones:
            for disturbance_type in disturbance_types:
                path = os.path.join(
                    output_prefix,
                    f"ecozone{ecozone}",
                    disturbance_type
                )

                # The group values are passed to Filter as extra arguments
                # instead of being captured by a lambda, so each branch is
                # bound to its own values and not to the loop variables.
                filter_label = f"filter {ecozone} {disturbance_type}"
                inner_pcoll = pcoll | filter_label >> beam.Filter(
                    filter_df, ecozone, disturbance_type
                )

                process_label = f"process {ecozone} {disturbance_type}"
                inner_pcoll | process_label >> ProcessSampleGroup(
                    prefix=path, df=df, col=col
                )

# Training cells exported at the end of Step 2.
train_cells_asset = os.path.join(ASSET_PATH, "data", "train_cells")
train_col = ee.FeatureCollection(train_cells_asset)

# Only the first training cell is processed here; the output prefix is the
# relative path scratch/train.
single_cell = train_col.limit(1)
write_tfrecord(single_cell, os.path.join("scratch", "train"))

In [None]:
# Render the geemap map that add_to_map adds image layers to.
Map