<a href="https://colab.research.google.com/github/boothmanrylan/canadaMSSForestDisturbances/blob/main/exportTrainingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded automatically before each cell runs; edits to local source files
# are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
!pip install -q -q --upgrade pip
!pip install -q -q "apache-beam[gcp]==2.50.0"
!pip install -q -q geemap
!pip install -q -q msslib

In [None]:
import os
import io
import itertools

import google
from google.colab import auth
from google.api_core import retry

import requests

import ee
import geemap
import geopandas

import numpy as np
from numpy.lib import recfunctions as rfn
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt

In [None]:
# Google Cloud project, Cloud Storage bucket, and region settings.
PROJECT = 'api-project-269347469410'
BUCKET = 'gs://rylan-mssforestdisturbances/'
LOCATION = 'us-central1'

# Earth Engine endpoint passed to ee.Initialize below.
HIGH_VOLUME_ENDPOINT = 'https://earthengine-highvolume.googleapis.com'

# Authenticate the Colab user; the credentials are picked up by
# google.auth.default() below.
auth.authenticate_user()

# Make the project the default for both Python clients (environment variable)
# and the gcloud CLI used by later shell cells.
os.environ['GOOGLE_CLOUD_PROJECT'] = PROJECT
!gcloud config set project {PROJECT}

# Initialize Earth Engine with the default credentials against the
# high-volume endpoint.
credentials, _ = google.auth.default()
ee.Initialize(credentials, project=PROJECT, opt_url=HIGH_VOLUME_ENDPOINT)

# NOTE(review): msslib is imported after ee.Initialize rather than in the
# import cell; presumably it needs an initialized Earth Engine session --
# confirm before moving it.
from msslib import msslib

In [None]:
!git clone --quiet https://github.com/boothmanrylan/canadaMSSForestDisturbances.git
%cd canadaMSSForestDisturbances
from mss_forest_disturbances import data

In [None]:
# NOTE(review): presumably the maximum number of concurrent Earth Engine
# requests -- confirm where this is meant to be consumed.
MAX_REQUESTS = 20
# Root Earth Engine asset folder; every asset this notebook exports or loads
# is addressed relative to it via os.path.join.
ASSET_PATH = "projects/api-project-269347469410/assets/rylan-mssforestdisturbances/"

# Step 1. Create a Covering Grid of Forest Dominated Canada

Step 1.1

Create a grid that covers all of forest-dominated Canada, excluding cells that are more than 70% water, and export the resulting grid as an Earth Engine asset.

In [None]:
GRID_CELL_SIZE = 512

# Build the covering grid over the geometry of the ecozones collection.
forest_region = ee.FeatureCollection(data.ECOZONES).geometry()
grid = data.build_land_covering_grid(forest_region, GRID_CELL_SIZE)

# Give every cell a sequential integer `cell_id` (0 .. size - 1) by pairing
# the grid, converted to a list, with a matching index sequence.
n_cells = grid.size()
grid_list = grid.toList(n_cells)
ids = ee.List.sequence(0, n_cells.subtract(1))


def _with_cell_id(index):
    """Return the grid cell at `index` with `cell_id` set to that index."""
    return ee.Feature(grid_list.get(index)).set('cell_id', index)


id_grid = ee.FeatureCollection(ids.map(_with_cell_id))

# Export the identified grid as an Earth Engine table asset.
task = ee.batch.Export.table.toAsset(
    collection=id_grid,
    description="export_land_covering_grid",
    assetId=os.path.join(ASSET_PATH, "data", "land_covering_grid"),
)
task.start()

Step 1.2

For each year for which we are generating training data (1985–1995), estimate the amount of harvest and fire that occurred in each cell of the grid created in Step 1.1. Export the resulting FeatureCollection for each year as its own Earth Engine asset.

In [None]:
def set_id(feature):
    """Tag a grid cell feature with a combined cell/year identifier.

    Args:
        feature: ee.Feature carrying numeric `cell_id` and `year` properties.

    Returns:
        The feature with an `id` property of the form "<cell_id>_<year>",
        where both parts are formatted with "%d".
    """
    cell_part, year_part = (
        feature.getNumber(prop).format("%d") for prop in ('cell_id', 'year')
    )
    return feature.set("id", cell_part.cat('_').cat(year_part))

# The covering grid exported in Step 1.1, and the asset folder that receives
# one disturbance-estimate grid per year.
base_grid = ee.FeatureCollection(os.path.join(ASSET_PATH, "data", "land_covering_grid"))
annual_grid_dir = os.path.join(ASSET_PATH, "data", "annual_grids")

for year in range(1985, 1996):
    # Add the per-cell disturbance counts for this year, then tag each cell
    # with its "<cell_id>_<year>" identifier.
    annual_grid = data.add_disturbance_counts(base_grid, year).map(set_id)

    task = ee.batch.Export.table.toAsset(
        collection=annual_grid,
        description=f"export_grid_with_disturbance_estimates_{year}",
        assetId=os.path.join(annual_grid_dir, f"disturbance_estimate_grid_{year}"),
    )
    task.start()

# Step 2. Select Cells from Grid to Create Train/Test/Val Datasets

In [None]:
# Asset ids of the per-year grids exported in Step 1.2, merged into a single
# flat collection.
annual_grids_dir = os.path.join(ASSET_PATH, "data", "annual_grids")
annual_grids_assets = [
    os.path.join(annual_grids_dir, f"disturbance_estimate_grid_{year}")
    for year in range(1985, 1996)
]
annual_grids = ee.FeatureCollection(
    [ee.FeatureCollection(asset_id) for asset_id in annual_grids_assets]
).flatten()

# perform the train/test/val splitting individually within each ecozone
ecozones = annual_grids.aggregate_array("ecozone").distinct()
ecozone_grids = [
    annual_grids.filter(ee.Filter.eq("ecozone", ecozone_id))
    for ecozone_id in ecozones.getInfo()
]

# Total area of the forest dominated ecozones; used to turn each ecozone's
# area into a fraction of the whole.
forested_ecozones = ee.FeatureCollection(
    "users/boothmanrylan/forest_dominated_ecozones"
)
total_forested_area = forested_ecozones.geometry().area()

def calc_area(ecozone_id):
    """Return the area of the forested ecozone with the given id.

    Args:
        ecozone_id: Value matched against the `ECOZONE_ID` property of
            `forested_ecozones`.

    Returns:
        The area of the geometry of the matching features.
    """
    matches_id = ee.Filter.eq("ECOZONE_ID", ecozone_id)
    return forested_ecozones.filter(matches_id).geometry().area()

# Area of each ecozone, and that area as a fraction of the total forested
# area (despite the name, the values are fractions rather than percentages).
ecozone_areas = ecozones.map(calc_area)
ecozone_areas_percentage = ecozone_areas.map(
    lambda area: ee.Number(area).divide(total_forested_area)
)

# select 1000 fire, 1000 harvest, and 500 undisturbed cells in total
# distributed across ecozones proportional to ecozone size

cell_counts = np.array([1000, 1000, 500])
splits = [0.7, 0.15, 0.15]
selected_cells = [
    data.sample_cells(
        ecozone_grid,
        *np.ceil(cell_counts * area_fraction).tolist(),
        *splits,
    )
    for ecozone_grid, area_fraction in zip(
        ecozone_grids, ecozone_areas_percentage.getInfo()
    )
]


def _merge_split(split_index):
    """Join one split (0: train, 1: test, 2: val) across all ecozones.

    The merged collection is sorted by the `shuffle` property so that cells
    from different ecozones are intermingled.
    """
    per_ecozone = [selection[split_index] for selection in selected_cells]
    return ee.FeatureCollection(per_ecozone).flatten().sort("shuffle")


# join the train/test/val groups from each ecozone
train_cells = _merge_split(0)
test_cells = _merge_split(1)
val_cells = _merge_split(2)

# export each group to Google Earth Engine
for split_name, split_cells in (
    ("train", train_cells),
    ("test", test_cells),
    ("val", val_cells),
):
    task = ee.batch.Export.table.toAsset(
        collection=split_cells,
        description=f"export_{split_name}_cells",
        assetId=os.path.join(ASSET_PATH, "data", f"{split_name}_cells"),
    )
    task.start()

In [None]:
def count_images(feat):
    """Record how many MSS images are available for a cell.

    The collection is requested from msslib at the centroid of the feature's
    geometry, restricted to the feature's `year` property and
    `data.DOY_RANGE`, with `maxCloudCover=100`.

    Args:
        feat: ee.Feature with a numeric `year` property.

    Returns:
        The feature with `num_images` set to the size of that collection.
    """
    cell_year = feat.getNumber("year")
    cell_center = feat.geometry().centroid(1)

    matching_images = msslib.getCol(
        aoi=cell_center,
        yearRange=[cell_year, cell_year],
        doyRange=data.DOY_RANGE,
        maxCloudCover=100,
    )
    return feat.set("num_images", matching_images.size())

# Sanity check on one exported split: count how many of its cells have no MSS
# imagery at all for their year. Prints (total cells, cells with zero images).
# The collection is named for what it holds so that it neither mislabels the
# split being inspected nor overwrites `train_cells` from the selection cell.
SPLIT_TO_CHECK = "val_cells"  # or "train_cells" / "test_cells"

cells_with_counts = ee.FeatureCollection(
    os.path.join(ASSET_PATH, "data", SPLIT_TO_CHECK)
).map(count_images)
cells_without_images = cells_with_counts.filter(ee.Filter.eq("num_images", 0))
print(cells_with_counts.size().getInfo(), cells_without_images.size().getInfo())

# Step 3. Export Image Patches

Based on https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/people-and-planet-ai/land-cover-classification
and https://github.com/google/earthengine-community/blob/master/guides/linked/Earth_Engine_training_patches_computePixels.ipynb

In [None]:
# Artifact Registry URI of the container image used by the Dataflow workers.
# The registry host is derived from LOCATION so the region is configured in
# one place instead of being repeated as a literal.
image_uri = f"{LOCATION}-docker.pkg.dev/{PROJECT}/dataflow-containers/dataflow/dockerfile:1.0"

In [None]:
# Submit the current directory to Cloud Build and tag the resulting image as
# `image_uri`.
# This only needs to be run once to create the docker image in the artifact
# registry.
!gcloud builds submit --tag {image_uri} .

In [None]:
temp_location = os.path.join(BUCKET, 'temp')
staging_location = os.path.join(BUCKET, 'staging')
output_prefix = os.path.join(BUCKET, 'scratch', 'test_export2')
input_asset = os.path.join(ASSET_PATH, 'data', 'train_cells')

!python dataflow_job.py \
    --runner='DataflowRunner' \
    --project='{PROJECT}' \
    --job_name='test-data-export' \
    --region='us-central1' \
    --temp_location='{temp_location}' \
    --staging_location='{staging_location}' \
    --num_workers=20 \
    --max-requests=20 \
    --input-asset='{input_asset}' \
    --output-prefix='{output_prefix}' \
    --experiments=use_runner_v2 \
    --sdk_container_image='{image_uri}' \
    --sdk_location=container

# Step 4. Verify TFRecords were Created Properly

In [None]:
from dataflow_job import BANDS

# Feature specs used to decode the exported TFRecords: one float32 patch per
# band, one int64 label patch, and two single-value int64 metadata fields.
PATCH_SHAPE = (512, 512)

IMAGE_FEATURES = dict(
    (band, tf.io.FixedLenFeature(shape=list(PATCH_SHAPE), dtype=tf.float32))
    for band in BANDS
)

LABEL_FEATURES = dict(
    label=tf.io.FixedLenFeature(shape=list(PATCH_SHAPE), dtype=tf.int64)
)

METADATA_FEATURES = dict(
    (key, tf.io.FixedLenFeature(shape=1, dtype=tf.int64))
    for key in ("ecozone", "doy")
)

def parse(example_proto):
    """Decode one serialized example into its image, metadata and label parts.

    Args:
        example_proto: A serialized example read from the exported TFRecords.

    Returns:
        A tuple `(image, metadata, label)` of dicts, parsed with
        `IMAGE_FEATURES`, `METADATA_FEATURES` and `LABEL_FEATURES`
        respectively.
    """
    def _decode(feature_spec):
        return tf.io.parse_single_example(example_proto, feature_spec)

    image_part = _decode(IMAGE_FEATURES)
    metadata_part = _decode(METADATA_FEATURES)
    label_part = _decode(LABEL_FEATURES)
    return image_part, metadata_part, label_part

# Read the exported TFRecords back and plot a few patches next to their
# labels to confirm that the export can be decoded.
files = tf.data.Dataset.list_files(f"{output_prefix}/*/*.tfrecord.gz")
dataset = tf.data.TFRecordDataset(files, compression_type="GZIP")
dataset = dataset.map(parse, num_parallel_calls=5)

# Display stretch applied to the first three bands before plotting.
VIS_MIN, VIS_MAX = 0.02, 0.08

for im, m, label in dataset.take(5):
    # Stack the per-band patches into a single (rows, cols, bands) array.
    im = tf.stack([im[b] for b in BANDS], axis=-1)
    label = label["label"]

    # imshow ignores vmin/vmax for RGB(A) input and clips floats to [0, 1],
    # so the stretch has to be applied to the data itself; otherwise values
    # in the 0.02-0.08 range render as an almost black image.
    rgb = np.clip(
        (im[:, :, :3].numpy() - VIS_MIN) / (VIS_MAX - VIS_MIN), 0, 1
    )

    fig, axes = plt.subplots(1, 2, squeeze=True)
    axes[0].imshow(rgb)
    axes[0].set_title("image (first three bands)")
    axes[1].imshow(label)
    axes[1].set_title("label")
    plt.show()
    print([(k, v.numpy()) for k, v in m.items()])
