# Data preparation, modeling and classification

## Setup

### Imports

In [1]:
from google.colab import drive
import tensorflow as tf
from pprint import pprint
from tensorflow import keras
import os
import json

In [2]:
# set TensorFlow's global random seed to a fixed value so repeated runs start from the same seed
tf.random.set_seed(42)

### Authentication

In [3]:
# mount drive if not already mounted
# NOTE(review): the relative 'drive/MyDrive/...' paths used below presumably rely on
# the working directory being /content - confirm
drive.mount('/content/drive')

Mounted at /content/drive


### Variables

In [4]:
# name of the folder under drive/MyDrive/ that holds the input files and receives the classified image
OUTPUT_FOLDER = 'farm_plots'

In [5]:
# file name (without extension) of the training data file
TRAIN_FILE_PREFIX = 'train'

In [6]:
# file name (without extension) of the test data file
TEST_FILE_PREFIX = 'test'

In [7]:
# extension appended to the train and test file names (the files are read with GZIP compression)
FILE_EXTENSION = '.tfrecord.gz'

In [8]:
# path of the training file inside the mounted Drive
TRAIN_FILE_PATH = f'drive/MyDrive/{OUTPUT_FOLDER}/{TRAIN_FILE_PREFIX}{FILE_EXTENSION}'

In [9]:
# path of the test file inside the mounted Drive
TEST_FILE_PATH = f'drive/MyDrive/{OUTPUT_FOLDER}/{TEST_FILE_PREFIX}{FILE_EXTENSION}'

In [10]:
# names of the band features that are parsed from the records and used as predictors
BANDS = ['B2', 'B3', 'B4', 'B8']

In [11]:
# name of the feature that holds the class label
LABEL = 'landcover'

In [12]:
# number of classes: used as the one-hot depth of the labels and as the size of the model's output layer
N_CLASSES = 3

In [13]:
# start from a copy of BANDS so that changes to FEATURE_NAMES never touch BANDS itself
FEATURE_NAMES = [*BANDS]

In [14]:
# feature names = band names followed by the label name
# built from BANDS instead of appending in place, so re-running this cell cannot add the label twice
FEATURE_NAMES = BANDS + [LABEL]

In [15]:
# marker used to select the exported image files from the listing of the Drive folder
IMAGE_FILE_PREFIX = 'image'

In [16]:
# path of the TFRecord file the classified image is written to
OUTPUT_IMAGE_FILE = f'drive/MyDrive/{OUTPUT_FOLDER}/classified_image.TFRecord'

### Functions

In [17]:
def parse_tfrecord(example_proto):

  """Parse one serialized example into predictors and label.

  The example is read into the structure defined by the module-level
  `features_dict`; the entry named by `LABEL` is removed from the parsed
  dictionary and returned separately.

  Args:
    example_proto: a serialized Example.

  Returns:
    A tuple of the predictors dictionary and the label, cast to an `int32`.
  """

  predictors = tf.io.parse_single_example(example_proto, features_dict)
  target = tf.cast(predictors.pop(LABEL), tf.int32)

  return predictors, target

In [18]:
def normalized_difference(a, b):

  """Compute the normalized difference of two inputs.

  Computes (a - b) / (a + b). Wherever that result is not finite (e.g. the
  denominator is zero), the value of (a - b) / (a + b + 0.000001) is used
  instead.

  Args:
    a: an input tensor with shape=[1]
    b: an input tensor with shape=[1]

  Returns:
    The normalized difference as a tensor.
  """

  numerator = a - b
  denominator = a + b

  plain_ratio = numerator / denominator
  # fallback with a small delta added to the denominator
  padded_ratio = numerator / (denominator + 0.000001)

  return tf.where(tf.math.is_finite(plain_ratio), plain_ratio, padded_ratio)

In [19]:
def add_ndvi(features, label):

  """Add an 'NDVI' entry to the features of one record.

  The value is the normalized difference of the 'B8' and 'B4' features.
  The input dictionary itself is modified.

  Args:
    features: a dictionary of input tensors keyed by feature name.
    label: the target label, passed through unchanged

  Returns:
    A tuple of the input dictionary with an NDVI tensor added and the label.
  """

  band_8 = features['B8']
  band_4 = features['B4']
  features['NDVI'] = normalized_difference(band_8, band_4)

  return features, label

In [20]:
def to_tuple(inputs, label):

  """Convert a (features dictionary, label) record into a tuple for Keras.

  Keras requires inputs as a tuple, and the inputs must be in the right
  shape: the feature values are stacked into one tensor and transposed.
  To use categorical_crossentropy loss the label is turned into a one-hot
  vector of depth `N_CLASSES`.

  Args:
    inputs: a dictionary of input tensors keyed by feature name.
    label: the target label

  Returns:
    A tuple of the transposed feature tensor and the one-hot label.
  """

  stacked_features = tf.transpose(list(inputs.values()))
  one_hot_label = tf.one_hot(indices=label, depth=N_CLASSES)

  return (stacked_features, one_hot_label)

In [21]:
def parse_image(example_proto):

  """Parse one serialized image example.

  The example is read into the structure defined by the module-level
  `image_features_dict`.

  Args:
    example_proto: a serialized Example.

  Returns:
    The dictionary of parsed features.
  """

  parsed_patch = tf.io.parse_single_example(example_proto, image_features_dict)

  return parsed_patch

## Prepare and pre-process data

In [22]:
# create dataset from TFRecord file
# the file is gzip-compressed, hence compression_type='GZIP'
train_dataset = tf.data.TFRecordDataset(TRAIN_FILE_PATH, compression_type='GZIP')

In [23]:
# one fixed-length float32 feature of shape [1] per feature name
columns = [tf.io.FixedLenFeature(shape=[1], dtype=tf.float32) for _ in FEATURE_NAMES]

In [24]:
# parsing dictionary: feature name -> feature definition
features_dict = {name: column for name, column in zip(FEATURE_NAMES, columns)}
pprint(features_dict)

{'B2': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'B3': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'B4': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'B8': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None),
 'landcover': FixedLenFeature(shape=[1], dtype=tf.float32, default_value=None)}


In [25]:
# map the parsing function over the dataset: every record becomes (features dictionary, label)
parsed_dataset = train_dataset.map(parse_tfrecord, num_parallel_calls=5)
# peek at the first parsed record; the built-in next() works on any iterator and
# does not depend on the iterator object offering a .next() method
pprint(next(iter(parsed_dataset)))

({'B2': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.03195], dtype=float32)>,
  'B3': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.05305], dtype=float32)>,
  'B4': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.03505], dtype=float32)>,
  'B8': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.2423], dtype=float32)>},
 <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)


In [26]:
# add NDVI to dataset (add_ndvi computes it per record from the B8 and B4 features)
input_dataset = parsed_dataset.map(add_ndvi)

In [27]:
# build the training input directly from parsed_dataset so that this cell can be
# re-run safely: add NDVI, map the to_tuple function and batch
# note that the records are NOT shuffled
input_dataset = parsed_dataset.map(add_ndvi).map(to_tuple).batch(8)

## Setup and train model

In [28]:
# define model: three Dense(64, relu) + Dropout(0.2) pairs followed by a softmax output layer
hidden_layers = [
    layer
    for _ in range(3)
    for layer in (tf.keras.layers.Dense(64, activation=tf.nn.relu),
                  tf.keras.layers.Dropout(0.2))
]
output_layer = tf.keras.layers.Dense(N_CLASSES, activation=tf.nn.softmax)

model = tf.keras.models.Sequential(hidden_layers + [output_layer])

In [29]:
# compile model
# categorical_crossentropy matches the one-hot labels produced by to_tuple
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [30]:
# fit model
# input_dataset already yields batches of (inputs, one-hot labels), so no separate y argument is passed

model.fit(x=input_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fbc12333490>

In [31]:
# evaluate model in test dataset
# the test file goes through the same steps as the training data: parse, add NDVI, convert to tuples

test_records = tf.data.TFRecordDataset(TEST_FILE_PATH, compression_type='GZIP')
test_parsed = test_records.map(parse_tfrecord, num_parallel_calls=5)
test_with_ndvi = test_parsed.map(add_ndvi)
test_dataset = test_with_ndvi.map(to_tuple).batch(1)

model.evaluate(test_dataset)



[0.49376755952835083, 0.9642857313156128]

## Classify image from Earth Engine

In [32]:
# names of all files in the output folder
files_list = os.listdir(f'drive/MyDrive/{OUTPUT_FOLDER}')

In [33]:
# get only files generated by image export: their names start with IMAGE_FILE_PREFIX
# (a substring test would also match other files in the folder whose names merely
# contain the prefix, e.g. the classified image written to OUTPUT_IMAGE_FILE)
exported_files_list = [s for s in files_list if s.startswith(IMAGE_FILE_PREFIX)]

In [34]:
# split the exported files into the image files and the JSON mixer file

image_files_list = [name for name in exported_files_list if name.endswith('.tfrecord.gz')]

# if several JSON files are present the last one in the listing is used; None if there is none
json_matches = [name for name in exported_files_list if name.endswith('.json')]
json_file = json_matches[-1] if json_matches else None

In [35]:
# make sure files are in right order
image_files_list.sort()

# use the mixer file that was actually found in the folder instead of a hard-coded name,
# and fail with a clear message when the export did not produce one
if json_file is None:
  raise FileNotFoundError('No JSON mixer file found in drive/MyDrive/' + OUTPUT_FOLDER)

json_file_path = 'drive/MyDrive/' + OUTPUT_FOLDER + '/' + json_file
pprint(image_files_list)
print(json_file)
print(json_file_path)

['image-00000.tfrecord.gz']
image-mixer.json
drive/MyDrive/farm_plots/image-mixer.json


In [36]:
# read the contents of the mixer file through the shell; IPython captures the output of `cat`
# NOTE(review): a plain open() + json.load() would avoid the shell dependency - consider switching
json_text = !cat {json_file_path}

In [37]:
# get single string w/ newlines from IPython.utils.text.SList and parse it as JSON
mixer = json.loads(json_text.nlstr)
pprint(mixer)

{'patchDimensions': [256, 256],
 'patchesPerRow': 6,
 'projection': {'affine': {'doubleMatrix': [8.983152841195215e-05,
                                            0.0,
                                            23.493010804878963,
                                            0.0,
                                            -8.983152841195215e-05,
                                            -16.639853333859545]},
                'crs': 'EPSG:4326'},
 'totalPatches': 12}


In [38]:
# path inside the mounted Drive of every exported image file
image_files_list_path = [f'drive/MyDrive/{OUTPUT_FOLDER}/{file_name}' for file_name in image_files_list]

In [39]:
# get relevant info from JSON mixer file
patch_dimensions = mixer['patchDimensions']
patch_width = patch_dimensions[0]
patch_height = patch_dimensions[1]
patches = mixer['totalPatches']

# shape used to parse one band of one patch: all pixels of the patch in a single column
patch_dimensions_flat = [patch_width * patch_height, 1]

In [40]:
# note that tensors are in the shape of a patch, one patch for each band
image_columns = [tf.io.FixedLenFeature(shape=patch_dimensions_flat, dtype=tf.float32) for _ in BANDS]

In [41]:
# parsing dictionary: band name -> feature definition
image_features_dict = {band: column for band, column in zip(BANDS, image_columns)}

In [42]:
# note that you can make one dataset from many files by specifying a list
# the image files are gzip-compressed, hence compression_type='GZIP'
image_dataset = tf.data.TFRecordDataset(image_files_list_path, compression_type='GZIP')

In [43]:
# parse data into tensors, one long tensor per patch
# (each band is parsed with shape [patch_width * patch_height, 1], see image_features_dict)
image_dataset = image_dataset.map(parse_image, num_parallel_calls=5)

In [44]:
# break our long tensors into many little ones
# from_tensor_slices slices along the first axis, so every patch becomes one record per pixel
image_dataset = image_dataset.flat_map(lambda features: tf.data.Dataset.from_tensor_slices(features))

In [45]:
# add additional features (NDVI); the image records have no label, so None is passed
# as the label and only the features part of add_ndvi's result is kept
image_dataset = image_dataset.map(lambda features: add_ndvi(features, None)[0])

In [46]:
# turn dictionary in each record into a tuple without a label
# (same transpose of the stacked feature values as in to_tuple, wrapped in a 1-tuple)
image_dataset = image_dataset.map(lambda data_dict: (tf.transpose(list(data_dict.values())), ))

In [47]:
# turn each patch into a batch (one batch holds patch_width * patch_height pixel records)
image_dataset = image_dataset.batch(patch_width * patch_height)

In [48]:
# run prediction in batches, with as many steps as there are patches
# (the dataset was batched per patch, so each batch is one patch)
predictions = model.predict(image_dataset, steps=patches, verbose=1)



In [49]:
# note that predictions come as a numpy array
# show the first entry: the class probabilities predicted for the first pixel record
print(predictions[0])

[[0.51863724 0.04946535 0.43189737]]


## Write classified image to Drive

In [50]:
# announce the destination of the classified image
print(f'Writing to file {OUTPUT_IMAGE_FILE}')

Writing to file drive/MyDrive/farm_plots/classified_image.TFRecord


In [51]:
# instantiate writer for the output file (it has to be closed once all records are written)
writer = tf.io.TFRecordWriter(OUTPUT_IMAGE_FILE)

In [52]:
# every patch-worth of predictions we'll dump an example into the output file: one feature
# holds the predicted class ids, three more hold the per-class probabilities
# since our predictions are already in the order of the exported data, the patches we create here will also be in the right order

pixels_per_patch = patch_width * patch_height

# predictions holds one (1, n_classes) block of class probabilities per pixel;
# drop the singleton axis so that every row is one pixel
probabilities = predictions.reshape(-1, predictions.shape[-1])

# most likely class of every pixel, computed for the whole array at once
# instead of running one TensorFlow op per pixel
class_ids = probabilities.argmax(axis=1)

# only complete patches are written; trailing pixels that do not fill a patch are left out
full_patches = len(probabilities) // pixels_per_patch

for cur_patch in range(1, full_patches + 1):
  print('Done with patch ' + str(cur_patch) + ' of ' + str(patches) + '...')

  # rows belonging to the current patch
  start = (cur_patch - 1) * pixels_per_patch
  stop = start + pixels_per_patch
  patch_probabilities = probabilities[start:stop]

  # create an example

  example = tf.train.Example(
      features=tf.train.Features(
          feature={
              'prediction': tf.train.Feature(
                  int64_list=tf.train.Int64List(
                      value=class_ids[start:stop].tolist())),
              'vegetation': tf.train.Feature(
                  float_list=tf.train.FloatList(
                      value=patch_probabilities[:, 0].tolist())),
              'water': tf.train.Feature(
                  float_list=tf.train.FloatList(
                      value=patch_probabilities[:, 1].tolist())),
              'farm_plots': tf.train.Feature(
                  float_list=tf.train.FloatList(
                      value=patch_probabilities[:, 2].tolist())),
          }
      )
  )

  # write the example to the file

  writer.write(example.SerializeToString())

Done with patch 1 of 12...
Done with patch 2 of 12...
Done with patch 3 of 12...
Done with patch 4 of 12...
Done with patch 5 of 12...
Done with patch 6 of 12...
Done with patch 7 of 12...
Done with patch 8 of 12...
Done with patch 9 of 12...
Done with patch 10 of 12...
Done with patch 11 of 12...
Done with patch 12 of 12...


In [53]:
# close writer to finalize the output file
writer.close()

## Upload classified image to Earth Engine

Uploads via the command line only work with Cloud Storage, not with Drive. Thus, the classified image in TFRecord format and the respective mixer file in JSON format need to be uploaded to Earth Engine manually.