From 79ab472ce74774fb4f7e009d02e3637aa5b630fe Mon Sep 17 00:00:00 2001
From: James McClain
Date: Tue, 31 Jul 2018 14:15:39 -0400
Subject: [PATCH] Support remote (S3) URIs in the TF DeepLab backend

Stage the DeepLab backend's input and output through a local temporary
directory: training records are written locally and pushed with
upload_if_needed, remote inputs are fetched with download_if_needed,
and the training log directory is synced to S3 with sync_dir. Rename
the tf_initial_checkpoint option to tf_initial_checkpoints_uri, which
now points at a (possibly remote) checkpoint tarball that is downloaded
and untarred before training. Move write_tf_record into
tf_object_detection_api so both backends can share it, and add a sample
remote workflow config.
---
 .../object_detection_geojson_file_test.py | 3 +-
 src/rastervision/ml_backends/tf_deeplab.py | 143 ++++++++++++------
 .../ml_backends/tf_object_detection_api.py | 16 +-
 .../ml_tasks/semantic_segmentation.py | 2 -
 src/rastervision/protos/train.proto | 2 +-
 .../segmentation/deeplab-test-remote.json | 94 ++++++++++++
 .../segmentation/deeplab-test.json | 2 +-
 src/rastervision/utils/files.py | 9 +-
 8 files changed, 209 insertions(+), 62 deletions(-)
 create mode 100644 src/rastervision/samples/workflow-configs/segmentation/deeplab-test-remote.json

diff --git a/src/rastervision/label_stores/object_detection_geojson_file_test.py b/src/rastervision/label_stores/object_detection_geojson_file_test.py
index ce504ba7af..34224dd008 100644
--- a/src/rastervision/label_stores/object_detection_geojson_file_test.py
+++ b/src/rastervision/label_stores/object_detection_geojson_file_test.py
@@ -13,8 +13,7 @@
 from rastervision.core.crs_transformer import CRSTransformer
 from rastervision.core.box import Box
 from rastervision.core.class_map import ClassMap, ClassItem
-from rastervision.utils.files import (NotReadableError,
-                                      NotWritableError)
+from rastervision.utils.files import (NotReadableError, NotWritableError)
 
 
 class DoubleCRSTransformer(CRSTransformer):
diff --git a/src/rastervision/ml_backends/tf_deeplab.py b/src/rastervision/ml_backends/tf_deeplab.py
index 21d5dbc2cb..33f7f10585 100644
--- a/src/rastervision/ml_backends/tf_deeplab.py
+++ b/src/rastervision/ml_backends/tf_deeplab.py
@@ -1,6 +1,9 @@
 import io
+import os
+import glob
 import numpy as np
 import shutil
+import tarfile
 import tempfile
 import tensorflow as tf
 import uuid
@@ -10,6 +13,7 @@
 from subprocess import Popen
 from tensorflow.core.example.example_pb2 import Example
 from typing import (List, Tuple)
+from urllib.parse import urlparse
 
 from object_detection.utils import dataset_util
 from rastervision.core.box import Box
@@ -18,9 +22,11 @@
 from rastervision.core.scene import Scene
 from rastervision.core.training_data import TrainingData
 from rastervision.ml_backends.tf_object_detection_api import (
-    terminate_at_exit, TRAIN, VALIDATION)
-from rastervision.utils.files import make_dir
+    write_tf_record, terminate_at_exit, TRAIN, VALIDATION)
 from rastervision.utils.misc import save_img
+from rastervision.utils.files import (get_local_path, upload_if_needed,
+                                      make_dir, download_if_needed,
+                                      sync_dir)
 
 
 def numpy_to_png(array: np.ndarray) -> str:
@@ -57,23 +63,6 @@ def png_to_numpy(png: str, dtype=np.uint8) -> np.ndarray:
         return np.array(im)
 
 
-def write_tf_record(tf_examples: List[Example], output_path: str) -> None:
-    """Write an array of TFRecords to the given output path.
-
-    Args:
-        tf_examples: An array of TFRecords; a
-            list(tensorflow.core.example.example_pb2.Example)
-        output_path: The path where the records should be stored.
-
-    Returns:
-        None
-
-    """
-    with tf.python_io.TFRecordWriter(output_path) as writer:
-        for tf_example in tf_examples:
-            writer.write(tf_example.SerializeToString())
-
-
 def make_tf_examples(training_data: TrainingData,
                      class_map: ClassMap) -> List[Example]:
     """Take training data and a class map and return a list of TFRecords.
@@ -227,6 +216,10 @@ def fn(n):
     return tf.train.Example(features=features)
 
 
+def get_record_uri(uri: str, split: str) -> str:
+    return join(uri, '{}-0.record'.format(split))
+
+
 class TFDeeplab(MLBackend):
     """MLBackend-derived type that implements the TensorFlow DeepLab backend.
 
@@ -234,9 +227,9 @@ class TFDeeplab(MLBackend):
     """
 
     def __init__(self):
-        """Constructor"""
-        # persist scene training packages for when output_uri is remote
-        self.scene_training_packages = []
+        """Constructor."""
+        self.temp_dir_obj = tempfile.TemporaryDirectory()
+        self.temp_dir = self.temp_dir_obj.name
 
     def process_scene_data(self, scene: Scene, data: TrainingData,
                            class_map: ClassMap, options) -> str:
@@ -253,15 +246,17 @@ def process_scene_data(self, scene: Scene, data: TrainingData,
                 the config file.
 
         Returns:
-            The path to the generated file.
+            The local path to the generated file.
 
         """
-        base_uri = options.output_uri
-        make_dir(base_uri)
-
         tf_examples = make_tf_examples(data, class_map)
+
+        base_uri = options.output_uri
         split = '{}-{}'.format(scene.id, uuid.uuid4())
         record_path = join(base_uri, '{}.record'.format(split))
+        record_path = get_local_path(record_path, self.temp_dir)
+
+        make_dir(record_path, use_dirname=True)
         write_tf_record(tf_examples, record_path)
 
         return record_path
@@ -273,7 +268,6 @@ def process_sceneset_results(self, training_results: List[str],
         (one for training data and one for validation data).
 
         Args:
-
             training_results: A list of paths to TFRecords
                               containing training data.
             validation_results: A list of paths to TFRecords
@@ -287,38 +281,84 @@ def process_sceneset_results(self, training_results: List[str],
             None
 
         """
         base_uri = options.output_uri
-
-        training_record_path = join(base_uri, '{}-0.record'.format(TRAIN))
-        validation_record_path = join(base_uri,
-                                      '{}-0.record'.format(VALIDATION))
-        merge_tf_records(training_record_path, training_results)
-        merge_tf_records(validation_record_path, validation_results)
+        training_record_path = get_record_uri(base_uri, TRAIN)
+        training_record_path_local = get_local_path(training_record_path,
+                                                    self.temp_dir)
+        validation_record_path = get_record_uri(base_uri, VALIDATION)
+        validation_record_path_local = get_local_path(validation_record_path,
+                                                      self.temp_dir)
+
+        make_dir(training_record_path_local, use_dirname=True)
+        make_dir(validation_record_path_local, use_dirname=True)  # sic
+        merge_tf_records(training_record_path_local, training_results)
+        merge_tf_records(validation_record_path_local, validation_results)
+        upload_if_needed(training_record_path_local, training_record_path)
+        upload_if_needed(validation_record_path_local, validation_record_path)
 
         if options.debug:
             training_zip_path = join(base_uri, '{}'.format(TRAIN))
+            training_zip_path_local = get_local_path(training_zip_path,
+                                                     self.temp_dir)
             validation_zip_path = join(base_uri, '{}'.format(VALIDATION))
+            validation_zip_path_local = get_local_path(validation_zip_path,
+                                                       self.temp_dir)
+
             with tempfile.TemporaryDirectory() as debug_dir:
-                make_debug_images(training_record_path, debug_dir)
-                shutil.make_archive(training_zip_path, 'zip', debug_dir)
+                make_debug_images(training_record_path_local, debug_dir)
+                shutil.make_archive(training_zip_path_local, 'zip', debug_dir)
             with tempfile.TemporaryDirectory() as debug_dir:
-                make_debug_images(validation_record_path, debug_dir)
-                shutil.make_archive(validation_zip_path, 'zip', debug_dir)
+                make_debug_images(validation_record_path_local, debug_dir)
+                shutil.make_archive(validation_zip_path_local, 'zip',
+                                    debug_dir)
+
+            upload_if_needed('{}.zip'.format(training_zip_path_local),
+                             '{}.zip'.format(training_zip_path))
+            upload_if_needed('{}.zip'.format(validation_zip_path_local),
+                             '{}.zip'.format(validation_zip_path))
+
+    def train(self, class_map: ClassMap, options) -> None:
+        """Train a DeepLab model using the options provided in the
+        `segmentation_options` section of the workflow config file.
 
-    def train(self, class_map, options):
-        import pdb
-        pdb.set_trace()
+        Args:
+            class_map: A mapping between integral and textual classes.
+            options: Options provided in the `segmentation_options`
+                section of the workflow configuration file.
+
+        Returns:
+            None
+        """
         soptions = options.segmentation_options
-        train_logdir = options.output_uri
-        dataset_dir = options.training_data_uri
         train_py = soptions.train_py
-        tf_initial_checkpoints = soptions.tf_initial_checkpoint
 
+        # Set up local input and output directories.
+        train_logdir = options.output_uri
+        # XXX in spite of the prohibition, it might make sense to log
+        # directly to S3 in the remote case.
+        train_logdir_local = get_local_path(train_logdir, self.temp_dir)
+        dataset_dir = options.training_data_uri
+        dataset_dir_local = get_local_path(dataset_dir, self.temp_dir)
+        make_dir(train_logdir_local)
+        make_dir(dataset_dir_local)  # sic
+        download_if_needed(get_record_uri(dataset_dir, TRAIN), self.temp_dir)
+
+        # Download and untar the initial checkpoint.
+        tf_initial_checkpoints_uri = soptions.tf_initial_checkpoints_uri
+        make_dir(self.temp_dir)
+        download_if_needed(tf_initial_checkpoints_uri, self.temp_dir)
+        tfic_tarball = get_local_path(tf_initial_checkpoints_uri,
+                                      self.temp_dir)
+        tfic_dir = os.path.dirname(tfic_tarball)
+        with tarfile.open(tfic_tarball, 'r:gz') as tar:
+            tar.extractall(tfic_dir)
+        tfic_index = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
+
+        # Build the array of arguments that will be used to run the
+        # DeepLab training script.
args = ['python', train_py] - args.append('--train_logdir={}'.format(train_logdir)) - args.append( - '--tf_initial_checkpoint={}'.format(tf_initial_checkpoints)) - args.append('--dataset_dir={}'.format(dataset_dir)) + args.append('--train_logdir={}'.format(train_logdir_local)) + args.append('--tf_initial_checkpoint={}'.format(tfic_index)) + args.append('--dataset_dir={}'.format(dataset_dir_local)) args.append('--training_number_of_steps={}'.format( soptions.training_number_of_steps)) if len(soptions.train_split) > 0: @@ -336,12 +376,17 @@ def train(self, class_map, options): if len(soptions.dataset): args.append('--dataset="{}"'.format(soptions.dataset)) + # Train train_process = Popen(args) terminate_at_exit(train_process) + train_process.wait() - # XXX tensorboard + if urlparse(train_logdir).scheme == 's3': + sync_dir(train_logdir_local, train_logdir, delete=True) - train_process.wait() + # XXX tensorboard def predict(self, chip, options): + import pdb + pdb.set_trace() return 1 diff --git a/src/rastervision/ml_backends/tf_object_detection_api.py b/src/rastervision/ml_backends/tf_object_detection_api.py index f2d603607d..436cabe39a 100644 --- a/src/rastervision/ml_backends/tf_object_detection_api.py +++ b/src/rastervision/ml_backends/tf_object_detection_api.py @@ -16,6 +16,9 @@ import numpy as np from google.protobuf import text_format +from tensorflow.core.example.example_pb2 import Example +from typing import List + from object_detection.utils import dataset_util from object_detection.protos.string_int_label_map_pb2 import ( StringIntLabelMap, StringIntLabelMapItem) @@ -84,7 +87,18 @@ def create_tf_example(image, window, labels, class_map, chip_id=''): return tf_example -def write_tf_record(tf_examples, output_path): +def write_tf_record(tf_examples: List[Example], output_path: str) -> None: + """Write an array of TFRecords to the given output path. + + Args: + tf_examples: An array of TFRecords; a + list(tensorflow.core.example.example_pb2.Example) + output_path: The path where the records should be stored. 
+ + Returns: + None + + """ with tf.python_io.TFRecordWriter(output_path) as writer: for tf_example in tf_examples: writer.write(tf_example.SerializeToString()) diff --git a/src/rastervision/ml_tasks/semantic_segmentation.py b/src/rastervision/ml_tasks/semantic_segmentation.py index 0281e863b7..374255c209 100644 --- a/src/rastervision/ml_tasks/semantic_segmentation.py +++ b/src/rastervision/ml_tasks/semantic_segmentation.py @@ -39,8 +39,6 @@ def get_train_windows(self, scene: Scene, options) -> List[Box]: return windows def get_train_labels(self, window, scene, options): - import pdb - pdb.set_trace() label_store = scene.ground_truth_label_store chip = label_store.src._get_chip(window) fn = label_store.fn diff --git a/src/rastervision/protos/train.proto b/src/rastervision/protos/train.proto index 71a60ef881..86273478a3 100644 --- a/src/rastervision/protos/train.proto +++ b/src/rastervision/protos/train.proto @@ -19,7 +19,7 @@ message TrainConfig { message SegmentationOptions { optional string train_py = 1 [default="/opt/tf-models/deeplab/train.py"]; - required string tf_initial_checkpoint = 2; + required string tf_initial_checkpoints_uri = 2; optional int32 training_number_of_steps = 3 [default=1]; optional string train_split = 4; optional string model_variant = 5; diff --git a/src/rastervision/samples/workflow-configs/segmentation/deeplab-test-remote.json b/src/rastervision/samples/workflow-configs/segmentation/deeplab-test-remote.json new file mode 100644 index 0000000000..c3650b5d74 --- /dev/null +++ b/src/rastervision/samples/workflow-configs/segmentation/deeplab-test-remote.json @@ -0,0 +1,94 @@ +{ + "train_scenes": [ + { + "id": "2-10", + "raster_source": { + "geotiff_files": { + "uris": [ + "/opt/data/top_potsdam_2_10_RGBIR.tif" + ] + } + }, + "ground_truth_label_store": { + "segmentation_raster_file": { + "src": { + "geotiff_files": { + "uris": [ + "/opt/data/top_potsdam_2_10_label_georeferenced.tif" + ] + } + }, + "src_classes": [ 6, 1 ], + "dst_classes": [ 1, 0 ] + } + } + } + ], + "test_scenes": [ + { + "id": "2-11", + "raster_source": { + "geotiff_files": { + "uris": [ + "/opt/data/top_potsdam_2_11_RGBIR.tif" + ] + } + }, + "ground_truth_label_store": { + "segmentation_raster_file": { + "src": { + "geotiff_files": { + "uris": [ + "/opt/data/top_potsdam_2_11_label_georeferenced.tif" + ] + } + }, + "src_classes": [ 6 ], + "dst_classes": [ 1 ] + } + } + } + ], + "machine_learning": { + "task": "SEMANTIC_SEGMENTATION", + "backend": "TF_DEEPLAB", + "class_items": [ + { + "id": 1, + "name": "car" + } + ] + }, + "make_training_chips_options": { + "segmentation_options": { + } + }, + "train_options": { + "pretrained_model_uri": "/dev/null", + "backend_config_uri": "/dev/null", + "sync_interval": 600, + "segmentation_options": { + "tf_initial_checkpoints_uri": "{rv_root}/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz" + } + }, + "predict_options": { + "segmentation_options": { + } + }, + "eval_options": { + }, + "debug": true, + "chip_size": 300, + "raster_transformer": { + "channel_order": [0, 1, 2] + }, + "remote_uri_map": { + "rv_root": "s3://raster-vision-mcclain", + "raw": "s3://raster-vision-mcclain/raw-data" + }, + "raw_dataset_key": "cowc-potsdam", + "dataset_key": "test", + "model_key": "deeplab", + "prediction_key": "test-set", + "eval_key": "default" +} diff --git a/src/rastervision/samples/workflow-configs/segmentation/deeplab-test.json b/src/rastervision/samples/workflow-configs/segmentation/deeplab-test.json index 742d90325a..d2f8df219b 100644 --- 
a/src/rastervision/samples/workflow-configs/segmentation/deeplab-test.json +++ b/src/rastervision/samples/workflow-configs/segmentation/deeplab-test.json @@ -68,7 +68,7 @@ "backend_config_uri": "/dev/null", "sync_interval": 600, "segmentation_options": { - "tf_initial_checkpoint": "/opt/data/deeplabv3_mnv2_pascal_train_aug/model.ckpt-30000.index" + "tf_initial_checkpoints_uri": "/opt/data/deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz" } }, "predict_options": { diff --git a/src/rastervision/utils/files.py b/src/rastervision/utils/files.py index a54f1f9281..b95a7a2e8d 100644 --- a/src/rastervision/utils/files.py +++ b/src/rastervision/utils/files.py @@ -185,8 +185,7 @@ def upload_if_needed(src_path, dst_uri): s3.upload_file(src_path, parsed_uri.netloc, parsed_uri.path[1:]) except Exception: - raise NotWritableError( - 'Could not write {}'.format(dst_uri)) + raise NotWritableError('Could not write {}'.format(dst_uri)) else: sync_dir(src_path, dst_uri, delete=True) @@ -213,8 +212,7 @@ def file_to_str(file_uri): file_buffer) return file_buffer.getvalue().decode('utf-8') except botocore.exceptions.ClientError: - raise NotReadableError( - 'Could not read {}'.format(file_uri)) + raise NotReadableError('Could not read {}'.format(file_uri)) else: if not os.path.isfile(file_uri): raise NotReadableError('Could not read {}'.format(file_uri)) @@ -241,8 +239,7 @@ def str_to_file(content_str, file_uri): s3 = boto3.client('s3') s3.upload_fileobj(str_buffer, bucket, key) except Exception: - raise NotWritableError( - 'Could not write {}'.format(file_uri)) + raise NotWritableError('Could not write {}'.format(file_uri)) else: make_dir(file_uri, use_dirname=True) with open(file_uri, 'w') as content_file: