Ability to save shapes #341

Merged: 41 commits, Sep 10, 2020

Commits:
ac3c250  WIP saveshape (rahul003, Aug 10, 2020)
34084cc  Add shape writer (rahul003, Aug 10, 2020)
e8a6d98  Add pytorch test (rahul003, Aug 11, 2020)
907cf64  Add untested keras test (rahul003, Aug 11, 2020)
86842e6  fix syntax (rahul003, Aug 11, 2020)
651c440  fix syntax (rahul003, Aug 11, 2020)
fc25940  Import (rahul003, Aug 11, 2020)
1357f5d  Import (rahul003, Aug 11, 2020)
44358ee  Add tests for TF (rahul003, Aug 12, 2020)
f146c77  Simplify read code (rahul003, Aug 13, 2020)
5906e5a  Add read API and tests (rahul003, Aug 13, 2020)
681e35c  Add mxnet test (rahul003, Aug 14, 2020)
5dc47ff  Add s3 and json tests (rahul003, Aug 14, 2020)
c775942  lint (NihalHarish, Aug 14, 2020)
355be0b  Fix payload (rahul003, Aug 17, 2020)
3eb0202  fix import (rahul003, Aug 17, 2020)
c14a67e  Handle different num tensors for losses (rahul003, Aug 17, 2020)
d12b824  Fix exact equal condition (rahul003, Aug 17, 2020)
972d95a  Fix mode bug (rahul003, Aug 17, 2020)
850cc44  trigger CI (rahul003, Aug 18, 2020)
2c44796  Add support for distributed training with writer map (rahul003, Aug 19, 2020)
1b09b8e  Check that value throws exception (rahul003, Aug 19, 2020)
f4106f3  Fix tests to make them more resilient (rahul003, Aug 19, 2020)
78b67d6  Fix mxnet and pytorch tests (rahul003, Aug 19, 2020)
2515a2d  Remove tensor names (rahul003, Aug 19, 2020)
7f3ea4e  pre-commmit (NihalHarish, Aug 19, 2020)
cdf6578  Fix get_mode (rahul003, Aug 19, 2020)
d16d1de  Fix bug with old index files (rahul003, Aug 19, 2020)
384b71c  Fix keras test with names of tensors (rahul003, Aug 20, 2020)
cd8a4d1  Set original name to None if tf_obj is None (rahul003, Aug 20, 2020)
c4881b7  Fix mirrored test for cpu (rahul003, Aug 20, 2020)
b5fc689  Merge branch 'master' of https://github.com/awslabs/sagemaker-debugge… (rahul003, Aug 31, 2020)
dd434c6  Add docs (rahul003, Sep 1, 2020)
4fe8df0  trigger CI (rahul003, Sep 1, 2020)
fa664d3  Fix shape writer get (rahul003, Sep 1, 2020)
b5b29b1  Simplify by removing shape writer (rahul003, Sep 2, 2020)
131ec44  Cleanup (rahul003, Sep 2, 2020)
dee7106  Fix name of writer (rahul003, Sep 2, 2020)
5c89fa4  Addressed review comments (rahul003, Sep 8, 2020)
a893d91  trigger ci (rahul003, Sep 8, 2020)
1f94933  retrigger CI (NihalHarish, Sep 8, 2020)
40 changes: 35 additions & 5 deletions docs/analysis.md
@@ -30,8 +30,10 @@ This page describes the programming model that SageMaker Debugger provides for y
* [steps](#steps-1)
* [value](#value)
* [reduction_value](#reduction_value)
* [reduction_values](#reduction_values)
* [shape](#shape)
* [values](#values)
* [reduction_values](#reduction_values)
* [shapes](#shapes)
* [workers](#workers-1)
* [prev_steps](#prev_steps)
* [Rules](#Rules)
@@ -356,6 +358,34 @@ trial.tensor(name).reduction_value(step_num, reduction_name,
###### Returns
`numpy.ndarray` The reduction value of the tensor at the given step and worker (if the training job saved data from multiple workers) as a 1x1 numpy array. If this reduction was saved for the tensor during training as part of the reduction config specification, it will be loaded and returned. If the given reduction was not saved but the full tensor was, the reduction will be computed on the fly and returned. If neither the chosen reduction nor the full tensor is available, this method raises a `TensorUnavailableForStep` exception.

#### shape
Get the shape of the chosen tensor at a particular step.

```python
trial.tensor(name).shape(step_num, mode=modes.GLOBAL, worker=None)
```
###### Arguments
- `step_num (int)` The step number for which the shape is to be returned, for the mode passed through the next parameter.
- `mode (smdebug.modes enum value)` The mode applicable for the step number passed above. Defaults to `modes.GLOBAL`.
- `worker (str)` This parameter is only applicable for distributed training. You can retrieve the shape of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You can also query the workers which saved a value for the tensor at a specific step with `trial.tensor(name).workers(step, mode)`.

###### Returns
`tuple(int)` If only the shape of this tensor was saved, through the `save_shape` configuration in ReductionConfig, it will be returned. If the full tensor was saved, the shape will be computed from it and returned. If neither the shape nor the full tensor is available, this method raises a `TensorUnavailableForStep` exception.
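
For example, a minimal sketch assuming a trial created with `create_trial` (the trial path and tensor name are illustrative):

```python
from smdebug.trials import create_trial

trial = create_trial("s3://my-bucket/training-run")  # illustrative trial path
# e.g. (32, 28, 28) for a batch of 32 28x28 images saved at step 0
print(trial.tensor("conv0_output").shape(step_num=0))
```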

#### values
Get the values of the tensor for all steps of a given mode.

```python
trial.tensor(name).values(mode=modes.GLOBAL, worker=None)
```

###### Arguments
- `mode (smdebug.modes enum value)` The mode whose steps' values are to be returned. Defaults to `modes.GLOBAL`.
- `worker (str)` This parameter is only applicable for distributed training. You can retrieve the values of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You can also query the workers which saved a value for the tensor at a specific step with `trial.tensor(name).workers(step, mode)`.

###### Returns
`dict[int -> numpy.ndarray]` A dictionary with step numbers as keys and numpy arrays representing the value of the tensor as values.
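
A short sketch of walking the returned dictionary, reusing the hypothetical `trial` from the example above:

```python
from smdebug import modes

# Print the loss at every saved TRAIN step
for step, value in trial.tensor("loss").values(mode=modes.TRAIN).items():
    print(step, value)
```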

#### reduction_values
Get all reduction values saved for the chosen tensor at a particular step. A reduction value is a tensor reduced to a single value through reduction or aggregation operations. Please go through the description of the method `reduction_value` for more details.
@@ -372,19 +402,19 @@ trial.tensor(name).reduction_values(step_num, mode=modes.GLOBAL, worker=None)
###### Returns
`dict[(str, bool) -> numpy.ndarray]` A dictionary with keys being tuples of the form `(reduction_name, abs)` to a 1x1 numpy ndarray value. `abs` here is a boolean that denotes whether the reduction was performed on the absolute value of the tensor or not. Note that this method only returns the reductions which were saved from the training job. It does not compute all known reductions and return them if only the raw tensor was saved.

#### values
Get the values of the tensor for all steps of a given mode.
#### shapes
> **Contributor:** Is this diff incorrect? If not, why have you changed the API from `values` to `shapes`? Is this not a breaking change?
>
> **Author:** This diff looks weird. I copied the `values` contents and modified them to become `shapes`, then moved them to group the sections appropriately. I didn't remove anything.

Get the shapes of the tensor for all steps of a given mode.

```python
trial.tensor(name).values(mode=modes.GLOBAL, worker=None)
trial.tensor(name).shapes(mode=modes.GLOBAL, worker=None)
```

###### Arguments
- `mode (smdebug.modes enum value)` The mode whose steps' shapes are to be returned. Defaults to `modes.GLOBAL`.
- `worker (str)` This parameter is only applicable for distributed training. You can retrieve the shapes of the tensor from a specific worker by passing the worker name. You can query all the workers seen by the trial with the `trial.workers()` method. You can also query the workers which saved a value for the tensor at a specific step with `trial.tensor(name).workers(step, mode)`.

###### Returns
`dict[int -> numpy.ndarray]` A dictionary with step numbers as keys and numpy arrays representing the value of the tensor as values.
`dict[int -> tuple(int)]` A dictionary with step numbers as keys and tuples of ints representing the shapes of the tensor as values.
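
For instance, one can use this to verify that a layer's output shape stays constant across steps (tensor name illustrative):

```python
# e.g. {0: (32, 10), 500: (32, 10)}
shapes = trial.tensor("fc1_output").shapes(mode=modes.GLOBAL)
assert len(set(shapes.values())) == 1, "shape changed between steps"
```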

#### workers
Get all the workers for which this tensor was saved at a given step.
1 change: 1 addition & 0 deletions docs/api.md
@@ -96,6 +96,7 @@ include_workers
include_regex
reductions
save_raw_tensor
save_shape
save_interval
save_steps
start_step
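A sketch of enabling the new `save_shape` key through a ReductionConfig; the hook class, import path, and output directory below are placeholders, and this assumes the framework modules re-export `ReductionConfig`:

```python
from smdebug.pytorch import Hook, ReductionConfig

hook = Hook(
    out_dir="/tmp/smdebug-run",  # placeholder output directory
    # Save only tensor shapes rather than raw values or reductions
    reduction_config=ReductionConfig(save_shape=True),
)
```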
56 changes: 45 additions & 11 deletions smdebug/core/hook.py
@@ -420,6 +420,17 @@ def _prepare_collections(self):
self.prepared_collections = True

#### End of Save Manager methods ####
@staticmethod
def _close_given_writer_map(writer_dict):
# Delete all the dist training writers
to_delete_writers = []
for key, writer in writer_dict.items():
# close calls flush
writer.close()
to_delete_writers.append(key)

for key in to_delete_writers:
del writer_dict[key]

def _close_writers(self) -> None:
if self.dry_run:
@@ -433,16 +444,7 @@ def _close_writers(self) -> None:
self.writer.close()
self.writer = None

to_delete_writers = []

# Delete all the tb writers
for mode, writer in self.tb_writers.items():
if writer is not None:
writer.flush()
writer.close()
to_delete_writers.append(mode)
for mode in to_delete_writers:
del self.tb_writers[mode]
self._close_given_writer_map(self.tb_writers)

def _initialize_writers(self, only_initialize_if_missing=False) -> None:
# Function is overridden in smdebug/tensorflow/base_hook.py
@@ -470,8 +472,12 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
if self.save_all_workers is False:
if self.worker != self.chief_worker:
return

self.writer = FileWriter(trial_dir=self.out_dir, step=self.step, worker=self.worker)

def _get_main_writer(self) -> List[FileWriter]:
return [self.writer] if self.writer else []

def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
"""
:param tensor_name:
Expand All @@ -480,7 +486,7 @@ def _get_writers(self, tensor_name, tensor_ref=None) -> List[FileWriter]:
"""
if self.save_all_workers is False and self.worker != self.chief_worker:
return []
return [self.writer] if self.writer else []
return self._get_main_writer()

def _maybe_get_tb_writer(self) -> Optional[FileWriter]:
""" Returns a FileWriter object if `hook.tensorboard_dir` has been specified, else None.
@@ -749,6 +755,31 @@ def _write_raw_tensor(self, tensor_name, tensor_value, save_collections, tensor_
self._write_raw_tensor_simple(tensor_name, tensor_value, tensor_ref=tensor_ref)
break

def _write_shape(self, tensor_name, tensor_value, save_collections, tensor_ref=None):
> **Contributor:** If `tensor_value` is always going to be a tuple, can we add type annotations to this function?
>
> **Author:** `tensor_value` is a framework data format; it's not a tuple here. It becomes a tuple inside this function.

writers = self._get_writers(tensor_name, tensor_ref=tensor_ref)
for s_col in save_collections:
reduction_config = s_col.reduction_config
if self.dry_run is False and reduction_config.save_shape is True:
numpy_tensor_value = self._make_numpy_array(tensor_value)
this_size, this_shape = size_and_shape(numpy_tensor_value)
# In TF Keras, and for Variables in all interfaces of TF, we sometimes output tensors with
# more meaningful names than the original name. In such cases we output
# both the smdebug-given name and the original name
if tensor_ref is not None and tensor_ref.tf_obj is not None:
original_name = tensor_ref.tf_obj.name
else:
original_name = None
> **Contributor (on lines +768 to +771):** Can you add comments explaining the need for this if-else block? Which framework and which mode of execution requires this check?
>
> **Author:** Added


for writer in writers:
writer.write_shape(
tensor_name,
this_shape,
self.mode,
self.mode_steps[self.mode],
original_name=original_name,
)
break

def _write_raw_tensor_simple(self, tensor_name, tensor_value, tensor_ref=None, timestamp=None):
# tensor_ref is used by TF
# todo: if fp16, check perf of saving as fp16 in proto vs as fp32
@@ -828,6 +859,9 @@ def _write_for_tensor(self, tensor_name, tensor_value, save_collections, tensor_
:param save_collections: list of collections which are being saved for this step
"""
self._log_save(tensor_name, save_collections)

self._write_shape(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref)

# write reductions defined for collections this tensor may be part of
self._write_reductions(tensor_name, tensor_value, save_collections, tensor_ref=tensor_ref)

100 changes: 47 additions & 53 deletions smdebug/core/index_reader.py
@@ -16,7 +16,7 @@
MISSING_EVENT_FILE_RETRY_LIMIT,
MISSING_EVENT_FILE_RETRY_LIMIT_KEY,
)
from smdebug.core.locations import IndexFileLocationUtils, TensorLocation
from smdebug.core.locations import IndexFileLocationUtils, TensorLocation, TensorShape
from smdebug.core.logger import get_logger
from smdebug.core.modes import ModeKeys
from smdebug.core.s3_utils import list_s3_objects
Expand Down Expand Up @@ -120,12 +120,22 @@ def fetch_tensor_value(self, tensor_location: TensorLocation):
def list_event_files(self, start_after_prefix):
pass

@abstractmethod
def load_tensor_data_from_index_files(
self, start_after_key=None, range_steps=None
) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]:
"""Return a triply nested dict referring to tensor data."""

responses, steps, last_index_token, workers = self.read_index_files(
start_after_key, range_steps
)

tensor_data = {}
for step, response, worker in zip(steps, responses, workers):
tensor_data = self._update_tensors_from_json(
tensor_data, step, response, self.path, worker
)
return tensor_data, last_index_token

@abstractmethod
def _is_event_file_present(self, file_name) -> bool:
pass
@@ -203,8 +213,10 @@ def _validate(index_dict):
raise IndexReaderException("meta section is not present")
if len(index_dict["meta"]) == 0:
raise IndexReaderException("meta section is empty")
if "tensor_payload" not in index_dict:
raise IndexReaderException("tensor_payload section is not present")
if "tensor_payload" not in index_dict and "shape_payload" not in index_dict:
raise IndexReaderException(
"neither tensor_payload nor shape_payload sections are present"
)
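
For reference, a sketch of the index-file layout this validation accepts, using the field names consumed by `_update_tensors_from_json` below (all values illustrative):

```python
# "tensor_payload" holds locations of raw tensor data in the event file;
# "shape_payload" holds shapes saved via save_shape. Either may be absent or empty.
index_dict = {
    "meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": "events.out.tfevents"},
    "tensor_payload": [{"tensorname": "loss", "start_idx": 0, "length": 128}],
    "shape_payload": [{"tensorname": "loss", "originalname": "loss:0", "shape": [1]}],
}
```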

def _update_tensors_from_json(
self, index_tensors_dict, step, response: bytes, path, worker
@@ -233,28 +245,41 @@
mode = index_meta["mode"]
mode = ModeKeys[mode.strip()]
mode_step = index_meta["mode_step"]
event_file_name = os.path.join(path, index_meta["event_file_name"])
tensors = index_dict["tensor_payload"]
for tensor in tensors:
tensor_name = tensor["tensorname"]
start_idx = tensor["start_idx"]
length = tensor["length"]
tensor_location = TensorLocation(
tensor_name, mode, mode_step, event_file_name, start_idx, length, worker
)

to_update_index_dict = []

if "tensor_payload" in index_dict and len(index_dict["tensor_payload"]):
> **Contributor:** nit: `len(index_dict["tensor_payload"])` seems like a redundant check.
>
> **Author:** There can be an empty payload.

event_file_name = os.path.join(path, index_meta["event_file_name"])
for tensor in index_dict["tensor_payload"]:
tensor_name = tensor["tensorname"]
start_idx = tensor["start_idx"]
length = tensor["length"]
tensor_location = TensorLocation(
tensor_name, mode, mode_step, event_file_name, start_idx, length, worker
)
to_update_index_dict.append((tensor_name, step, tensor_location))

if "shape_payload" in index_dict and len(index_dict["shape_payload"]):
for tensor in index_dict["shape_payload"]:
tensor_name = tensor["tensorname"]
original_name = tensor["originalname"]
shape = tensor["shape"]
ts = TensorShape(tensor_name, mode, mode_step, shape, original_name)
to_update_index_dict.append((tensor_name, step, ts))

for tu in to_update_index_dict:
tensor_name, step, obj = tu
if isinstance(obj, TensorLocation):
obj_dict = {"tensor_location": obj}
elif isinstance(obj, TensorShape):
obj_dict = {"tensor_shape": obj}
if tensor_name in index_tensors_dict:
if step in index_tensors_dict[tensor_name]:
index_tensors_dict[tensor_name][step].update(
{worker: {"tensor_location": tensor_location}}
)
index_tensors_dict[tensor_name][step].update({worker: obj_dict})
else:
index_tensors_dict[tensor_name].update(
{step: {worker: {"tensor_location": tensor_location}}}
)
index_tensors_dict[tensor_name].update({step: {worker: obj_dict}})
else:
index_tensors_dict[tensor_name] = {
step: {worker: {"tensor_location": tensor_location}}
}
index_tensors_dict[tensor_name] = {step: {worker: obj_dict}}
> **Contributor (on lines +278 to +282):** Is this simply a lint change?
>
> **Author:** Just made the innermost dict a variable.

return index_tensors_dict
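
The returned mapping is the triply nested dict described in the base-class docstring: tensor name, then step, then worker, with each worker keyed to either a location or a shape entry. A sketch, with the object values shown as strings:

```python
index_tensors_dict = {
    "loss": {  # tensor name
        0: {   # step number
            "worker_0": {"tensor_location": "<TensorLocation instance>"},
            # or {"tensor_shape": "<TensorShape instance>"} when only the shape was saved
        }
    }
}
```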


@@ -285,22 +310,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
return tensor_data

def load_tensor_data_from_index_files(
self, start_after_key=None, range_steps=None
) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]:
"""Return a triply nested dict referring to tensor data."""

responses, steps, last_index_token, workers = self.read_index_files(
start_after_key, range_steps
)

tensor_data = {}
for step, response, worker in zip(steps, responses, workers):
tensor_data = self._update_tensors_from_json(
tensor_data, step, response, self.path, worker
)
return tensor_data, last_index_token

def read_index_files(
self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
@@ -398,21 +407,6 @@ def fetch_tensor_value(self, tensor_location: TensorLocation) -> np.ndarray:
tensor_name, step, tensor_data, mode, mode_step = tensor_tuple
return tensor_data

def load_tensor_data_from_index_files(
self, start_after_key=None, range_steps=None
) -> Tuple[Dict[str, Dict[int, Dict[str, TensorLocation]]], str]:
"""Return a triply nested dict referring to tensor data."""

responses, steps, last_index_token, workers = self.read_index_files(
start_after_key, range_steps
)
tensor_data = {}
for step, response, worker in zip(steps, responses, workers):
tensor_data = self._update_tensors_from_json(
tensor_data, step, response, self.path, worker
)
return tensor_data, last_index_token

def read_index_files(
self, start_after_key: str, range_steps=None
) -> Tuple[List[bytes], list, str, List[str]]:
12 changes: 12 additions & 0 deletions smdebug/core/locations.py
@@ -24,6 +24,18 @@ def to_dict(self):
return {"tensorname": self.tensorname, "start_idx": self.start_idx, "length": self.length}


class TensorShape:
def __init__(self, name, mode, mode_step, shape, original_name=None):
self.name = name
self.original_name = original_name if original_name is not None else name
self.mode = mode
self.mode_step = mode_step
self.shape = tuple(shape)

def to_dict(self):
return {"tensorname": self.name, "originalname": self.original_name, "shape": self.shape}
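
A quick sketch of what `to_dict` produces; the mode is passed as a plain string here for brevity, though in smdebug it would be a `ModeKeys` value:

```python
ts = TensorShape("dense/kernel", "TRAIN", 0, [784, 10], original_name="dense/kernel:0")
# The constructor normalizes shape to a tuple
assert ts.to_dict() == {
    "tensorname": "dense/kernel",
    "originalname": "dense/kernel:0",
    "shape": (784, 10),
}
```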


STEP_NUMBER_FORMATTING_LENGTH = "012"

