
Added Pointer Networks implementation based on the Vocabulary Approach #505

Closed
wants to merge 10 commits
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [1.18.46]
### Added
- Added a pointer networks implementation for summarization, based on the paper 'Get To The Point: Summarization with Pointer-Generator Networks'

## [1.18.45]
### Added
- Added an 8 layer LSTM model similar (but not exactly identical) to the 'GNMT' architecture to autopilot.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -24,4 +24,5 @@ recursive-include docs Makefile
recursive-include tutorials *.md
recursive-include tutorials *.png
recursive-include tutorials *.py
recursive-include tutorials *.sh
recursive-include test *.txt
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '1.18.45'
__version__ = '1.18.46'
13 changes: 13 additions & 0 deletions sockeye/arguments.py
@@ -405,6 +405,18 @@ def add_pointer_args(params):
type=int,
default=2,
help='Shortest word length that can be pointed to')
params.add_argument('--max-oov-words',
type=int_greater_or_equal(1),
default=50,
help='Maximum number of out-of-vocabulary words to consider. Default: %(default)s')
params.add_argument('--use-coverage-loss',
action='store_true',
default=False,
help='Use coverage loss function. Default: %(default)s.')
params.add_argument('--coverage-loss-weight',
type=float,
default=0.01,
help='Weight of the coverage loss term for pointer networks. Default: %(default)s.')


def add_bucketing_args(params):
@@ -1065,6 +1077,7 @@ def add_translate_cli_args(params):
add_inference_args(params)
add_device_args(params)
add_logging_args(params)
add_pointer_args(params)


def add_max_output_cli_args(params):
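Note on the new flags: `--use-coverage-loss` and `--coverage-loss-weight` control the coverage penalty from See et al. (2017), which discourages the decoder from attending again to source positions it has already covered. A minimal NumPy sketch of the term being weighted; the array names are illustrative, not Sockeye's:

```python
import numpy as np

def coverage_loss(attention, coverage_loss_weight=0.01):
    """Coverage penalty from 'Get To The Point' (See et al., 2017).

    attention: (target_len, source_len) array, one attention
               distribution per decoder step.
    Returns the weighted penalty as a scalar.
    """
    coverage = np.zeros(attention.shape[1])  # running sum of past attention
    loss = 0.0
    for a_t in attention:
        # penalize mass placed on already-covered source positions
        loss += np.minimum(a_t, coverage).sum()
        coverage += a_t
    return coverage_loss_weight * loss

# Toy example: two steps that mostly attend to the same position.
attn = np.array([[0.9, 0.1, 0.0],
                 [0.8, 0.1, 0.1]])
print(coverage_loss(attn))  # 0.01 * (0.8 + 0.1 + 0.0) = 0.009
```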
11 changes: 10 additions & 1 deletion sockeye/checkpoint_decoder.py
@@ -115,13 +115,19 @@ def __init__(self,
context)

def decode_and_evaluate(self,
use_pointer_nets: bool,
max_oov_words: int,
pointer_nets_type: str,
checkpoint: Optional[int] = None,
Contributor: These three arguments will be read by the model from the ModelConfig. You shouldn't need to pass them through at all.

output_name: str = os.devnull) -> Dict[str, float]:
"""
Decodes data set and evaluates given a checkpoint.

:param use_pointer_nets: Flag to indicate whether the pointer network is enabled.
:param max_oov_words: Maximum number of words to consider in the extended vocabulary (with pointer networks).
:param pointer_nets_type: Pointer networks implementation to use.
:param checkpoint: Checkpoint to load parameters from.
:param output_name: Filename to write translations to. Defaults to /dev/null.
:return: Mapping of metric names to scores.
"""
models, source_vocabs, target_vocab = inference.load_models(
@@ -143,7 +149,10 @@ def decode_and_evaluate(self,
source_vocabs=source_vocabs,
target_vocab=target_vocab,
restrict_lexicon=None,
store_beam=False)
store_beam=False,
use_pointer_nets=use_pointer_nets,
max_oov_words=max_oov_words,
pointer_nets_type=pointer_nets_type)
trans_wall_time = 0.0
translations = []
with data_io.smart_open(output_name, 'w') as output:
4 changes: 3 additions & 1 deletion sockeye/constants.py
@@ -27,6 +27,7 @@
VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL]
# reserve extra space for the EOS or BOS symbol that is added to both source and target
SPACE_FOR_XOS = 1
MAX_OOV_WORDS = 50
Contributor: Why not reuse another variable for this, such as the maximum input length? That way we avoid creating another variable by using a good default. Is there any reason to set it to something other than that?


ARG_SEPARATOR = ":"

@@ -390,7 +391,8 @@
# pointer networks
POINTER_NET_RNN = "rnn"
POINTER_NET_SHARED = "shared"
POINTER_NET_CHOICES = [POINTER_NET_RNN]
POINTER_NET_SUMMARY = "summary"
POINTER_NET_CHOICES = [POINTER_NET_RNN, POINTER_NET_SUMMARY]

# data sharding
SHARD_NAME = "shard.%05d"
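For context on `MAX_OOV_WORDS`: with the `summary` pointer-network type, source-side OOV tokens receive temporary ids just past the regular vocabulary so the decoder can copy them. A rough sketch of that mapping with a hypothetical helper; the PR's actual logic lives in `data_io.tokens2ids`:

```python
from typing import Dict, List, Tuple

UNK_ID = 1  # index of the UNK symbol in the vocabulary

def extend_source_ids(tokens: List[str],
                      vocab: Dict[str, int],
                      max_oov_words: int = 50) -> Tuple[List[int], Dict[str, int]]:
    """Map tokens to ids; OOV tokens get temporary ids
    len(vocab) .. len(vocab) + max_oov_words - 1."""
    oovs = {}  # type: Dict[str, int]
    ids = []
    for tok in tokens:
        if tok in vocab:
            ids.append(vocab[tok])
        elif tok in oovs:
            ids.append(oovs[tok])
        elif len(oovs) < max_oov_words:
            oovs[tok] = len(vocab) + len(oovs)
            ids.append(oovs[tok])
        else:  # no OOV slots left: fall back to UNK
            ids.append(UNK_ID)
    return ids, oovs

vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "the": 4}
print(extend_source_ids("the nernst equation".split(), vocab))
# ([4, 5, 6], {'nernst': 5, 'equation': 6})
```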
172 changes: 135 additions & 37 deletions sockeye/data_io.py

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions sockeye/decoder.py
@@ -470,7 +470,9 @@ def __init__(self,
layer_normalization: bool = False,
attention_in_upper_layers: bool = False,
dtype: str = C.DTYPE_FP32,
enc_last_hidden_concat_to_embedding: bool = False) -> None:
enc_last_hidden_concat_to_embedding: bool = False,
use_pointer_nets: bool = False,
pointer_nets_type: str = C.POINTER_NET_SUMMARY) -> None:

super().__init__()
self.max_seq_len_source = max_seq_len_source
@@ -484,6 +486,8 @@ def __init__(self,
self.attention_in_upper_layers = attention_in_upper_layers
self.enc_last_hidden_concat_to_embedding = enc_last_hidden_concat_to_embedding
self.dtype = dtype
self.use_pointer_nets = use_pointer_nets
self.pointer_nets_type = pointer_nets_type


@Decoder.register(RecurrentDecoderConfig, C.RNN_DECODER_PREFIX)
@@ -582,6 +586,7 @@ def decode_sequence(self,
"""

# target_embed: target_seq_len * (batch_size, num_target_embed)
target_embed_local = target_embed
Contributor: It isn't necessary to save this and then return it. This reassignment here does not change the value in the caller.

target_embed = mx.sym.split(data=target_embed, num_outputs=target_embed_max_length, axis=1, squeeze_axis=True)

# Get last state from source (batch_size, num_target_embed)
Expand All @@ -606,7 +611,7 @@ def decode_sequence(self,
hidden_states = [] # type: List[mx.sym.Symbol]
context_vectors = [] # type: List[mx.sym.Symbol]
attention_probs = [] # type: List[mx.sym.Symbol]
# TODO: possible alternative: feed back the context vector instead of the hidden (see lamtram)
Contributor: Let's keep this line in.

coverage_vectors = [] # type: List[mx.sym.Symbol]
self.reset()
for seq_idx in range(target_embed_max_length):
# hidden: (batch_size, rnn_num_hidden)
Expand All @@ -619,11 +624,23 @@ def decode_sequence(self,
hidden_states.append(state.hidden)
context_vectors.append(attention_state.context)
attention_probs.append(attention_state.probs)
coverage_vectors.append(attention_state.dynamic_source)

# concatenate along time axis: (batch_size, target_embed_max_length, rnn_num_hidden)
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix), \
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix)])
if self.rnn_config.use_pointer_nets and self.rnn_config.pointer_nets_type == C.POINTER_NET_SUMMARY:
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, encoder_num_hidden)
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, attn_len)
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, attn_len)
mx.sym.stack(*coverage_vectors, axis=1, name='%scoverage_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, trg_embed_len)
target_embed_local])
Contributor: No need to return this. The caller already has it.

else:
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix),
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix)])

Contributor: We should get rid of the if statement here. Better to just return coverage vectors even if it's an empty symbol/list.

def decode_step(self,
step: int,
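The decoder now stacks context vectors, attention distributions, and coverage vectors because the pointer-generator head mixes the softmax over the fixed vocabulary with the attention over the source. A NumPy sketch of that mixing for a single decoder step; shapes and names are illustrative, the real computation is built symbolically:

```python
import numpy as np

def pointer_generator_step(p_vocab, attention, src_ids, p_gen,
                           vocab_size, max_oov_words):
    """Final distribution of See et al. (2017):
    P(w) = p_gen * P_vocab(w) + (1 - p_gen) * attention mass on w.

    p_vocab:   (vocab_size,) softmax over the fixed vocabulary
    attention: (source_len,) attention over source positions
    src_ids:   (source_len,) source ids in the *extended* vocabulary
    p_gen:     scalar in (0, 1), probability of generating vs. copying
    """
    p_final = np.zeros(vocab_size + max_oov_words)
    p_final[:vocab_size] = p_gen * p_vocab
    # scatter-add copy probability onto the source tokens' ids
    np.add.at(p_final, src_ids, (1.0 - p_gen) * attention)
    return p_final

p = pointer_generator_step(np.array([0.7, 0.3]), np.array([0.5, 0.5]),
                           np.array([1, 2]), p_gen=0.8,
                           vocab_size=2, max_oov_words=1)
print(p)  # [0.56 0.34 0.10]; index 2 is the first OOV slot
```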
2 changes: 1 addition & 1 deletion sockeye/embeddings.py
@@ -134,7 +134,7 @@ def embeddings(args: argparse.Namespace):
if not tokens:
continue
print("Input:", line.rstrip())
ids = tokens2ids(tokens, vocab)
ids = tokens2ids(tokens, vocab, use_pointer_nets=False, max_oov_words=0, point_nets_type=None)
for token, token_id in zip(tokens, ids):
print("%s id=%d" % (token, token_id))
neighbours = nearest_k(sims, token_id, args.k, args.gamma)
5 changes: 4 additions & 1 deletion sockeye/image_captioning/captioner.py
@@ -78,7 +78,10 @@ def get_pretrained_caption_net(args: argparse.Namespace,
source_image_size=tuple(
args.feature_size),
source_root=args.source_root,
use_feature_loader=image_preextracted_features)
use_feature_loader=image_preextracted_features,
use_pointer_nets=False,
max_oov_words=C.MAX_OOV_WORDS,
pointer_nets_type=None)
return translator


6 changes: 6 additions & 0 deletions sockeye/image_captioning/checkpoint_decoder.py
@@ -57,11 +57,17 @@ def __init__(self,
self.use_feature_loader = use_feature_loader

def decode_and_evaluate(self,
use_pointer_nets: bool,
max_oov_words: int,
pointer_nets_type: str,
checkpoint: Optional[int] = None,
Contributor: Shouldn't be needed (per above).

output_name: str = os.devnull) -> Dict[str, float]:
"""
Decodes data set and evaluates given a checkpoint.

:param use_pointer_nets: Flag to indicate whether the pointer network is enabled (not available with captioning as of now).
:param max_oov_words: Maximum number of words to consider in the extended vocabulary (with pointer networks).
:param pointer_nets_type: Pointer networks implementation to use.
:param checkpoint: Checkpoint to load parameters from.
:param output_name: Filename to write translations to. Defaults to /dev/null.
:return: Mapping of metric names to scores.
5 changes: 3 additions & 2 deletions sockeye/image_captioning/inference.py
@@ -153,8 +153,9 @@ def _get_inference_input(self,
image_paths[j] = path
# Preprocess constraints
if trans_input.constraints is not None:
raw_constraints[j] = [data_io.tokens2ids(phrase, self.vocab_target) for phrase in
trans_input.constraints]
raw_constraints[j] = [data_io.tokens2ids(phrase, self.vocab_target, use_pointer_nets=False,
max_oov_words=1, point_nets_type=C.POINTER_NET_SUMMARY)
for phrase in trans_input.constraints]

# Read data and zero pad if necessary
images = self.data_loader(image_paths)
11 changes: 9 additions & 2 deletions sockeye/image_captioning/train.py
@@ -226,7 +226,11 @@ def create_model_config(args: argparse.Namespace,
config_loss = loss.LossConfig(name=args.loss,
vocab_size=vocab_target_size,
normalization_type=args.loss_normalization_type,
label_smoothing=args.label_smoothing)
label_smoothing=args.label_smoothing,
use_pointer_nets=False,
use_coverage_loss=False,
coverage_loss_weight=0,
pointer_nets_type=C.POINTER_NET_SUMMARY)

Contributor: It seems that pointer nets are not going to be supported for image captioning. Could these calls rely on defaults, such that no changes need to be made to the image captioning code?

model_config = model.ModelConfig(config_data=config_data,
vocab_source_size=0,
@@ -384,7 +388,10 @@ def train(args: argparse.Namespace):
mxmonitor_pattern=args.monitor_pattern,
mxmonitor_stat_func=args.monitor_stat_func,
allow_missing_parameters=args.allow_missing_params,
existing_parameters=args.params)
existing_parameters=args.params,
use_pointer_nets=False,
max_oov_words=C.MAX_OOV_WORDS,
pointer_nets_type=C.POINTER_NET_SUMMARY)


if __name__ == "__main__":