
Added Pointer Networks implementation based on the Vocabulary Approach #505

Closed
wants to merge 10 commits
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,10 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [1.18.46]
### Added
- Added a pointer networks implementation for summarization, based on the paper 'Get To The Point: Summarization with Pointer-Generator Networks'

## [1.18.45]
### Added
- Added an 8 layer LSTM model similar (but not exactly identical) to the 'GNMT' architecture to autopilot.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -24,4 +24,5 @@ recursive-include docs Makefile
recursive-include tutorials *.md
recursive-include tutorials *.png
recursive-include tutorials *.py
recursive-include tutorials *.sh
recursive-include test *.txt
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '1.18.45'
__version__ = '1.18.46'
13 changes: 13 additions & 0 deletions sockeye/arguments.py
@@ -405,6 +405,18 @@ def add_pointer_args(params):
type=int,
default=2,
help='Shortest word length that can be pointed to')
params.add_argument('--max-oov-words',
type=int_greater_or_equal(1),
default=50,
help='Maximum number of out-of-vocabulary words to consider. Default: %(default)s')
params.add_argument('--use-coverage-loss',
action='store_true',
default=False,
help='Use coverage loss function. Default: %(default)s.')
params.add_argument('--coverage-loss-weight',
type=float,
default=0.01,
help='Weight of the coverage loss term for pointer networks. Default: %(default)s.')


def add_bucketing_args(params):
@@ -1065,6 +1077,7 @@ def add_translate_cli_args(params):
add_inference_args(params)
add_device_args(params)
add_logging_args(params)
add_pointer_args(params)


def add_max_output_cli_args(params):
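Note on the new flags: `--use-coverage-loss` and `--coverage-loss-weight` control the coverage penalty from See et al. (2017), which discourages the decoder from attending again to source positions it has already covered. A minimal NumPy sketch of the term being weighted; the array names are illustrative, not Sockeye's:

```python
import numpy as np

def coverage_loss(attention, coverage_loss_weight=0.01):
    """Coverage penalty from 'Get To The Point' (See et al., 2017).

    attention: (target_len, source_len) array, one attention
               distribution per decoder step.
    Returns the weighted penalty as a scalar.
    """
    coverage = np.zeros(attention.shape[1])  # running sum of past attention
    loss = 0.0
    for a_t in attention:
        # penalize mass placed on already-covered source positions
        loss += np.minimum(a_t, coverage).sum()
        coverage += a_t
    return coverage_loss_weight * loss

# Toy example: two steps that mostly attend to the same position.
attn = np.array([[0.9, 0.1, 0.0],
                 [0.8, 0.1, 0.1]])
print(coverage_loss(attn))  # 0.01 * (0.8 + 0.1 + 0.0) = 0.009
```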
11 changes: 10 additions & 1 deletion sockeye/checkpoint_decoder.py
@@ -115,13 +115,19 @@ def __init__(self,
context)

def decode_and_evaluate(self,
use_pointer_nets: bool,
max_oov_words: int,
pointer_nets_type: str,
checkpoint: Optional[int] = None,
Contributor: These three arguments will be read by the model from the ModelConfig. You shouldn't need to pass them through at all.

output_name: str = os.devnull) -> Dict[str, float]:
"""
Decodes data set and evaluates given a checkpoint.

:param use_pointer_nets: Flag to indicate whether the pointer network is enabled.
:param max_oov_words: Maximum number of words to consider in the extended vocabulary (with pointer networks).
:param pointer_nets_type: Pointer networks implementation to use.
:param checkpoint: Checkpoint to load parameters from.
:param output_name: Filename to write translations to. Defaults to /dev/null.
:return: Mapping of metric names to scores.
"""
models, source_vocabs, target_vocab = inference.load_models(
@@ -143,7 +149,10 @@ def decode_and_evaluate(self,
source_vocabs=source_vocabs,
target_vocab=target_vocab,
restrict_lexicon=None,
store_beam=False)
store_beam=False,
use_pointer_nets=use_pointer_nets,
max_oov_words=max_oov_words,
pointer_nets_type=pointer_nets_type)
trans_wall_time = 0.0
translations = []
with data_io.smart_open(output_name, 'w') as output:
4 changes: 3 additions & 1 deletion sockeye/constants.py
@@ -27,6 +27,7 @@
VOCAB_SYMBOLS = [PAD_SYMBOL, UNK_SYMBOL, BOS_SYMBOL, EOS_SYMBOL]
# reserve extra space for the EOS or BOS symbol that is added to both source and target
SPACE_FOR_XOS = 1
MAX_OOV_WORDS = 50
Contributor: Why not reuse another variable for this, such as the maximum input length? That way we avoid creating another variable by using a good default. Is there any reason to set it to something other than that?


ARG_SEPARATOR = ":"

@@ -390,7 +391,8 @@
# pointer networks
POINTER_NET_RNN = "rnn"
POINTER_NET_SHARED = "shared"
POINTER_NET_CHOICES = [POINTER_NET_RNN]
POINTER_NET_SUMMARY = "summary"
POINTER_NET_CHOICES = [POINTER_NET_RNN, POINTER_NET_SUMMARY]

# data sharding
SHARD_NAME = "shard.%05d"
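For context on `MAX_OOV_WORDS`: with the `summary` pointer-network type, source-side OOV tokens receive temporary ids just past the regular vocabulary so the decoder can copy them. A rough sketch of that mapping with a hypothetical helper; the PR's actual logic lives in `data_io.tokens2ids`:

```python
from typing import Dict, List, Tuple

UNK_ID = 1  # index of the UNK symbol in the vocabulary

def extend_source_ids(tokens: List[str],
                      vocab: Dict[str, int],
                      max_oov_words: int = 50) -> Tuple[List[int], Dict[str, int]]:
    """Map tokens to ids; OOV tokens get temporary ids
    len(vocab) .. len(vocab) + max_oov_words - 1."""
    oovs = {}  # type: Dict[str, int]
    ids = []
    for tok in tokens:
        if tok in vocab:
            ids.append(vocab[tok])
        elif tok in oovs:
            ids.append(oovs[tok])
        elif len(oovs) < max_oov_words:
            oovs[tok] = len(vocab) + len(oovs)
            ids.append(oovs[tok])
        else:  # no OOV slots left: fall back to UNK
            ids.append(UNK_ID)
    return ids, oovs

vocab = {"<pad>": 0, "<unk>": 1, "<s>": 2, "</s>": 3, "the": 4}
print(extend_source_ids("the nernst equation".split(), vocab))
# ([4, 5, 6], {'nernst': 5, 'equation': 6})
```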
172 changes: 135 additions & 37 deletions sockeye/data_io.py

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions sockeye/decoder.py
@@ -470,7 +470,9 @@ def __init__(self,
layer_normalization: bool = False,
attention_in_upper_layers: bool = False,
dtype: str = C.DTYPE_FP32,
enc_last_hidden_concat_to_embedding: bool = False) -> None:
enc_last_hidden_concat_to_embedding: bool = False,
use_pointer_nets: bool = False,
pointer_nets_type: str = C.POINTER_NET_SUMMARY) -> None:

super().__init__()
self.max_seq_len_source = max_seq_len_source
@@ -484,6 +486,8 @@ def __init__(self,
self.attention_in_upper_layers = attention_in_upper_layers
self.enc_last_hidden_concat_to_embedding = enc_last_hidden_concat_to_embedding
self.dtype = dtype
self.use_pointer_nets = use_pointer_nets
self.pointer_nets_type = pointer_nets_type


@Decoder.register(RecurrentDecoderConfig, C.RNN_DECODER_PREFIX)
@@ -582,6 +586,7 @@ def decode_sequence(self,
"""

# target_embed: target_seq_len * (batch_size, num_target_embed)
target_embed_local = target_embed
Contributor: It isn't necessary to save this and then return it. This reassignment here does not change the value in the caller.

target_embed = mx.sym.split(data=target_embed, num_outputs=target_embed_max_length, axis=1, squeeze_axis=True)

# Get last state from source (batch_size, num_target_embed)
Expand All @@ -606,7 +611,7 @@ def decode_sequence(self,
hidden_states = [] # type: List[mx.sym.Symbol]
context_vectors = [] # type: List[mx.sym.Symbol]
attention_probs = [] # type: List[mx.sym.Symbol]
# TODO: possible alternative: feed back the context vector instead of the hidden (see lamtram)
Contributor: Let's keep this line in.

coverage_vectors = [] # type: List[mx.sym.Symbol]
self.reset()
for seq_idx in range(target_embed_max_length):
# hidden: (batch_size, rnn_num_hidden)
Expand All @@ -619,11 +624,23 @@ def decode_sequence(self,
hidden_states.append(state.hidden)
context_vectors.append(attention_state.context)
attention_probs.append(attention_state.probs)
coverage_vectors.append(attention_state.dynamic_source)

# concatenate along time axis: (batch_size, target_embed_max_length, rnn_num_hidden)
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix), \
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix)])
if self.rnn_config.use_pointer_nets and self.rnn_config.pointer_nets_type == C.POINTER_NET_SUMMARY:
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, encoder_num_hidden)
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, attn_len)
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, attn_len)
mx.sym.stack(*coverage_vectors, axis=1, name='%scoverage_stack' % self.prefix),
# expected size: (batch_size, trg_max_length, trg_embed_len)
target_embed_local])
Contributor: No need to return this. The caller already has it.

else:
return mx.sym.Group([mx.sym.stack(*hidden_states, axis=1, name='%shidden_stack' % self.prefix),
mx.sym.stack(*context_vectors, axis=1, name='%scontext_stack' % self.prefix),
mx.sym.stack(*attention_probs, axis=1, name='%sattention_stack' % self.prefix)])

Contributor: We should get rid of the if statement here. Better to just return coverage vectors even if it's an empty symbol/list.

def decode_step(self,
step: int,
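The decoder now stacks context vectors, attention distributions, and coverage vectors because the pointer-generator head mixes the softmax over the fixed vocabulary with the attention over the source. A NumPy sketch of that mixing for a single decoder step; shapes and names are illustrative, the real computation is built symbolically:

```python
import numpy as np

def pointer_generator_step(p_vocab, attention, src_ids, p_gen,
                           vocab_size, max_oov_words):
    """Final distribution of See et al. (2017):
    P(w) = p_gen * P_vocab(w) + (1 - p_gen) * attention mass on w.

    p_vocab:   (vocab_size,) softmax over the fixed vocabulary
    attention: (source_len,) attention over source positions
    src_ids:   (source_len,) source ids in the *extended* vocabulary
    p_gen:     scalar in (0, 1), probability of generating vs. copying
    """
    p_final = np.zeros(vocab_size + max_oov_words)
    p_final[:vocab_size] = p_gen * p_vocab
    # scatter-add copy probability onto the source tokens' ids
    np.add.at(p_final, src_ids, (1.0 - p_gen) * attention)
    return p_final

p = pointer_generator_step(np.array([0.7, 0.3]), np.array([0.5, 0.5]),
                           np.array([1, 2]), p_gen=0.8,
                           vocab_size=2, max_oov_words=1)
print(p)  # [0.56 0.34 0.10]; index 2 is the first OOV slot
```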
2 changes: 1 addition & 1 deletion sockeye/embeddings.py
@@ -134,7 +134,7 @@ def embeddings(args: argparse.Namespace):
if not tokens:
continue
print("Input:", line.rstrip())
ids = tokens2ids(tokens, vocab)
ids = tokens2ids(tokens, vocab, use_pointer_nets=False, max_oov_words=0, point_nets_type=None)
for token, token_id in zip(tokens, ids):
print("%s id=%d" % (token, token_id))
neighbours = nearest_k(sims, token_id, args.k, args.gamma)
5 changes: 4 additions & 1 deletion sockeye/image_captioning/captioner.py
@@ -78,7 +78,10 @@ def get_pretrained_caption_net(args: argparse.Namespace,
source_image_size=tuple(
args.feature_size),
source_root=args.source_root,
use_feature_loader=image_preextracted_features)
use_feature_loader=image_preextracted_features,
use_pointer_nets=False,
max_oov_words=C.MAX_OOV_WORDS,
pointer_nets_type=None)
return translator


6 changes: 6 additions & 0 deletions sockeye/image_captioning/checkpoint_decoder.py
@@ -57,11 +57,17 @@ def __init__(self,
self.use_feature_loader = use_feature_loader

def decode_and_evaluate(self,
use_pointer_nets: bool,
max_oov_words: int,
pointer_nets_type: str,
checkpoint: Optional[int] = None,
Contributor: Shouldn't be needed (per above).

output_name: str = os.devnull) -> Dict[str, float]:
"""
Decodes data set and evaluates given a checkpoint.

:param use_pointer_nets: Flag to indicate whether the pointer network is enabled (not available with captioning as of now).
:param max_oov_words: Maximum number of words to consider in the extended vocabulary (with pointer networks).
:param pointer_nets_type: Pointer networks implementation to use.
:param checkpoint: Checkpoint to load parameters from.
:param output_name: Filename to write translations to. Defaults to /dev/null.
:return: Mapping of metric names to scores.
5 changes: 3 additions & 2 deletions sockeye/image_captioning/inference.py
@@ -153,8 +153,9 @@ def _get_inference_input(self,
image_paths[j] = path
# Preprocess constraints
if trans_input.constraints is not None:
raw_constraints[j] = [data_io.tokens2ids(phrase, self.vocab_target) for phrase in
trans_input.constraints]
raw_constraints[j] = [data_io.tokens2ids(phrase, self.vocab_target, use_pointer_nets=False,
max_oov_words=1, point_nets_type=C.POINTER_NET_SUMMARY)
for phrase in trans_input.constraints]

# Read data and zero pad if necessary
images = self.data_loader(image_paths)
11 changes: 9 additions & 2 deletions sockeye/image_captioning/train.py
@@ -226,7 +226,11 @@ def create_model_config(args: argparse.Namespace,
config_loss = loss.LossConfig(name=args.loss,
vocab_size=vocab_target_size,
normalization_type=args.loss_normalization_type,
label_smoothing=args.label_smoothing)
label_smoothing=args.label_smoothing,
use_pointer_nets=False,
use_coverage_loss=False,
coverage_loss_weight=0,
pointer_nets_type=C.POINTER_NET_SUMMARY)

Contributor: It seems that pointer nets are not going to be supported for image captioning. Could these calls rely on defaults, such that no changes need to be made to the image captioning code?

model_config = model.ModelConfig(config_data=config_data,
vocab_source_size=0,
@@ -384,7 +388,10 @@ def train(args: argparse.Namespace):
mxmonitor_pattern=args.monitor_pattern,
mxmonitor_stat_func=args.monitor_stat_func,
allow_missing_parameters=args.allow_missing_params,
existing_parameters=args.params)
existing_parameters=args.params,
use_pointer_nets=False,
max_oov_words=C.MAX_OOV_WORDS,
pointer_nets_type=C.POINTER_NET_SUMMARY)


if __name__ == "__main__":