End-to-end testing with various model configurations (#85)
* End-to-end testing with various model configurations

* Reorganize unit and integration tests

* More reorganization, add system tests

pytest coverage 57% -> 80%
Michael Denkowski committed Jul 26, 2017
1 parent 6c22eb2 commit 7da864e
Showing 34 changed files with 398 additions and 95 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -23,4 +23,4 @@ script:
- pylint --rcfile=pylintrc test -E
- mypy --ignore-missing-imports --follow-imports=silent @typechecked-files
- check-manifest --ignore sockeye/git_version.py

+# - python -m pytest test/system
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -8,6 +8,7 @@ include sockeye/git_version.py
exclude *.sh
include pytest.ini
recursive-include test *.py
+recursive-include test *.ini
recursive-include docs *.bat
recursive-include docs *.md
recursive-include docs *.py
9 changes: 7 additions & 2 deletions pre-commit.sh
@@ -10,7 +10,7 @@
STASH_NAME="pre-commit-$(date +%s)"
git stash save -q --keep-index $STASH_NAME

-# Run unit tests
+# Run unit and integration tests
python3 setup.py test
TEST_RESULT=$?

@@ -27,16 +27,21 @@ TESTS_LINT_RESULT=$?
mypy --ignore-missing-imports --follow-imports=silent @typechecked-files
MYPY_RESULT=$?

+# Run system tests
+python3 -m pytest test/system
+SYSTEM_RESULT=$?

# Pop our stashed files
STASHES=$(git stash list)
if [[ $STASHES == "$STASH_NAME" ]]; then
git stash pop -q
fi

-[ $TEST_RESULT -ne 0 ] && echo 'Unit tests failed' && exit 1
+[ $TEST_RESULT -ne 0 ] && echo 'Unit or integration tests failed' && exit 1
[ $SOCKEYE_LINT_RESULT -ne 0 ] && echo 'pylint found errors in the sockeye package' && exit 1
[ $TESTS_LINT_RESULT -ne 0 ] && echo 'pylint found errors in the test package' && exit 1
[ $MYPY_RESULT -ne 0 ] && echo 'mypy found incorrect type usage' && exit 1
+[ $SYSTEM_RESULT -ne 0 ] && echo 'System tests failed' && exit 1

echo 'all pre-commit checks passed'
exit 0
2 changes: 1 addition & 1 deletion pytest.ini
@@ -1,2 +1,2 @@
[pytest]
-addopts = --cov sockeye test -v
+addopts = --cov sockeye test/unit test/integration -v
33 changes: 2 additions & 31 deletions sockeye/average.py
@@ -74,15 +74,15 @@ def find_checkpoints(model_path: str, size=4, strategy="best", maximize=False, m
"""
Finds N best points from .metrics file according to strategy
-:param metric: Metric according to which checkpoints are selected. Corresponds to columns in model\metrics file.
+:param metric: Metric according to which checkpoints are selected. Corresponds to columns in model/metrics file.
:param model_path: Path to model.
:param size: Number of checkpoints to combine.
:param strategy: Combination strategy.
:param maximize: Whether the value of the metric should be maximized.
:return: List of paths corresponding to chosen checkpoints.
"""
metrics_path = os.path.join(model_path, C.METRICS_NAME)
-points = _read_metrics_points(metrics_path, model_path, metric=metric)
+points = sockeye.utils.read_metrics_points(metrics_path, model_path, metric=metric)

if strategy == "best":
# N best scoring points
@@ -111,35 +111,6 @@ def find_checkpoints(model_path: str, size=4, strategy="best", maximize=False, m
return params_paths


def _read_metrics_points(path: str, model_path: str, metric: str) -> List[Tuple[float, int]]:
"""
Reads lines from .metrics file and return list of elements [val, checkpoint]
:param metric: Metric according to which checkpoints are selected. Corresponds to columns in model\metrics file.
:param path: File to read metric values from.
:param model_path: path where the params files reside.
:return: List of pairs (metric value, checkpoint).
"""
points = []
# First field is checkpoint id
# Metric on validation (dev) set looks like this: METRIC-val=N
with open(path, "r") as metrics_in:
for line in metrics_in:
fields = line.split()
checkpoint = int(fields[0])
# Check that the corresponding params files exists
if not os.path.exists(os.path.join(model_path, C.PARAMS_NAME % checkpoint)):
continue
for field in fields[1:]:
key_value = field.split("=")
if len(key_value) == 2:
metric_set = key_value[0].split("-")
if len(metric_set) == 2 and metric_set[0] == metric and metric_set[1] == "val":
metric_value = float(key_value[1])
points.append([metric_value, checkpoint])
return points


def _strategy_best(points, size, maximize):
top_n = sorted(points, reverse=maximize)[:size]
return top_n
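For illustration, a small sketch of how the "best" strategy above behaves (perplexity values invented; lower perplexity is better, so maximize=False):

# (metric value, checkpoint) pairs, e.g. validation perplexities per checkpoint.
points = [[50.1, 1], [38.4, 2], [42.0, 3], [40.2, 4]]

# Equivalent to _strategy_best(points, size=2, maximize=False):
top_n = sorted(points, reverse=False)[:2]
print(top_n)  # [[38.4, 2], [40.2, 4]] -> the two lowest-perplexity checkpoints
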
30 changes: 30 additions & 0 deletions sockeye/utils.py
@@ -31,6 +31,7 @@
import numpy as np

from sockeye import __version__
+import sockeye.constants as C

logger = logging.getLogger(__name__)

@@ -525,3 +526,32 @@ def namedtuple_with_defaults(typename, field_names, default_values: Mapping[str,
prototype = T(*default_values)
T.__new__.__defaults__ = tuple(prototype)
return T


def read_metrics_points(path: str, model_path: str, metric: str) -> List[Tuple[float, int]]:
"""
Reads lines from .metrics file and returns a list of elements [val, checkpoint].
:param metric: Metric according to which checkpoints are selected. Corresponds to columns in model/metrics file.
:param path: File to read metric values from.
:param model_path: path where the params files reside.
:return: List of pairs (metric value, checkpoint).
"""
points = []
# First field is checkpoint id
# Metric on validation (dev) set looks like this: METRIC-val=N
with open(path, "r") as metrics_in:
for line in metrics_in:
fields = line.split()
checkpoint = int(fields[0])
# Check that the corresponding params file exists
if not os.path.exists(os.path.join(model_path, C.PARAMS_NAME % checkpoint)):
continue
for field in fields[1:]:
key_value = field.split("=")
if len(key_value) == 2:
metric_set = key_value[0].split("-")
if len(metric_set) == 2 and metric_set[0] == metric and metric_set[1] == "val":
metric_value = float(key_value[1])
points.append([metric_value, checkpoint])
return points
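
For reference, a usage sketch of the relocated helper (metrics values invented; the file layout follows the METRIC-val=N convention noted in the comments above):

import os
from tempfile import TemporaryDirectory

import sockeye.constants as C
import sockeye.utils

# Hypothetical model directory: a metrics file plus matching params files.
with TemporaryDirectory() as model_path:
    with open(os.path.join(model_path, C.METRICS_NAME), "w") as metrics_out:
        print("1 perplexity-train=45.2 perplexity-val=50.1", file=metrics_out)
        print("2 perplexity-train=30.7 perplexity-val=38.4", file=metrics_out)
    for checkpoint in (1, 2):
        # Touch the params files so these checkpoints are not skipped as missing.
        open(os.path.join(model_path, C.PARAMS_NAME % checkpoint), "w").close()
    points = sockeye.utils.read_metrics_points(os.path.join(model_path, C.METRICS_NAME),
                                               model_path,
                                               metric="perplexity")
    print(points)  # [[50.1, 1], [38.4, 2]]
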
3 changes: 1 addition & 2 deletions test/__init__.py
@@ -5,9 +5,8 @@
# is located at
#
# http://aws.amazon.com/apache2.0/
#
-#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

159 changes: 159 additions & 0 deletions test/common.py
@@ -0,0 +1,159 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import random
import sys
from tempfile import TemporaryDirectory
from typing import Optional, Tuple
from unittest.mock import patch

import mxnet as mx
import numpy as np

import sockeye.bleu
import sockeye.constants as C
import sockeye.train
import sockeye.translate
import sockeye.utils


def gaussian_vector(shape, return_symbol=False):
"""
Generates random normal tensors (diagonal covariance)
:param shape: shape of the tensor.
:param return_symbol: True if the result should be a Symbol, False if it should be a NumPy array.
:return: A gaussian tensor.
"""
return mx.sym.random_normal(shape=shape) if return_symbol else np.random.normal(size=shape)


def integer_vector(shape, max_value, return_symbol=False):
"""
Generates a random positive integer tensor
:param shape: shape of the tensor.
:param max_value: maximum integer value.
:param return_symbol: True if the result should be a Symbol, False if it should be a NumPy array.
:return: A random integer tensor.
"""
return mx.sym.round(mx.sym.random_uniform(shape=shape) * max_value) if return_symbol \
else np.round(np.random.uniform(size=shape) * max_value)


def uniform_vector(shape, min_value=0, max_value=1, return_symbol=False):
"""
Generates a uniformly random tensor
:param shape: shape of the tensor
:param min_value: minimum possible value
:param max_value: maximum possible value (exclusive)
:param return_symbol: True if the result should be a mx.sym.Symbol, False if it should be a NumPy array
:return: A uniformly random tensor.
"""
return mx.sym.random_uniform(low=min_value, high=max_value, shape=shape) if return_symbol \
else np.random.uniform(low=min_value, high=max_value, size=shape)


def generate_random_sentence(vocab_size, max_len):
"""
Generates a random "sentence" as a list of integers.
:param vocab_size: Number of words in the "vocabulary". Note that due to
the inclusion of special words (BOS, EOS, UNK) this does *not*
correspond to the maximum possible value.
:param max_len: maximum sentence length.
"""
length = random.randint(1, max_len)
# Due to the special words, the actual words start at index 3 and go up to vocab_size+2
return [random.randint(3, vocab_size + 2) for _ in range(length)]


_DIGITS = "0123456789"


def generate_digits_file(source_path: str,
target_path: str,
line_count: int = 100,
line_length: int = 9,
sort_target: bool = False):
with open(source_path, "w") as source_out, open(target_path, "w") as target_out:
for _ in range(line_count):
digits = [random.choice(_DIGITS) for _ in range(random.randint(1, line_length))]
print(" ".join(digits), file=source_out)
if sort_target:
digits.sort()
print(" ".join(digits), file=target_out)


_TRAIN_PARAMS_COMMON = "--use-cpu --max-seq-len {max_len} --source {train_source} --target {train_target}" \
" --validation-source {dev_source} --validation-target {dev_target} --output {model}"


_TRANSLATE_PARAMS_COMMON = "--use-cpu --models {model} --input {input} --output {output}"


def run_train_translate(train_params: str,
translate_params: str,
train_source_path: str,
train_target_path: str,
dev_source_path: str,
dev_target_path: str,
max_seq_len: int = 10,
work_dir: Optional[str] = None) -> Tuple[float, float]:
"""
Train a model and translate a dev set. Report perplexity and BLEU.
:param train_params: Command line args for model training.
:param translate_params: Command line args for translation.
:param train_source_path: Path to training source data.
:param train_target_path: Path to training target data.
:param dev_source_path: Path to dev source data.
:param dev_target_path: Path to dev target data.
:param max_seq_len: Maximum sequence length.
:param work_dir: Optional parent directory for the temporary working directory.
:return: (perplexity, bleu)
"""
with TemporaryDirectory(dir=work_dir, prefix="test_train_translate.") as work_dir:

# Train model
model_path = os.path.join(work_dir, "model")
params = "{} {} {}".format(sockeye.train.__file__,
_TRAIN_PARAMS_COMMON.format(train_source=train_source_path,
train_target=train_target_path,
dev_source=dev_source_path,
dev_target=dev_target_path,
model=model_path,
max_len=max_seq_len),
train_params)
with patch.object(sys, "argv", params.split()):
sockeye.train.main()

# Translate corpus
out_path = os.path.join(work_dir, "out.txt")
params = "{} {} {}".format(sockeye.translate.__file__,
_TRANSLATE_PARAMS_COMMON.format(model=model_path,
input=dev_source_path,
output=out_path),
translate_params)
with patch.object(sys, "argv", params.split()):
sockeye.translate.main()

# Measure perplexity
checkpoints = sockeye.utils.read_metrics_points(path=os.path.join(model_path, C.METRICS_NAME),
model_path=model_path,
metric=C.PERPLEXITY)
perplexity = checkpoints[-1][0]

# Measure BLEU
bleu = sockeye.bleu.corpus_bleu(open(out_path, "r").readlines(),
open(dev_target_path, "r").readlines())

return perplexity, bleu
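
A quick usage sketch for the random tensor helpers defined in test/common.py above (shapes and values arbitrary; assumes the test package is importable from the repository root, as in the tests below):

import mxnet as mx

from test.common import gaussian_vector, integer_vector, uniform_vector

# NumPy mode returns concrete arrays, handy as test inputs.
data = gaussian_vector(shape=(2, 3))                        # entries ~ N(0, 1)
labels = integer_vector(shape=(2,), max_value=9)            # rounded values in [0, 9]
noise = uniform_vector(shape=(2, 3), min_value=-1.0, max_value=1.0)
assert data.shape == (2, 3) and noise.shape == (2, 3)

# Symbol mode returns graph nodes for building test networks.
sym = gaussian_vector(shape=(2, 3), return_symbol=True)
assert isinstance(sym, mx.sym.Symbol)
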
12 changes: 12 additions & 0 deletions test/integration/__init__.py
@@ -0,0 +1,12 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
66 changes: 66 additions & 0 deletions test/integration/test_seq_copy_int.py
@@ -0,0 +1,66 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
from tempfile import TemporaryDirectory

import pytest

from test.common import generate_digits_file, run_train_translate

_TRAIN_LINE_COUNT = 100
_DEV_LINE_COUNT = 10
_LINE_MAX_LENGTH = 9

@pytest.mark.parametrize("train_params, translate_params", [
# "Vanilla" LSTM encoder-decoder with attention
("--encoder rnn --rnn-num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 16 --num-embed 8 --attention-type mlp"
" --attention-num-hidden 16 --batch-size 8 --loss cross-entropy --optimized-metric perplexity --max-updates 10"
" --checkpoint-frequency 10 --optimizer adam --initial-learning-rate 0.01",
"--beam-size 2"),
# "Kitchen sink" LSTM encoder-decoder with attention
("--encoder rnn --rnn-num-layers 4 --rnn-cell-type lstm --rnn-num-hidden 16 --rnn-residual-connections"
" --num-embed 16 --attention-type coverage --attention-num-hidden 16 --weight-tying --attention-use-prev-word"
" --context-gating --layer-normalization --batch-size 8 --loss smoothed-cross-entropy"
" --smoothed-cross-entropy-alpha 0.1 --normalize-loss --optimized-metric perplexity --max-updates 10"
" --checkpoint-frequency 10 --dropout 0.1 --optimizer adam --initial-learning-rate 0.01",
"--beam-size 2"),
# Convolutional embedding encoder + LSTM encoder-decoder with attention
("--encoder rnn-with-conv-embed --conv-embed-max-filter-width 3 --conv-embed-num-filters 4 4 8"
" --conv-embed-pool-stride 2 --conv-embed-num-highway-layers 1 --rnn-num-layers 1 --rnn-cell-type lstm"
" --rnn-num-hidden 16 --num-embed 8 --attention-num-hidden 16 --batch-size 8 --loss cross-entropy"
" --optimized-metric perplexity --max-updates 10 --checkpoint-frequency 10 --optimizer adam"
" --initial-learning-rate 0.01",
"--beam-size 2"),
])
def test_seq_copy(train_params, translate_params):
"""Task: copy short sequences of digits"""
with TemporaryDirectory(prefix="test_seq_copy") as work_dir:
# Simple digits files for train/dev data
train_source_path = os.path.join(work_dir, "train.src")
train_target_path = os.path.join(work_dir, "train.tgt")
dev_source_path = os.path.join(work_dir, "dev.src")
dev_target_path = os.path.join(work_dir, "dev.tgt")
generate_digits_file(train_source_path, train_target_path, _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH)
generate_digits_file(dev_source_path, dev_target_path, _DEV_LINE_COUNT, _LINE_MAX_LENGTH)
# Test model configuration
# Ignore return values (perplexity and BLEU) for integration test
run_train_translate(train_params,
translate_params,
train_source_path,
train_target_path,
dev_source_path,
dev_target_path,
max_seq_len=_LINE_MAX_LENGTH + 1,
work_dir=work_dir)
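
The system tests referenced in the commit message and pre-commit.sh live under test/system and are not shown in this excerpt. As a rough sketch only (test name, parameter values, and thresholds are assumptions, not the committed code), such a test could train longer and assert on the quality metrics that run_train_translate returns:

import os
from tempfile import TemporaryDirectory

import pytest

from test.common import generate_digits_file, run_train_translate

# Hypothetical thresholds (BLEU as a fraction in [0, 1]); actual values may differ.
_PERPLEXITY_THRESH = 2.0
_BLEU_THRESH = 0.9


@pytest.mark.parametrize("train_params, translate_params", [
    ("--encoder rnn --rnn-num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 64 --num-embed 32"
     " --attention-type mlp --attention-num-hidden 64 --batch-size 16 --loss cross-entropy"
     " --optimized-metric perplexity --max-updates 2500 --checkpoint-frequency 2500"
     " --optimizer adam --initial-learning-rate 0.001",
     "--beam-size 5"),
])
def test_seq_copy_sys(train_params, translate_params):
    """Task: copy short sequences of digits, trained long enough to converge."""
    with TemporaryDirectory(prefix="test_seq_copy_sys") as work_dir:
        train_source_path = os.path.join(work_dir, "train.src")
        train_target_path = os.path.join(work_dir, "train.tgt")
        dev_source_path = os.path.join(work_dir, "dev.src")
        dev_target_path = os.path.join(work_dir, "dev.tgt")
        generate_digits_file(train_source_path, train_target_path, 1000, 9)
        generate_digits_file(dev_source_path, dev_target_path, 50, 9)
        perplexity, bleu = run_train_translate(train_params, translate_params,
                                               train_source_path, train_target_path,
                                               dev_source_path, dev_target_path,
                                               max_seq_len=10, work_dir=work_dir)
        # Unlike the integration test, assert on model quality.
        assert perplexity <= _PERPLEXITY_THRESH
        assert bleu >= _BLEU_THRESH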
