diff --git a/.travis.yml b/.travis.yml
index b176f5976..bebef1e95 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -23,4 +23,4 @@ script:
   - pylint --rcfile=pylintrc test -E
   - mypy --ignore-missing-imports --follow-imports=silent @typechecked-files
   - check-manifest --ignore sockeye/git_version.py
-
+#  - python -m pytest test/system
diff --git a/MANIFEST.in b/MANIFEST.in
index 2138cc6a2..8afac1fb6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,6 +8,7 @@ include sockeye/git_version.py
 exclude *.sh
 include pytest.ini
 recursive-include test *.py
+recursive-include test *.ini
 recursive-include docs *.bat
 recursive-include docs *.md
 recursive-include docs *.py
diff --git a/pre-commit.sh b/pre-commit.sh
index 888b1dc6a..ef2ae78aa 100755
--- a/pre-commit.sh
+++ b/pre-commit.sh
@@ -10,7 +10,7 @@
 STASH_NAME="pre-commit-$(date +%s)"
 git stash save -q --keep-index $STASH_NAME
 
-# Run unit tests
+# Run unit and integration tests
 python3 setup.py test
 TEST_RESULT=$?
 
@@ -27,16 +27,21 @@ TESTS_LINT_RESULT=$?
 mypy --ignore-missing-imports --follow-imports=silent @typechecked-files
 MYPY_RESULT=$?
 
+# Run system tests
+python3 -m pytest test/system
+SYSTEM_RESULT=$?
+
 # Pop our stashed files
 STASHES=$(git stash list)
 if [[ $STASHES == "$STASH_NAME" ]]; then
   git stash pop -q
 fi
 
-[ $TEST_RESULT -ne 0 ] && echo 'Unit tests failed' && exit 1
+[ $TEST_RESULT -ne 0 ] && echo 'Unit or integration tests failed' && exit 1
 [ $SOCKEYE_LINT_RESULT -ne 0 ] && echo 'pylint found errors in the sockeye package' && exit 1
 [ $TESTS_LINT_RESULT -ne 0 ] && echo 'pylint found errors in the test package' && exit 1
 [ $MYPY_RESULT -ne 0 ] && echo 'mypy found incorrect type usage' && exit 1
+[ $SYSTEM_RESULT -ne 0 ] && echo 'System tests failed' && exit 1
 
 echo 'all pre-commit checks passed'
 exit 0
diff --git a/pytest.ini b/pytest.ini
index d05332148..f45f864b4 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,2 +1,2 @@
 [pytest]
-addopts = --cov sockeye test -v
+addopts = --cov sockeye test/unit test/integration -v
diff --git a/sockeye/average.py b/sockeye/average.py
index 17befd2bd..72aca8b1b 100644
--- a/sockeye/average.py
+++ b/sockeye/average.py
@@ -74,7 +74,7 @@ def find_checkpoints(model_path: str, size=4, strategy="best", maximize=False, m
     """
     Finds N best points from .metrics file according to strategy
 
-    :param metric: Metric according to which checkpoints are selected.  Corresponds to columns in model\metrics file.
+    :param metric: Metric according to which checkpoints are selected.  Corresponds to columns in model/metrics file.
     :param model_path: Path to model.
     :param size: Number of checkpoints to combine.
     :param strategy: Combination strategy.
@@ -82,7 +82,7 @@ def find_checkpoints(model_path: str, size=4, strategy="best", maximize=False, m
     :return: List of paths corresponding to chosen checkpoints.
     """
     metrics_path = os.path.join(model_path, C.METRICS_NAME)
-    points = _read_metrics_points(metrics_path, model_path, metric=metric)
+    points = sockeye.utils.read_metrics_points(metrics_path, model_path, metric=metric)
 
     if strategy == "best":
         # N best scoring points
@@ -111,35 +111,6 @@ def find_checkpoints(model_path: str, size=4, strategy="best", maximize=False, m
     return params_paths
 
 
-def _read_metrics_points(path: str, model_path: str, metric: str) -> List[Tuple[float, int]]:
-    """
-    Reads lines from .metrics file and return list of elements [val, checkpoint]
-
-    :param metric: Metric according to which checkpoints are selected.  Corresponds to columns in model\metrics file.
-    :param path: File to read metric values from.
-    :param model_path: path where the params files reside.
-    :return: List of pairs (metric value, checkpoint).
-    """
-    points = []
-    # First field is checkpoint id
-    # Metric on validation (dev) set looks like this: METRIC-val=N
-    with open(path, "r") as metrics_in:
-        for line in metrics_in:
-            fields = line.split()
-            checkpoint = int(fields[0])
-            # Check that the corresponding params files exists
-            if not os.path.exists(os.path.join(model_path, C.PARAMS_NAME % checkpoint)):
-                continue
-            for field in fields[1:]:
-                key_value = field.split("=")
-                if len(key_value) == 2:
-                    metric_set = key_value[0].split("-")
-                    if len(metric_set) == 2 and metric_set[0] == metric and metric_set[1] == "val":
-                        metric_value = float(key_value[1])
-                        points.append([metric_value, checkpoint])
-    return points
-
-
 def _strategy_best(points, size, maximize):
     top_n = sorted(points, reverse=maximize)[:size]
     return top_n
diff --git a/sockeye/utils.py b/sockeye/utils.py
index 0a17768b7..5b90ffbd1 100644
--- a/sockeye/utils.py
+++ b/sockeye/utils.py
@@ -31,6 +31,7 @@
 import numpy as np
 
 from sockeye import __version__
+import sockeye.constants as C
 
 logger = logging.getLogger(__name__)
 
@@ -525,3 +526,32 @@ def namedtuple_with_defaults(typename, field_names, default_values: Mapping[str,
         prototype = T(*default_values)
     T.__new__.__defaults__ = tuple(prototype)
     return T
+
+
+def read_metrics_points(path: str, model_path: str, metric: str) -> List[Tuple[float, int]]:
+    """
+    Reads lines from a .metrics file and returns a list of elements [val, checkpoint]. Blank lines are skipped.
+
+    :param metric: Metric according to which checkpoints are selected.  Corresponds to columns in model/metrics file.
+    :param path: File to read metric values from.
+    :param model_path: path where the params files reside; checkpoints without a params file there are ignored.
+    :return: List of pairs (metric value, checkpoint), in the order they appear in the file.
+    """
+    points = []
+    # First field is checkpoint id
+    # Metric on validation (dev) set looks like this: METRIC-val=N
+    with open(path, "r") as metrics_in:
+        for line in metrics_in:
+            fields = line.split()
+            if not fields:
+                continue  # tolerate blank lines instead of failing on fields[0]
+            checkpoint = int(fields[0])
+            # Check that the corresponding params files exists
+            if not os.path.exists(os.path.join(model_path, C.PARAMS_NAME % checkpoint)):
+                continue
+            for field in fields[1:]:
+                key_value = field.split("=")
+                # Only "<metric>-val=<value>" fields contribute a point
+                if len(key_value) == 2 and key_value[0].split("-") == [metric, "val"]:
+                    points.append([float(key_value[1]), checkpoint])
+    return points
diff --git a/test/__init__.py b/test/__init__.py
index 214e3177f..3d9e97c1e 100644
--- a/test/__init__.py
+++ b/test/__init__.py
@@ -5,9 +5,8 @@
 # is located at
 #
 #     http://aws.amazon.com/apache2.0/
-# 
+#
 # or in the "license" file accompanying this file. This file is distributed on
 # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
-
diff --git a/test/common.py b/test/common.py
new file mode 100644
index 000000000..1c71ba361
--- /dev/null
+++ b/test/common.py
@@ -0,0 +1,159 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+import os
+import random
+import sys
+from tempfile import TemporaryDirectory
+from typing import Optional, Tuple
+from unittest.mock import patch
+
+import mxnet as mx
+import numpy as np
+
+import sockeye.bleu
+import sockeye.constants as C
+import sockeye.train
+import sockeye.translate
+import sockeye.utils
+
+
+def gaussian_vector(shape, return_symbol=False):
+    """
+    Draws a tensor of independent normal samples (diagonal covariance).
+
+    :param shape: shape of the tensor.
+    :param return_symbol: True to get a Symbol, False to get a Numpy array.
+    :return: A gaussian tensor.
+    """
+    return np.random.normal(size=shape) if not return_symbol else mx.sym.random_normal(shape=shape)
+
+
+def integer_vector(shape, max_value, return_symbol=False):
+    """
+    Generates a random integer-valued tensor: uniform draws scaled by max_value, then rounded.
+
+    :param shape: shape of the tensor.
+    :param max_value: maximum integer value.
+    :param return_symbol: True to get a Symbol, False to get a Numpy array.
+    :return: A random integer tensor.
+    """
+    unit = mx.sym.random_uniform(shape=shape) if return_symbol else np.random.uniform(size=shape)
+    return mx.sym.round(unit * max_value) if return_symbol else np.round(unit * max_value)
+
+
+def uniform_vector(shape, min_value=0, max_value=1, return_symbol=False):
+    """
+    Draws a tensor whose entries are uniformly distributed between min_value and max_value.
+
+    :param shape: shape of the tensor
+    :param min_value: minimum possible value
+    :param max_value: maximum possible value (exclusive)
+    :param return_symbol: True to get a mx.sym.Symbol, False to get a Numpy array
+    :return: A uniformly random tensor.
+    """
+    bounds = dict(low=min_value, high=max_value)
+    return mx.sym.random_uniform(shape=shape, **bounds) if return_symbol else np.random.uniform(size=shape, **bounds)
+
+
+def generate_random_sentence(vocab_size, max_len):
+    """
+    Generates a random "sentence" as a list of integers.
+
+    :param vocab_size: Number of regular words; because of the special words (BOS, EOS, UNK)
+                       this is *not* the maximum possible value.
+    :param max_len: maximum sentence length.
+    :return: List of 1 to max_len word ids, each in [3, vocab_size + 2].
+    """
+    first_word, last_word = 3, vocab_size + 2  # ids below 3 are taken by the special words
+    num_words = random.randint(1, max_len)
+    return [random.randint(first_word, last_word) for _ in range(num_words)]
+
+
+_DIGITS = "0123456789"
+
+
+def generate_digits_file(source_path: str, target_path: str, line_count: int = 100, line_length: int = 9,
+                         sort_target: bool = False):
+    """
+    Writes line_count lines of 1..line_length random digits to source_path; target_path gets them sorted or as-is.
+    """
+    with open(source_path, "w") as source_out, open(target_path, "w") as target_out:
+        for _ in range(line_count):
+            num_digits = random.randint(1, line_length)
+            source_digits = [random.choice(_DIGITS) for _ in range(num_digits)]
+            target_digits = sorted(source_digits) if sort_target else source_digits
+            source_out.write(" ".join(source_digits) + "\n")
+            target_out.write(" ".join(target_digits) + "\n")
+
+
+_TRAIN_PARAMS_COMMON = "--use-cpu --max-seq-len {max_len} --source {train_source} --target {train_target}" \
+                       " --validation-source {dev_source} --validation-target {dev_target} --output {model}"
+
+
+_TRANSLATE_PARAMS_COMMON = "--use-cpu --models {model} --input {input} --output {output}"
+
+
+def run_train_translate(train_params: str,
+                        translate_params: str,
+                        train_source_path: str,
+                        train_target_path: str,
+                        dev_source_path: str,
+                        dev_target_path: str,
+                        max_seq_len: int = 10,
+                        work_dir: Optional[str] = None) -> Tuple[float, float]:
+    """
+    Train a model and translate a dev set.  Report perplexity and BLEU.
+
+    :param train_params: Command line args for model training.
+    :param translate_params: Command line args for translation.
+    :param train_source_path: Path to the training source file.
+    :param train_target_path: Path to the training target file.
+    :param dev_source_path: Path to the validation source file; also the input that gets translated.
+    :param dev_target_path: Path to the validation target file; also the reference for BLEU.
+    :param max_seq_len: Value passed to training as --max-seq-len.
+    :param work_dir: Directory in which the temporary working directory is created (system default if None).
+    :return: (perplexity, bleu)
+    """
+    with TemporaryDirectory(dir=work_dir, prefix="test_train_translate.") as work_dir:
+
+        # Train model
+        model_path = os.path.join(work_dir, "model")
+        train_args = _TRAIN_PARAMS_COMMON.format(train_source=train_source_path,
+                                                 train_target=train_target_path,
+                                                 dev_source=dev_source_path,
+                                                 dev_target=dev_target_path,
+                                                 model=model_path,
+                                                 max_len=max_seq_len)
+        params = "{} {} {}".format(sockeye.train.__file__, train_args, train_params)
+        with patch.object(sys, "argv", params.split()):
+            sockeye.train.main()
+
+        # Translate corpus
+        out_path = os.path.join(work_dir, "out.txt")
+        translate_args = _TRANSLATE_PARAMS_COMMON.format(model=model_path, input=dev_source_path, output=out_path)
+        params = "{} {} {}".format(sockeye.translate.__file__, translate_args, translate_params)
+        with patch.object(sys, "argv", params.split()):
+            sockeye.translate.main()
+
+        # Measure perplexity
+        checkpoints = sockeye.utils.read_metrics_points(path=os.path.join(model_path, C.METRICS_NAME),
+                                                        model_path=model_path,
+                                                        metric=C.PERPLEXITY)
+        perplexity = checkpoints[-1][0]  # validation perplexity of the last checkpoint listed in the metrics file
+
+        # Measure BLEU (files are closed before the temporary directory is removed)
+        with open(out_path, "r") as hypotheses_in, open(dev_target_path, "r") as references_in:
+            bleu = sockeye.bleu.corpus_bleu(hypotheses_in.readlines(), references_in.readlines())
+
+        return perplexity, bleu
diff --git a/test/integration/__init__.py b/test/integration/__init__.py
new file mode 100644
index 000000000..3d9e97c1e
--- /dev/null
+++ b/test/integration/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
diff --git a/test/integration/test_seq_copy_int.py b/test/integration/test_seq_copy_int.py
new file mode 100644
index 000000000..e39c1411b
--- /dev/null
+++ b/test/integration/test_seq_copy_int.py
@@ -0,0 +1,66 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+import os
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from test.common import generate_digits_file, run_train_translate
+
+_TRAIN_LINE_COUNT = 100
+_DEV_LINE_COUNT = 10
+_LINE_MAX_LENGTH = 9
+
+
+@pytest.mark.parametrize("train_params, translate_params", [
+    # "Vanilla" LSTM encoder-decoder with attention
+    ("--encoder rnn --rnn-num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 16 --num-embed 8 --attention-type mlp"
+     " --attention-num-hidden 16 --batch-size 8 --loss cross-entropy --optimized-metric perplexity --max-updates 10"
+     " --checkpoint-frequency 10 --optimizer adam --initial-learning-rate 0.01",
+     "--beam-size 2"),
+    # "Kitchen sink" LSTM encoder-decoder with attention
+    ("--encoder rnn --rnn-num-layers 4 --rnn-cell-type lstm --rnn-num-hidden 16 --rnn-residual-connections"
+     " --num-embed 16 --attention-type coverage --attention-num-hidden 16 --weight-tying --attention-use-prev-word"
+     " --context-gating --layer-normalization --batch-size 8 --loss smoothed-cross-entropy"
+     " --smoothed-cross-entropy-alpha 0.1 --normalize-loss --optimized-metric perplexity --max-updates 10"
+     " --checkpoint-frequency 10 --dropout 0.1 --optimizer adam --initial-learning-rate 0.01",
+     "--beam-size 2"),
+    # Convolutional embedding encoder + LSTM encoder-decoder with attention
+    ("--encoder rnn-with-conv-embed --conv-embed-max-filter-width 3 --conv-embed-num-filters 4 4 8"
+     " --conv-embed-pool-stride 2 --conv-embed-num-highway-layers 1 --rnn-num-layers 1 --rnn-cell-type lstm"
+     " --rnn-num-hidden 16 --num-embed 8 --attention-num-hidden 16 --batch-size 8 --loss cross-entropy"
+     " --optimized-metric perplexity --max-updates 10 --checkpoint-frequency 10 --optimizer adam"
+     " --initial-learning-rate 0.01",
+     "--beam-size 2"),
+])
+def test_seq_copy(train_params, translate_params):
+    """Smoke test on the digit-copy task: training and translation must run; the scores are not checked."""
+    with TemporaryDirectory(prefix="test_seq_copy") as work_dir:
+        # Simple digits files for train/dev data
+        train_source_path = os.path.join(work_dir, "train.src")
+        train_target_path = os.path.join(work_dir, "train.tgt")
+        dev_source_path = os.path.join(work_dir, "dev.src")
+        dev_target_path = os.path.join(work_dir, "dev.tgt")
+        generate_digits_file(train_source_path, train_target_path, _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH)
+        generate_digits_file(dev_source_path, dev_target_path, _DEV_LINE_COUNT, _LINE_MAX_LENGTH)
+        # Test model configuration
+        # Ignore return values (perplexity and BLEU) for integration test
+        run_train_translate(train_params,
+                            translate_params,
+                            train_source_path,
+                            train_target_path,
+                            dev_source_path,
+                            dev_target_path,
+                            max_seq_len=_LINE_MAX_LENGTH + 1,
+                            work_dir=work_dir)
diff --git a/test/system/__init__.py b/test/system/__init__.py
new file mode 100644
index 000000000..3d9e97c1e
--- /dev/null
+++ b/test/system/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
diff --git a/test/system/pytest.ini b/test/system/pytest.ini
new file mode 100644
index 000000000..ca0c9f171
--- /dev/null
+++ b/test/system/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = -s
diff --git a/test/system/test_seq_copy_sys.py b/test/system/test_seq_copy_sys.py
new file mode 100644
index 000000000..4095cd25f
--- /dev/null
+++ b/test/system/test_seq_copy_sys.py
@@ -0,0 +1,88 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+import os
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from test.common import generate_digits_file, run_train_translate
+
+
+_TRAIN_LINE_COUNT = 10000
+_DEV_LINE_COUNT = 100
+_LINE_MAX_LENGTH = 9
+
+
+@pytest.mark.parametrize("train_params, translate_params, perplexity_thresh, bleu_thresh", [
+    # "Vanilla" LSTM encoder-decoder with attention
+    ("--encoder rnn --rnn-num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 64 --num-embed 32 --attention-type mlp"
+     " --attention-num-hidden 32 --batch-size 16 --loss cross-entropy --optimized-metric perplexity --max-updates 10000"
+     " --checkpoint-frequency 1000 --optimizer adam --initial-learning-rate 0.001",
+     "--beam-size 5",
+     1.01,
+     0.98),
+])
+def test_seq_copy(train_params, translate_params, perplexity_thresh, bleu_thresh):
+    """
+    Task: copy short sequences of digits.
+    The trained model must reach perplexity <= perplexity_thresh and BLEU >= bleu_thresh on the dev set.
+    """
+    with TemporaryDirectory(prefix="test_seq_copy.") as work_dir:
+        # Simple digits files for train/dev data, all located in the temporary directory
+        data = {name: os.path.join(work_dir, name) for name in ("train.src", "train.tgt", "dev.src", "dev.tgt")}
+        generate_digits_file(data["train.src"], data["train.tgt"], _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH)
+        generate_digits_file(data["dev.src"], data["dev.tgt"], _DEV_LINE_COUNT, _LINE_MAX_LENGTH)
+        # Train, translate and score the model configuration
+        perplexity, bleu = run_train_translate(train_params,
+                                               translate_params,
+                                               train_source_path=data["train.src"],
+                                               train_target_path=data["train.tgt"],
+                                               dev_source_path=data["dev.src"],
+                                               dev_target_path=data["dev.tgt"],
+                                               max_seq_len=_LINE_MAX_LENGTH + 1,
+                                               work_dir=work_dir)
+        assert perplexity <= perplexity_thresh
+        assert bleu >= bleu_thresh
+
+
+@pytest.mark.parametrize("train_params, translate_params, perplexity_thresh, bleu_thresh", [
+    # "Vanilla" LSTM encoder-decoder with attention
+    ("--encoder rnn --rnn-num-layers 1 --rnn-cell-type lstm --rnn-num-hidden 64 --num-embed 32 --attention-type mlp"
+     " --attention-num-hidden 32 --batch-size 16 --loss cross-entropy --optimized-metric perplexity --max-updates 10000"
+     " --checkpoint-frequency 1000 --optimizer adam --initial-learning-rate 0.001",
+     "--beam-size 5",
+     1.01,
+     0.98),
+])
+def test_seq_sort(train_params, translate_params, perplexity_thresh, bleu_thresh):
+    """
+    Task: sort short sequences of digits.
+    The trained model must reach perplexity <= perplexity_thresh and BLEU >= bleu_thresh on the dev set.
+    """
+    with TemporaryDirectory(prefix="test_seq_sort.") as work_dir:
+        # Simple digits files for train/dev data (targets are the sorted sources), all in the temporary directory
+        data = {name: os.path.join(work_dir, name) for name in ("train.src", "train.tgt", "dev.src", "dev.tgt")}
+        generate_digits_file(data["train.src"], data["train.tgt"], _TRAIN_LINE_COUNT, _LINE_MAX_LENGTH, sort_target=True)
+        generate_digits_file(data["dev.src"], data["dev.tgt"], _DEV_LINE_COUNT, _LINE_MAX_LENGTH, sort_target=True)
+        # Train, translate and score the model configuration
+        perplexity, bleu = run_train_translate(train_params,
+                                               translate_params,
+                                               train_source_path=data["train.src"],
+                                               train_target_path=data["train.tgt"],
+                                               dev_source_path=data["dev.src"],
+                                               dev_target_path=data["dev.tgt"],
+                                               max_seq_len=_LINE_MAX_LENGTH + 1,
+                                               work_dir=work_dir)
+        assert perplexity <= perplexity_thresh
+        assert bleu >= bleu_thresh
diff --git a/test/unit/__init__.py b/test/unit/__init__.py
new file mode 100644
index 000000000..3d9e97c1e
--- /dev/null
+++ b/test/unit/__init__.py
@@ -0,0 +1,12 @@
+# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You may not
+# use this file except in compliance with the License. A copy of the License
+# is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is distributed on
+# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
diff --git a/test/test_arguments.py b/test/unit/test_arguments.py
similarity index 100%
rename from test/test_arguments.py
rename to test/unit/test_arguments.py
diff --git a/test/test_attention.py b/test/unit/test_attention.py
similarity index 99%
rename from test/test_attention.py
rename to test/unit/test_attention.py
index b5dd3eb8d..f643fc99f 100644
--- a/test/test_attention.py
+++ b/test/unit/test_attention.py
@@ -18,7 +18,7 @@
 import sockeye.attention
 import sockeye.constants as C
 import sockeye.coverage
-from test.test_utils import gaussian_vector, integer_vector
+from test.common import gaussian_vector, integer_vector
 
 attention_types = [C.ATT_BILINEAR, C.ATT_DOT, C.ATT_DOT_SCALED, C.ATT_LOC, C.ATT_MLP]
 
diff --git a/test/test_average.py b/test/unit/test_average.py
similarity index 100%
rename from test/test_average.py
rename to test/unit/test_average.py
diff --git a/test/test_bleu.py b/test/unit/test_bleu.py
similarity index 100%
rename from test/test_bleu.py
rename to test/unit/test_bleu.py
diff --git a/test/test_callback.py b/test/unit/test_callback.py
similarity index 100%
rename from test/test_callback.py
rename to test/unit/test_callback.py
diff --git a/test/test_checkpoint.py b/test/unit/test_checkpoint.py
similarity index 98%
rename from test/test_checkpoint.py
rename to test/unit/test_checkpoint.py
index 1f9fa101f..cdbaf80bf 100644
--- a/test/test_checkpoint.py
+++ b/test/unit/test_checkpoint.py
@@ -13,7 +13,7 @@
 
 from math import isclose
 import tempfile
-from test.test_utils import generate_random_sentence
+from test.common import generate_random_sentence
 
 import sockeye.data_io
 import mxnet as mx
diff --git a/test/test_config.py b/test/unit/test_config.py
similarity index 100%
rename from test/test_config.py
rename to test/unit/test_config.py
diff --git a/test/test_coverage.py b/test/unit/test_coverage.py
similarity index 99%
rename from test/test_coverage.py
rename to test/unit/test_coverage.py
index ecb2c6bf1..3d285ca55 100644
--- a/test/test_coverage.py
+++ b/test/unit/test_coverage.py
@@ -16,7 +16,7 @@
 import numpy as np
 import pytest
 import sockeye.coverage
-from test.test_utils import gaussian_vector, integer_vector, uniform_vector
+from test.common import gaussian_vector, integer_vector, uniform_vector
 
 activation_types = ["tanh", "sigmoid", "relu", "softrelu"]
 
diff --git a/test/test_data_io.py b/test/unit/test_data_io.py
similarity index 100%
rename from test/test_data_io.py
rename to test/unit/test_data_io.py
diff --git a/test/test_decoder.py b/test/unit/test_decoder.py
similarity index 98%
rename from test/test_decoder.py
rename to test/unit/test_decoder.py
index 8b4377678..ce3019501 100644
--- a/test/test_decoder.py
+++ b/test/unit/test_decoder.py
@@ -19,7 +19,7 @@
 import sockeye.constants as C
 import sockeye.coverage
 import sockeye.decoder
-from test.test_utils import gaussian_vector, integer_vector
+from test.common import gaussian_vector, integer_vector
 
 step_tests = [(C.GRU_TYPE, True), (C.LSTM_TYPE, False)]
 
diff --git a/test/test_encoder.py b/test/unit/test_encoder.py
similarity index 100%
rename from test/test_encoder.py
rename to test/unit/test_encoder.py
diff --git a/test/test_layers.py b/test/unit/test_layers.py
similarity index 100%
rename from test/test_layers.py
rename to test/unit/test_layers.py
diff --git a/test/test_loss.py b/test/unit/test_loss.py
similarity index 100%
rename from test/test_loss.py
rename to test/unit/test_loss.py
diff --git a/test/test_lr_scheduler.py b/test/unit/test_lr_scheduler.py
similarity index 100%
rename from test/test_lr_scheduler.py
rename to test/unit/test_lr_scheduler.py
diff --git a/test/test_output_handler.py b/test/unit/test_output_handler.py
similarity index 100%
rename from test/test_output_handler.py
rename to test/unit/test_output_handler.py
diff --git a/test/test_params.py b/test/unit/test_params.py
similarity index 100%
rename from test/test_params.py
rename to test/unit/test_params.py
diff --git a/test/test_rnn.py b/test/unit/test_rnn.py
similarity index 100%
rename from test/test_rnn.py
rename to test/unit/test_rnn.py
diff --git a/test/test_translate.py b/test/unit/test_translate.py
similarity index 100%
rename from test/test_translate.py
rename to test/unit/test_translate.py
diff --git a/test/test_utils.py b/test/unit/test_utils.py
similarity index 74%
rename from test/test_utils.py
rename to test/unit/test_utils.py
index 3bfa0e903..48bbc47f9 100644
--- a/test/test_utils.py
+++ b/test/unit/test_utils.py
@@ -12,9 +12,7 @@
 # permissions and limitations under the License.
 
 import sockeye.utils
-import mxnet as mx
 import numpy as np
-import random
 import pytest
 from sockeye.utils import check_condition, SockeyeError
 
@@ -31,58 +29,6 @@ def test_get_alignments():
         assert alignment == expected_alignment
 
 
-def gaussian_vector(shape, return_symbol=False):
-    """
-    Generates random normal tensors (diagonal covariance)
-    
-    :param shape: shape of the tensor.
-    :param return_symbol: True if the result should be a Symbol, False if it should be an Numpy array.
-    :return: A gaussian tensor.
-    """
-    return mx.sym.random_normal(shape=shape) if return_symbol else np.random.normal(size=shape)
-
-
-def integer_vector(shape, max_value, return_symbol=False):
-    """
-    Generates a random positive integer tensor
-    
-    :param shape: shape of the tensor.
-    :param max_value: maximum integer value.
-    :param return_symbol: True if the result should be a Symbol, False if it should be an Numpy array.
-    :return: A random integer tensor.
-    """
-    return mx.sym.round(mx.sym.random_uniform(shape=shape) * max_value) if return_symbol \
-        else np.round(np.random.uniform(size=shape) * max_value)
-
-
-def uniform_vector(shape, min_value=0, max_value=1, return_symbol=False):
-    """
-    Generates a uniformly random tensor
-    
-    :param shape: shape of the tensor
-    :param min_value: minimum possible value
-    :param max_value: maximum possible value (exclusive)
-    :param return_symbol: True if the result should be a mx.sym.Symbol, False if it should be a Numpy array
-    :return: 
-    """
-    return mx.sym.random_uniform(low=min_value, high=max_value, shape=shape) if return_symbol \
-        else np.random.uniform(low=min_value, high=max_value, size=shape)
-
-
-def generate_random_sentence(vocab_size, max_len):
-    """
-    Generates a random "sentence" as a list of integers.
-
-    :param vocab_size: Number of words in the "vocabulary". Note that due to
-                       the inclusion of special words (BOS, EOS, UNK) this does *not*
-                       correspond to the maximum possible value.
-    :param max_len: maximum sentence length.
-    """
-    length = random.randint(1, max_len)
-    # Due to the special words, the actual words start at index 3 and go up to vocab_size+2
-    return [random.randint(3, vocab_size + 2) for _ in range(length)]
-
-
 device_params = [([-4, 3, 5], 6, [0, 1, 2, 3, 4, 5]),
                  ([-2, 3, -2, 5], 6, [0, 1, 2, 3, 4, 5]),
                  ([-1], 1, [0]),
diff --git a/test/test_vocab.py b/test/unit/test_vocab.py
similarity index 100%
rename from test/test_vocab.py
rename to test/unit/test_vocab.py