added tests of error merging and sorting

Yelp · Jan 30, 2016 · be803fc · be803fc
1 parent 9ac3a77
commit be803fc
Show file tree

Hide file tree

Showing 2 changed files with 218 additions and 19 deletions.
diff --git a/mrjob/logs/errors.py b/mrjob/logs/errors.py
@@ -23,37 +23,31 @@ def _pick_error(log_interpretation):
     step, history, and task interpretations. Returns None if there
     are no errors.
     """
-    error_lists = [
-        log_interpretation.get(log_type, {}).get('errors')
-        for log_type in 'step', 'history', 'task'
-    ]
+    def yield_errors():
+        for log_type in ('step', 'history', 'task'):
+            errors = log_interpretation.get(log_type, {}).get('errors')
+            for error in errors or ():
+                yield error
 
-    errors = _merge_and_sort_errors(error_lists)
+    errors = _merge_and_sort_errors(yield_errors())
     if errors:
         return errors[0]
     else:
         return None
 
 
-def _merge_and_sort_errors(error_lists):
+def _merge_and_sort_errors(errors):
     """Merge errors with the same sort key into one dict (values from
     later errors replace earlier ones) and return them, sorted by recency.
 
     *errors* is a flat iterable of error dicts (None is not accepted).
     """
     key_to_error = {}
 
-    for errors in error_lists:
-        if errors is None:
-            continue
+    for error in errors:
+        key = _time_sort_key(error)
+        key_to_error.setdefault(key, {})
+        key_to_error[key].update(error)
 
-        # catch common interface errors
-        if isinstance(errors, (dict, string_types)):
-            raise TypeError
-
-        for error in errors:
-            key = _time_sort_key(error)
-            key_to_error.setdefault(key, {})
-            key_to_error[key].update(error)
-
-    return [error for key, error in sorted(errors.items(), reverse=True)]
+    return [error for key, error in
+            sorted(key_to_error.items(), reverse=True)]
diff --git a/tests/logs/test_error.py b/tests/logs/test_error.py
@@ -0,0 +1,205 @@
+# Copyright 2015 Yelp
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from mrjob.logs.errors import _merge_and_sort_errors
+from mrjob.logs.errors import _pick_error
+
+from tests.py2 import TestCase
+
+
+class PickErrorTestCase(TestCase):
+
+    def test_empty(self):
+        self.assertEqual(_pick_error({}), None)
+        # make sure we can handle log interpretations with no 'errors' key
+        self.assertEqual(_pick_error(dict(history={})), None)
+
+    def test_pick_most_recent_error(self):
+        log_interpretation=dict(
+            history=dict(
+                errors=[
+                    dict(
+                        container_id='container_1450486922681_0005_01_000003',
+                        hadoop_error=dict(message='BOOM'),
+                        task_error=dict(message='things exploding'),
+                    ),
+                    dict(
+                        container_id='container_1450486922681_0005_01_000004',
+                        hadoop_error=dict(message='elephant problems'),
+                    ),
+                ],
+            ),
+        )
+
+        self.assertEqual(
+            _pick_error(log_interpretation),
+            dict(
+                container_id='container_1450486922681_0005_01_000004',
+                hadoop_error=dict(message='elephant problems'),
+            ),
+        )
+
+    def test_merge_order(self):
+        # task logs usually have the best info, so they're merged last and win
+        log_interpretation = dict(
+            step=dict(
+                errors=[
+                    dict(
+                        container_id='container_1450486922681_0005_01_000004',
+                        hadoop_error=dict(message='BOOM'),
+                    ),
+                ],
+            ),
+            history=dict(
+                errors=[
+                    dict(
+                        container_id='container_1450486922681_0005_01_000004',
+                        hadoop_error=dict(
+                            message='BOOM',
+                            path='history.jhist',
+                        ),
+                        split=dict(path='snake_facts.txt'),
+                    ),
+                ],
+            ),
+            task=dict(
+                errors=[
+                    dict(
+                        container_id='container_1450486922681_0005_01_000004',
+                        hadoop_error=dict(
+                            message='BOOM',
+                            path='some_syslog',
+                        ),
+                        task_error=dict(
+                            message='exploding snakes, now?!',
+                            path='some_stderr',
+                        ),
+                    ),
+                ],
+            )
+        )
+
+        self.assertEqual(
+            _pick_error(log_interpretation),
+            dict(
+                container_id='container_1450486922681_0005_01_000004',
+                hadoop_error=dict(
+                    message='BOOM',
+                    path='some_syslog',
+                ),
+                split=dict(path='snake_facts.txt'),
+                task_error=dict(
+                    message='exploding snakes, now?!',
+                    path='some_stderr',
+                ),
+            ),
+        )
+
+
+
+
+
+
+class MergeAndSortErrorsTestCase(TestCase):
+
+    def test_empty(self):
+        self.assertEqual(_merge_and_sort_errors([]), [])
+
+    def test_single_error(self):
+        error = dict(
+            container_id='container_1450486922681_0005_01_000003',
+            hadoop_error=dict(message='BOOM'),
+        )
+
+        self.assertEqual(_merge_and_sort_errors([error]), [error])
+
+    def test_merge_errors(self):
+        errors = [
+            dict(
+                container_id='container_1450486922681_0005_01_000003',
+                hadoop_error=dict(message='BOOM')
+            ),
+            dict(  # from a different container, shouldn't be merged
+                container_id='container_1450486922681_0005_01_000004',
+                hadoop_error=dict(message='bad stuff, maybe?')
+            ),
+            dict(
+                container_id='container_1450486922681_0005_01_000003',
+                hadoop_error=dict(
+                    message='BOOM',
+                    path='history.jhist',
+                ),
+                split=dict(path='tricky_input')
+            ),
+            dict(
+                container_id='container_1450486922681_0005_01_000003',
+                hadoop_error=dict(
+                    message='BOOM\n',
+                    path='some_syslog',
+                ),
+                task_error=dict(
+                    message='it was probably snakes',
+                    path='some_stderr',
+                ),
+            ),
+        ]
+
+        self.assertEqual(
+            _merge_and_sort_errors(errors),
+            [
+                dict(
+                    container_id='container_1450486922681_0005_01_000004',
+                    hadoop_error=dict(message='bad stuff, maybe?')
+                ),
+                dict(
+                    container_id='container_1450486922681_0005_01_000003',
+                    hadoop_error=dict(
+                        message='BOOM\n',
+                        path='some_syslog',
+                    ),
+                    split=dict(path='tricky_input'),
+                    task_error=dict(
+                        message='it was probably snakes',
+                        path='some_stderr'
+                    ),
+                )
+            ])
+
+    def test_can_merge_with_incomplete_ids(self):
+        # one error lacks the task_id the other has; they should still merge
+        # (shouldn't happen if _interpret_*() methods are written correctly)
+
+        errors = [
+            dict(
+                attempt_id='attempt_201512232143_0008_r_000000_0',
+                hadoop_error=dict(message='BOOM'),
+                split=dict(path='trade_secrets.dat'),
+            ),
+            dict(
+                attempt_id='attempt_201512232143_0008_r_000000_0',
+                hadoop_error=dict(message='BOOM'),
+                task_id='task_201512232143_0008_r_000000',
+            ),
+        ]
+
+        self.assertEqual(
+            _merge_and_sort_errors(errors),
+            [
+                dict(
+                    attempt_id='attempt_201512232143_0008_r_000000_0',
+                    hadoop_error=dict(message='BOOM'),
+                    split=dict(path='trade_secrets.dat'),
+                    task_id='task_201512232143_0008_r_000000',
+                ),
+            ]
+        )