triage: bail on certain global clusters after 30s

triage works by clustering test failures in two stages:

- locally: create clusters of test failures for each unique test
- globally: merge each test's clusters into a global set of clusters

The clustering/merging is done by computing the edit distance between the failure text of each test failure or failure cluster, and accepting the first pair whose edit distance is within 10% of their combined length. This cost adds up in the worst case, where the edit distance is computed against every existing cluster before a new cluster is created.

We've arbitrarily handled it thus far by:

- truncating failure text to ~~200k~~ 10k chars
- bailing out on local clustering after 60s per unique test

This PR adds:

- bailing out on global clustering of pathological / low-value clusters after 30s
- more logging to see where clustering is working vs. not
adshmh · May 20, 2020 · cda33e7 · cda33e7
1 parent 8da074a
commit cda33e7
Showing 1 changed file with 30 additions and 8 deletions.
diff --git a/triage/summarize.py b/triage/summarize.py
@@ -49,6 +49,10 @@
     r'|(?<=minion-group-|default-pool-)[-0-9a-z]{4,}'  # node names
 )
 
+LONG_OUTPUT_LEN = 10000
+TRUNCATED_SEP = '\n...[truncated]...\n'
+MAX_CLUSTER_TEXT_LEN = LONG_OUTPUT_LEN + len(TRUNCATED_SEP)
+
 
 def normalize(s):
     """
@@ -84,12 +88,12 @@ def repl(m):
 
     s = flakeReasonOrdinalRE.sub(repl, s)
 
-    if len(s) > 10000:
+    if len(s) > LONG_OUTPUT_LEN:
         # for long strings, remove repeated lines!
         s = re.sub(r'(?m)^(.*\n)\1+', r'\1', s)
 
-    if len(s) > 10000:  # ridiculously long test output
-        s = s[:5000] + '\n...[truncated]...\n' + s[-5000:]
+    if len(s) > LONG_OUTPUT_LEN:  # ridiculously long test output
+        s = s[:int(LONG_OUTPUT_LEN/2)] + TRUNCATED_SEP + s[-int(LONG_OUTPUT_LEN/2):]
 
     return s
 
@@ -287,7 +291,7 @@ def cluster_local(failed_tests):
                    reverse=True),
             1):
         num_failures += len(tests)
-        logging.info('%4d/%4d, %d failures, %s', n, len(failed_tests), len(tests), test_name)
+        logging.info('%4d/%4d tests, %5d failures, %s', n, len(failed_tests), len(tests), test_name)
         sys.stdout.flush()
         clustered[test_name] = cluster_test(tests)
     elapsed = time.time() - start
@@ -333,19 +337,37 @@ def cluster_global(clustered, previous_clustered):
                    key=lambda kv: sum(len(x) for x in kv[1].values()),
                    reverse=True),
             1):
-        logging.info('%4d/%4d, %d clusters, %s', n, len(clustered), len(test_clusters), test_name)
+        logging.info('%4d/%4d tests, %4d clusters, %s', n, len(clustered), len(test_clusters), test_name)
+        test_start = time.time()
         # Look at clusters with the most failures first
-        for key, tests in sorted(test_clusters.items(),
-                                 key=lambda x: len(x[1]), reverse=True):
-            num_failures += len(tests)
+        for m, (key, tests) in enumerate(
+                sorted(test_clusters.items(),
+                       key=lambda x: len(x[1]),
+                       reverse=True),
+                1):
+            cluster_start = time.time()
+            ftext_len = len(key)
+            num_clusters = len(test_clusters)
+            num_tests = len(tests)
+            cluster_case = ""
+            logging.info('  %4d/%4d clusters, %5d chars failure text, %5d failures ...', m, num_clusters, ftext_len, num_tests)
+            num_failures += num_tests
             if key in clusters:
+                cluster_case = "EXISTING"
                 clusters[key].setdefault(test_name, []).extend(tests)
+            # if we've taken longer than 30 seconds for this test, bail on pathological / low value cases
+            elif time.time() > test_start + 30 and ftext_len > MAX_CLUSTER_TEXT_LEN/2 and num_tests == 1:
+                cluster_case = "BAILED"
             else:
                 other = find_match(key, clusters)
                 if other:
+                    cluster_case = "OTHER"
                     clusters[other].setdefault(test_name, []).extend(tests)
                 else:
+                    cluster_case = "NEW"
                     clusters[key] = {test_name: list(tests)}
+            cluster_dur = time.time() - cluster_start
+            logging.info('  %4d/%4d clusters, %5d chars failure text, %5d failures, cluster:%s in %d sec, test: %s', m, num_clusters, ftext_len, num_tests, cluster_case, cluster_dur, test_name)
 
     # If we seeded clusters using the previous run's keys, some of those
     # clusters may have disappeared. Remove the resulting empty entries.