From 6272bbcdc670639f9508026da9b85698a3a29257 Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Tue, 14 May 2024 12:10:19 -0400 Subject: [PATCH] Testing: Correct perf parsing of C++ output This patch fixes errors in the parser of perf output. Previously, each record was assumed to be separated by spaces, but this failed for more generic records (usually from C++) which included signatures (such as `f(a, b)`) or templates (such as `f<T>`). Nested constructs were also possible. This is fixed by introducing a simple tokenizer which extracts `<`, `>`, `(`, `)`, and whitespace from the output, then rebuilds the records by combining any tokens and whitespace which appear inside of delimiters. This patch should hopefully resolve the CI errors in GitHub Actions. --- .testing/tools/parse_perf.py | 60 +++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/.testing/tools/parse_perf.py b/.testing/tools/parse_perf.py index 7cbffd995d..76c6be5bcb 100755 --- a/.testing/tools/parse_perf.py +++ b/.testing/tools/parse_perf.py @@ -3,10 +3,20 @@ import collections import json import os +import re import shlex import subprocess import sys +perf_scanner = re.Scanner([ + (r'<', lambda scanner, token: token), + (r'>', lambda scanner, token: token), + (r'\(', lambda scanner, token: token), + (r'\)', lambda scanner, token: token), + (r'[ \t]+', lambda scanner, token: token), + (r'[^<>() \t]+', lambda scanner, token: token), +]) + def main(): desc = 'Parse perf.data and return in JSON format.' 
@@ -58,15 +68,55 @@ def parse_perf_report(perf_data_path): # get per-symbol count else: + tokens, remainder = perf_scanner.scan(line) + if remainder: + print('Line could not be tokenized', file=sys.stderr) + print(' line:', repr(line), file=sys.stderr) + print(' tokens:', tokens, file=sys.stderr) + print(' remainder:', remainder, file=sys.stderr) + sys.exit(os.EX_DATAERR) + + # Construct record from tokens + # (NOTE: Not a proper grammar, just dumb bracket counting) + record = [] + bracks = 0 + parens = 0 + + for tok in tokens: + if tok == '<': + bracks += 1 + + if tok == '(': + parens += 1 + + rec = record[-1] if record else None + + inside_bracket = rec and (bracks > 0 or parens > 0) + lead_rec = tok in '<(' and rec and not rec.isspace() + tail_rec = not tok.isspace() and rec and rec[-1] in '>)' + + if inside_bracket or lead_rec or tail_rec: + record[-1] += tok + else: + record.append(tok) + + if tok == '>': + bracks -= 1 + if tok == '(': + parens -= 1 + + # Strip any whitespace tokens + record = [rec for rec in record if not rec.isspace()] + try: - tokens = line.split() - symbol = tokens[2] - period = int(tokens[3]) - except ValueError: + symbol = record[2] + period = int(record[3]) + except: print("parse_perf.py: Error extracting symbol count", - file=sys.stderr) + file=sys.stderr) print("line:", repr(line), file=sys.stderr) print("tokens:", tokens, file=sys.stderr) + print("record:", record, file=sys.stderr) raise profile[event_name]['symbol'][symbol] = period