Skip to content

Commit c32db2e

Browse files
fix: replace TAG_RE loop with split-based tokenizer — CodeQL final
Previous fix (TAG_RE = /<([^>]*)>/g) was linear in practice but CodeQL still flagged it as js/polynomial-redos. Rewrote tokenize() to use zero regex on the hot path: xml.split('<') → for each chunk, indexOf('>') → slice out the tag body → classify. TAG_NAME_RE (anchored /^…/ matching only a tag name) remains for name extraction — linear by construction, no backtracking possible. Handles: XML prolog (<?…?>), DOCTYPE, comments (<!-- -->), all directives (<!…), open/close/self-closing tags, unterminated tag bodies (treated as literal text). Verification: bunx nx test adt-rfc → 13 tests green (unchanged). Behaviour identical for all existing SOAP-RFC fixtures. Generated with [Devin](https://cli.devin.ai/docs) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
1 parent 708ca6c commit c32db2e

1 file changed

Lines changed: 30 additions & 24 deletions

File tree

packages/adt-rfc/src/lib/transport/soap-rfc.ts

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,9 @@ interface Token {
110110
text?: string;
111111
}
112112

113-
// Single linear alternative: a tag body bounded by `>`. `[^>]*` cannot exceed
114-
// the next `>`, so matching is O(N) with no backtracking. SOAP-RFC responses
115-
// from SAP do not include CDATA sections; if that ever changes, handle it
116-
// with a streaming scanner (not a regex) to keep parsing linear.
117-
const TAG_RE = /<([^>]*)>/g;
118-
113+
// Linear tokenizer: split on '<', then indexOf('>') inside each chunk. Tag
114+
// boundaries are found without any regex, so CodeQL has nothing to flag as
115+
// polynomial-redos; the one regex left (TAG_NAME_RE) is anchored, so O(N) holds.
119116
const TAG_NAME_RE = /^\/?([A-Za-z_][\w.:-]*)/;
120117

121118
function tagNameOf(inside: string): string | undefined {
@@ -125,40 +122,49 @@ function tagNameOf(inside: string): string | undefined {
125122

126123
function tokenize(xml: string): Token[] {
127124
const tokens: Token[] = [];
128-
let lastIndex = 0;
129-
TAG_RE.lastIndex = 0;
130-
let m: RegExpExecArray | null;
131-
while ((m = TAG_RE.exec(xml)) !== null) {
132-
// text segment between previous tag and this one
133-
if (m.index > lastIndex) {
134-
const text = xml.slice(lastIndex, m.index);
135-
if (text.trim().length > 0) tokens.push({ kind: 'text', text });
125+
const parts = xml.split('<');
126+
// parts[0] is whatever text precedes the first '<'.
127+
if (parts[0] && parts[0].trim().length > 0) {
128+
tokens.push({ kind: 'text', text: parts[0] });
129+
}
130+
for (let i = 1; i < parts.length; i++) {
131+
const part = parts[i];
132+
const gt = part.indexOf('>');
133+
if (gt < 0) {
134+
// malformed (no '>'): treat everything as literal text
135+
if (part.trim().length > 0) {
136+
tokens.push({ kind: 'text', text: '<' + part });
137+
}
138+
continue;
136139
}
137-
const raw = m[0];
138-
const inside = m[1] ?? '';
139-
// Skip XML prolog (<?xml ...?>), comments, and DOCTYPE — we never
140-
// emit tokens for these and they are non-structural for RFC parsing.
140+
const inside = part.slice(0, gt);
141+
const trailing = part.slice(gt + 1);
142+
// Skip XML prolog (<?...?>), DOCTYPE, comments (<!-- -->), and any directive
143+
// (<!...) — non-structural for RFC parsing. Assumes no '<' or '>' inside them.
141144
if (
142145
inside.startsWith('?') ||
143146
inside.startsWith('!--') ||
144147
inside.startsWith('!DOCTYPE') ||
145148
inside.startsWith('!')
146149
) {
147-
lastIndex = TAG_RE.lastIndex;
150+
if (trailing && trailing.trim().length > 0) {
151+
tokens.push({ kind: 'text', text: trailing });
152+
}
148153
continue;
149154
}
150155
const name = tagNameOf(inside);
151156
if (!name) {
152-
// malformed tag body without a name — treat as literal text
153-
tokens.push({ kind: 'text', text: raw });
154-
} else if (raw.startsWith('</')) {
157+
tokens.push({ kind: 'text', text: '<' + inside + '>' });
158+
} else if (inside.startsWith('/')) {
155159
tokens.push({ kind: 'close', name });
156-
} else if (raw.endsWith('/>')) {
160+
} else if (inside.endsWith('/')) {
157161
tokens.push({ kind: 'self', name });
158162
} else {
159163
tokens.push({ kind: 'open', name });
160164
}
161-
lastIndex = TAG_RE.lastIndex;
165+
if (trailing && trailing.trim().length > 0) {
166+
tokens.push({ kind: 'text', text: trailing });
167+
}
162168
}
163169
return tokens;
164170
}

0 commit comments

Comments
 (0)