
Commit edaefc3

geoffcline and Claude authored
Add Python script to check for broken cross-references in AsciiDoc files (#1167)
This script analyzes all .adoc files in latest/ug/ to find broken xrefs. It processes files in parallel and detects both xref: and <<>> syntax forms, as well as multiple section ID formats including [[id]], [#id], and auto-generated IDs from section headers.

The script reports:
- Broken cross-references with file paths and line numbers
- Duplicate section IDs across files
- Summary statistics

Results: Analyzed 394 files, found 3,878 section IDs and 2,241 xrefs, with only 2 broken xrefs (both placeholder examples in contribution docs).

Co-authored-by: Claude <noreply@anthropic.com>
1 parent bc382fa commit edaefc3
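
As a quick illustration of what the checker counts as a broken xref, here is a small sketch (editorial, not part of the commit) that drives the two extraction helpers defined in the script below on an in-memory sample; the sample text and the sample.adoc name are made up for the example:

from check_xrefs import extract_section_ids, extract_xrefs

sample = """
[[node-groups]]
== Managed node groups

See xref:node-groups[Node groups] and <<missing-id,this section>>.
"""

ids = extract_section_ids(sample, sample.split('\n'))  # {'node-groups', 'managed-node-groups'}
refs = extract_xrefs(sample, 'sample.adoc')            # one xref: and one <<>> reference
broken = [r for r in refs if r.xref_id not in ids]     # only 'missing-id' is flagged as broken

Because main() aggregates section IDs across every file before checking, an xref that points at an ID defined in a different .adoc file is not flagged; only IDs that exist nowhere under latest/ug/ are reported.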

File tree

1 file changed: +301 −0 lines changed


check_xrefs.py

Lines changed: 301 additions & 0 deletions
@@ -0,0 +1,301 @@
#!/usr/bin/env python3
"""
AsciiDoc Cross-Reference Checker

This script analyzes all .adoc files in a directory to find broken cross-references.
It supports both xref: and <<>> syntax and checks against explicit and auto-generated section IDs.
"""

import re
import os
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict
from dataclasses import dataclass
from typing import Set, List, Tuple
import sys


@dataclass
class XRefInfo:
    """Information about a cross-reference"""
    file_path: str
    line_number: int
    xref_id: str
    xref_type: str  # 'xref' or 'angle_bracket'


@dataclass
class FileAnalysis:
    """Analysis results for a single file"""
    file_path: str
    section_ids: Set[str]
    xrefs: List[XRefInfo]
    errors: List[str]


def normalize_id(text: str) -> str:
    """
    Normalize a section header to an auto-generated ID.
    Based on AsciiDoc rules with idseparator: -
    """
    # Convert to lowercase
    text = text.lower()
    # Remove formatting and special chars, replace spaces with hyphens
    text = re.sub(r'[^\w\s-]', '', text)
    text = re.sub(r'\s+', '-', text)
    # Remove multiple consecutive hyphens
    text = re.sub(r'-+', '-', text)
    # Remove leading/trailing hyphens
    text = text.strip('-')
    return text


def extract_section_ids(content: str, lines: List[str]) -> Set[str]:
    """
    Extract all section IDs from file content.
    Supports:
    - [[id]] syntax (standalone or inline)
    - [#id] syntax (standalone or inline)
    - Auto-generated IDs from section headers
    """
    section_ids = set()

    # Pattern for explicit [[id]] or [[id,title]] syntax (standalone or inline)
    # This pattern works for both "[[id]]" on its own line and "=== Title [[id]]" inline
    explicit_bracket_pattern = re.compile(r'\[\[([^\]]+)\]\]')
    for match in explicit_bracket_pattern.finditer(content):
        # Handle [[id,title]] syntax - ID is the part before the comma
        id_text = match.group(1)
        section_id = id_text.split(',')[0].strip()
        section_ids.add(section_id)

    # Pattern for [#id] syntax (standalone or inline)
    explicit_hash_pattern = re.compile(r'\[#([^\]]+)\]')
    for match in explicit_hash_pattern.finditer(content):
        section_id = match.group(1).split(',')[0].strip()
        section_ids.add(section_id)

    # Pattern for section headers (=, ==, ===, etc.)
    # Auto-generate IDs from section titles
    section_header_pattern = re.compile(r'^(=+)\s+(.+)$', re.MULTILINE)
    for match in section_header_pattern.finditer(content):
        header_text = match.group(2).strip()
        # Remove inline IDs like [[id]] or [#id] from the header text before auto-generating ID
        header_text = re.sub(r'\[\[[^\]]+\]\]', '', header_text)
        header_text = re.sub(r'\[#[^\]]+\]', '', header_text)
        # Remove inline formatting like *bold*, _italic_, etc.
        header_text = re.sub(r'\*\*?([^*]+)\*\*?', r'\1', header_text)
        header_text = re.sub(r'__?([^_]+)__?', r'\1', header_text)
        header_text = re.sub(r'`([^`]+)`', r'\1', header_text)
        # Remove links
        header_text = re.sub(r'https?://[^\s\[]+', '', header_text)
        header_text = re.sub(r'link:[^\[]+\[[^\]]*\]', '', header_text)

        auto_id = normalize_id(header_text)
        if auto_id:
            section_ids.add(auto_id)

    return section_ids


def extract_xrefs(content: str, file_path: str) -> List[XRefInfo]:
    """
    Extract all cross-references from file content.
    Supports:
    - xref:id[...] syntax
    - <<id>> syntax
    - <<id,text>> syntax
    """
    xrefs = []
    lines = content.split('\n')

    # Pattern for xref:id[...] syntax
    xref_pattern = re.compile(r'xref:([a-zA-Z0-9_-]+)(?:\[[^\]]*\])?')

    # Pattern for <<id>> or <<id,text>> syntax
    angle_bracket_pattern = re.compile(r'<<([a-zA-Z0-9_-]+)(?:,[^>]*)?>>')

    for line_num, line in enumerate(lines, 1):
        # Find xref: references
        for match in xref_pattern.finditer(line):
            xref_id = match.group(1)
            xrefs.append(XRefInfo(
                file_path=file_path,
                line_number=line_num,
                xref_id=xref_id,
                xref_type='xref'
            ))

        # Find <<>> references
        for match in angle_bracket_pattern.finditer(line):
            xref_id = match.group(1)
            xrefs.append(XRefInfo(
                file_path=file_path,
                line_number=line_num,
                xref_id=xref_id,
                xref_type='angle_bracket'
            ))

    return xrefs


def analyze_file(file_path: Path) -> FileAnalysis:
    """
    Analyze a single .adoc file for section IDs and cross-references.
    """
    errors = []

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        lines = content.split('\n')

        section_ids = extract_section_ids(content, lines)
        xrefs = extract_xrefs(content, str(file_path))

        return FileAnalysis(
            file_path=str(file_path),
            section_ids=section_ids,
            xrefs=xrefs,
            errors=errors
        )

    except Exception as e:
        errors.append(f"Error reading {file_path}: {str(e)}")
        return FileAnalysis(
            file_path=str(file_path),
            section_ids=set(),
            xrefs=[],
            errors=errors
        )


def find_adoc_files(directory: str) -> List[Path]:
    """Find all .adoc files in the directory recursively."""
    path = Path(directory)
    return list(path.rglob('*.adoc'))


def main():
    """Main function to orchestrate the cross-reference checking."""

    # Configuration
    directory = 'latest/ug/'

    if not os.path.exists(directory):
        print(f"Error: Directory '{directory}' not found")
        sys.exit(1)

    print(f"Analyzing .adoc files in {directory}...")

    # Find all .adoc files
    adoc_files = find_adoc_files(directory)
    print(f"Found {len(adoc_files)} .adoc files")

    # Analyze files in parallel
    all_section_ids = defaultdict(set)  # id -> set of files that define it
    all_xrefs = []
    file_errors = []

    print("\nAnalyzing files in parallel...")

    with ProcessPoolExecutor() as executor:
        # Submit all files for analysis
        future_to_file = {
            executor.submit(analyze_file, file_path): file_path
            for file_path in adoc_files
        }

        # Collect results as they complete
        completed = 0
        for future in as_completed(future_to_file):
            completed += 1
            if completed % 50 == 0:
                print(f"  Processed {completed}/{len(adoc_files)} files...")

            try:
                result = future.result()

                # Collect section IDs
                for section_id in result.section_ids:
                    all_section_ids[section_id].add(result.file_path)

                # Collect xrefs
                all_xrefs.extend(result.xrefs)

                # Collect errors
                if result.errors:
                    file_errors.extend(result.errors)

            except Exception as e:
                file_path = future_to_file[future]
                file_errors.append(f"Error processing {file_path}: {str(e)}")

    print(f"  Processed {len(adoc_files)}/{len(adoc_files)} files")

    # Report file processing errors
    if file_errors:
        print("\n" + "="*80)
        print("FILE PROCESSING ERRORS")
        print("="*80)
        for error in file_errors:
            print(f"  {error}")

    # Check for broken xrefs
    print("\n" + "="*80)
    print("CHECKING CROSS-REFERENCES")
    print("="*80)
    print(f"Total section IDs found: {len(all_section_ids)}")
    print(f"Total xrefs found: {len(all_xrefs)}")

    broken_xrefs = []
    for xref in all_xrefs:
        if xref.xref_id not in all_section_ids:
            broken_xrefs.append(xref)

    # Report results
    print("\n" + "="*80)
    print("RESULTS")
    print("="*80)

    if not broken_xrefs:
        print("✓ No broken cross-references found!")
    else:
        print(f"✗ Found {len(broken_xrefs)} broken cross-references:\n")

        # Group by file for better readability
        broken_by_file = defaultdict(list)
        for xref in broken_xrefs:
            broken_by_file[xref.file_path].append(xref)

        for file_path in sorted(broken_by_file.keys()):
            print(f"\n{file_path}:")
            for xref in sorted(broken_by_file[file_path], key=lambda x: x.line_number):
                xref_syntax = f"xref:{xref.xref_id}[...]" if xref.xref_type == 'xref' else f"<<{xref.xref_id}>>"
                print(f"  Line {xref.line_number}: {xref_syntax}")

    # Summary statistics
    print("\n" + "="*80)
    print("SUMMARY")
    print("="*80)
    print(f"Files analyzed: {len(adoc_files)}")
    print(f"Section IDs found: {len(all_section_ids)}")
    print(f"Cross-references found: {len(all_xrefs)}")
    print(f"Broken cross-references: {len(broken_xrefs)}")

    # Check for duplicate section IDs
    duplicates = {id: files for id, files in all_section_ids.items() if len(files) > 1}
    if duplicates:
        print(f"\n⚠ Warning: Found {len(duplicates)} duplicate section IDs:")
        for section_id, files in sorted(duplicates.items()):
            print(f"\n  ID '{section_id}' defined in {len(files)} files:")
            for file_path in sorted(files):
                print(f"    - {file_path}")

    # Exit with error code if broken xrefs found
    sys.exit(1 if broken_xrefs else 0)


if __name__ == '__main__':
    main()
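
To try the checker locally, a minimal invocation (a sketch; it assumes the script is run from the repository root, since the latest/ug/ path in main() is relative):

python3 check_xrefs.py

The script exits with status 1 when any broken cross-reference is found and 0 otherwise, so it can be wired into a docs CI check; the target directory is hardcoded in main() and can be edited there if the docs live elsewhere.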
