|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +AsciiDoc Cross-Reference Checker |
| 4 | +
|
| 5 | +This script analyzes all .adoc files in a directory to find broken cross-references. |
| 6 | +It supports both xref: and <<>> syntax and checks against explicit and auto-generated section IDs. |
| 7 | +""" |
| 8 | + |
import os
import re
import sys
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set, Tuple
| 17 | + |
| 18 | + |
| 19 | +@dataclass |
| 20 | +class XRefInfo: |
| 21 | + """Information about a cross-reference""" |
| 22 | + file_path: str |
| 23 | + line_number: int |
| 24 | + xref_id: str |
| 25 | + xref_type: str # 'xref' or 'angle_bracket' |
| 26 | + |
| 27 | + |
| 28 | +@dataclass |
| 29 | +class FileAnalysis: |
| 30 | + """Analysis results for a single file""" |
| 31 | + file_path: str |
| 32 | + section_ids: Set[str] |
| 33 | + xrefs: List[XRefInfo] |
| 34 | + errors: List[str] |
| 35 | + |
| 36 | + |
| 37 | +def normalize_id(text: str) -> str: |
| 38 | + """ |
| 39 | + Normalize a section header to an auto-generated ID. |
| 40 | + Based on AsciiDoc rules with idseparator: - |
| 41 | + """ |
| 42 | + # Convert to lowercase |
| 43 | + text = text.lower() |
| 44 | + # Remove formatting and special chars, replace spaces with hyphens |
| 45 | + text = re.sub(r'[^\w\s-]', '', text) |
| 46 | + text = re.sub(r'\s+', '-', text) |
| 47 | + # Remove multiple consecutive hyphens |
| 48 | + text = re.sub(r'-+', '-', text) |
| 49 | + # Remove leading/trailing hyphens |
| 50 | + text = text.strip('-') |
| 51 | + return text |
| 52 | + |
| 53 | + |
| 54 | +def extract_section_ids(content: str, lines: List[str]) -> Set[str]: |
| 55 | + """ |
| 56 | + Extract all section IDs from file content. |
| 57 | + Supports: |
| 58 | + - [[id]] syntax (standalone or inline) |
| 59 | + - [#id] syntax (standalone or inline) |
| 60 | + - Auto-generated IDs from section headers |
| 61 | + """ |
| 62 | + section_ids = set() |
| 63 | + |
| 64 | + # Pattern for explicit [[id]] or [[id,title]] syntax (standalone or inline) |
| 65 | + # This pattern works for both "[[id]]" on its own line and "=== Title [[id]]" inline |
| 66 | + explicit_bracket_pattern = re.compile(r'\[\[([^\]]+)\]\]') |
| 67 | + for match in explicit_bracket_pattern.finditer(content): |
| 68 | + # Handle [[id,title]] syntax - ID is the part before the comma |
| 69 | + id_text = match.group(1) |
| 70 | + section_id = id_text.split(',')[0].strip() |
| 71 | + section_ids.add(section_id) |
| 72 | + |
| 73 | + # Pattern for [#id] syntax (standalone or inline) |
| 74 | + explicit_hash_pattern = re.compile(r'\[#([^\]]+)\]') |
| 75 | + for match in explicit_hash_pattern.finditer(content): |
| 76 | + section_id = match.group(1).split(',')[0].strip() |
| 77 | + section_ids.add(section_id) |
| 78 | + |
| 79 | + # Pattern for section headers (=, ==, ===, etc.) |
| 80 | + # Auto-generate IDs from section titles |
| 81 | + section_header_pattern = re.compile(r'^(=+)\s+(.+)$', re.MULTILINE) |
| 82 | + for match in section_header_pattern.finditer(content): |
| 83 | + header_text = match.group(2).strip() |
| 84 | + # Remove inline IDs like [[id]] or [#id] from the header text before auto-generating ID |
| 85 | + header_text = re.sub(r'\[\[[^\]]+\]\]', '', header_text) |
| 86 | + header_text = re.sub(r'\[#[^\]]+\]', '', header_text) |
| 87 | + # Remove inline formatting like *bold*, _italic_, etc. |
| 88 | + header_text = re.sub(r'\*\*?([^*]+)\*\*?', r'\1', header_text) |
| 89 | + header_text = re.sub(r'__?([^_]+)__?', r'\1', header_text) |
| 90 | + header_text = re.sub(r'`([^`]+)`', r'\1', header_text) |
| 91 | + # Remove links |
| 92 | + header_text = re.sub(r'https?://[^\s\[]+', '', header_text) |
| 93 | + header_text = re.sub(r'link:[^\[]+\[[^\]]*\]', '', header_text) |
| 94 | + |
| 95 | + auto_id = normalize_id(header_text) |
| 96 | + if auto_id: |
| 97 | + section_ids.add(auto_id) |
| 98 | + |
| 99 | + return section_ids |
| 100 | + |
| 101 | + |
| 102 | +def extract_xrefs(content: str, file_path: str) -> List[XRefInfo]: |
| 103 | + """ |
| 104 | + Extract all cross-references from file content. |
| 105 | + Supports: |
| 106 | + - xref:id[...] syntax |
| 107 | + - <<id>> syntax |
| 108 | + - <<id,text>> syntax |
| 109 | + """ |
| 110 | + xrefs = [] |
| 111 | + lines = content.split('\n') |
| 112 | + |
| 113 | + # Pattern for xref:id[...] syntax |
| 114 | + xref_pattern = re.compile(r'xref:([a-zA-Z0-9_-]+)(?:\[[^\]]*\])?') |
| 115 | + |
| 116 | + # Pattern for <<id>> or <<id,text>> syntax |
| 117 | + angle_bracket_pattern = re.compile(r'<<([a-zA-Z0-9_-]+)(?:,[^>]*)?>>') |
| 118 | + |
| 119 | + for line_num, line in enumerate(lines, 1): |
| 120 | + # Find xref: references |
| 121 | + for match in xref_pattern.finditer(line): |
| 122 | + xref_id = match.group(1) |
| 123 | + xrefs.append(XRefInfo( |
| 124 | + file_path=file_path, |
| 125 | + line_number=line_num, |
| 126 | + xref_id=xref_id, |
| 127 | + xref_type='xref' |
| 128 | + )) |
| 129 | + |
| 130 | + # Find <<>> references |
| 131 | + for match in angle_bracket_pattern.finditer(line): |
| 132 | + xref_id = match.group(1) |
| 133 | + xrefs.append(XRefInfo( |
| 134 | + file_path=file_path, |
| 135 | + line_number=line_num, |
| 136 | + xref_id=xref_id, |
| 137 | + xref_type='angle_bracket' |
| 138 | + )) |
| 139 | + |
| 140 | + return xrefs |
| 141 | + |
| 142 | + |
| 143 | +def analyze_file(file_path: Path) -> FileAnalysis: |
| 144 | + """ |
| 145 | + Analyze a single .adoc file for section IDs and cross-references. |
| 146 | + """ |
| 147 | + errors = [] |
| 148 | + |
| 149 | + try: |
| 150 | + with open(file_path, 'r', encoding='utf-8') as f: |
| 151 | + content = f.read() |
| 152 | + lines = content.split('\n') |
| 153 | + |
| 154 | + section_ids = extract_section_ids(content, lines) |
| 155 | + xrefs = extract_xrefs(content, str(file_path)) |
| 156 | + |
| 157 | + return FileAnalysis( |
| 158 | + file_path=str(file_path), |
| 159 | + section_ids=section_ids, |
| 160 | + xrefs=xrefs, |
| 161 | + errors=errors |
| 162 | + ) |
| 163 | + |
| 164 | + except Exception as e: |
| 165 | + errors.append(f"Error reading {file_path}: {str(e)}") |
| 166 | + return FileAnalysis( |
| 167 | + file_path=str(file_path), |
| 168 | + section_ids=set(), |
| 169 | + xrefs=[], |
| 170 | + errors=errors |
| 171 | + ) |
| 172 | + |
| 173 | + |
| 174 | +def find_adoc_files(directory: str) -> List[Path]: |
| 175 | + """Find all .adoc files in the directory recursively.""" |
| 176 | + path = Path(directory) |
| 177 | + return list(path.rglob('*.adoc')) |
| 178 | + |
| 179 | + |
| 180 | +def main(): |
| 181 | + """Main function to orchestrate the cross-reference checking.""" |
| 182 | + |
| 183 | + # Configuration |
| 184 | + directory = 'latest/ug/' |
| 185 | + |
| 186 | + if not os.path.exists(directory): |
| 187 | + print(f"Error: Directory '{directory}' not found") |
| 188 | + sys.exit(1) |
| 189 | + |
| 190 | + print(f"Analyzing .adoc files in {directory}...") |
| 191 | + |
| 192 | + # Find all .adoc files |
| 193 | + adoc_files = find_adoc_files(directory) |
| 194 | + print(f"Found {len(adoc_files)} .adoc files") |
| 195 | + |
| 196 | + # Analyze files in parallel |
| 197 | + all_section_ids = defaultdict(set) # id -> set of files that define it |
| 198 | + all_xrefs = [] |
| 199 | + file_errors = [] |
| 200 | + |
| 201 | + print("\nAnalyzing files in parallel...") |
| 202 | + |
| 203 | + with ProcessPoolExecutor() as executor: |
| 204 | + # Submit all files for analysis |
| 205 | + future_to_file = { |
| 206 | + executor.submit(analyze_file, file_path): file_path |
| 207 | + for file_path in adoc_files |
| 208 | + } |
| 209 | + |
| 210 | + # Collect results as they complete |
| 211 | + completed = 0 |
| 212 | + for future in as_completed(future_to_file): |
| 213 | + completed += 1 |
| 214 | + if completed % 50 == 0: |
| 215 | + print(f" Processed {completed}/{len(adoc_files)} files...") |
| 216 | + |
| 217 | + try: |
| 218 | + result = future.result() |
| 219 | + |
| 220 | + # Collect section IDs |
| 221 | + for section_id in result.section_ids: |
| 222 | + all_section_ids[section_id].add(result.file_path) |
| 223 | + |
| 224 | + # Collect xrefs |
| 225 | + all_xrefs.extend(result.xrefs) |
| 226 | + |
| 227 | + # Collect errors |
| 228 | + if result.errors: |
| 229 | + file_errors.extend(result.errors) |
| 230 | + |
| 231 | + except Exception as e: |
| 232 | + file_path = future_to_file[future] |
| 233 | + file_errors.append(f"Error processing {file_path}: {str(e)}") |
| 234 | + |
| 235 | + print(f" Processed {len(adoc_files)}/{len(adoc_files)} files") |
| 236 | + |
| 237 | + # Report file processing errors |
| 238 | + if file_errors: |
| 239 | + print("\n" + "="*80) |
| 240 | + print("FILE PROCESSING ERRORS") |
| 241 | + print("="*80) |
| 242 | + for error in file_errors: |
| 243 | + print(f" {error}") |
| 244 | + |
| 245 | + # Check for broken xrefs |
| 246 | + print("\n" + "="*80) |
| 247 | + print("CHECKING CROSS-REFERENCES") |
| 248 | + print("="*80) |
| 249 | + print(f"Total section IDs found: {len(all_section_ids)}") |
| 250 | + print(f"Total xrefs found: {len(all_xrefs)}") |
| 251 | + |
| 252 | + broken_xrefs = [] |
| 253 | + for xref in all_xrefs: |
| 254 | + if xref.xref_id not in all_section_ids: |
| 255 | + broken_xrefs.append(xref) |
| 256 | + |
| 257 | + # Report results |
| 258 | + print("\n" + "="*80) |
| 259 | + print("RESULTS") |
| 260 | + print("="*80) |
| 261 | + |
| 262 | + if not broken_xrefs: |
| 263 | + print("✓ No broken cross-references found!") |
| 264 | + else: |
| 265 | + print(f"✗ Found {len(broken_xrefs)} broken cross-references:\n") |
| 266 | + |
| 267 | + # Group by file for better readability |
| 268 | + broken_by_file = defaultdict(list) |
| 269 | + for xref in broken_xrefs: |
| 270 | + broken_by_file[xref.file_path].append(xref) |
| 271 | + |
| 272 | + for file_path in sorted(broken_by_file.keys()): |
| 273 | + print(f"\n{file_path}:") |
| 274 | + for xref in sorted(broken_by_file[file_path], key=lambda x: x.line_number): |
| 275 | + xref_syntax = f"xref:{xref.xref_id}[...]" if xref.xref_type == 'xref' else f"<<{xref.xref_id}>>" |
| 276 | + print(f" Line {xref.line_number}: {xref_syntax}") |
| 277 | + |
| 278 | + # Summary statistics |
| 279 | + print("\n" + "="*80) |
| 280 | + print("SUMMARY") |
| 281 | + print("="*80) |
| 282 | + print(f"Files analyzed: {len(adoc_files)}") |
| 283 | + print(f"Section IDs found: {len(all_section_ids)}") |
| 284 | + print(f"Cross-references found: {len(all_xrefs)}") |
| 285 | + print(f"Broken cross-references: {len(broken_xrefs)}") |
| 286 | + |
| 287 | + # Check for duplicate section IDs |
| 288 | + duplicates = {id: files for id, files in all_section_ids.items() if len(files) > 1} |
| 289 | + if duplicates: |
| 290 | + print(f"\n⚠ Warning: Found {len(duplicates)} duplicate section IDs:") |
| 291 | + for section_id, files in sorted(duplicates.items()): |
| 292 | + print(f"\n ID '{section_id}' defined in {len(files)} files:") |
| 293 | + for file_path in sorted(files): |
| 294 | + print(f" - {file_path}") |
| 295 | + |
| 296 | + # Exit with error code if broken xrefs found |
| 297 | + sys.exit(1 if broken_xrefs else 0) |
| 298 | + |
| 299 | + |
| 300 | +if __name__ == '__main__': |
| 301 | + main() |
0 commit comments