diff --git a/package-lock.json b/package-lock.json index 5ad99931e5..a510ab32f0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14187,6 +14187,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-has-property/-/hast-util-has-property-3.0.0.tgz", "integrity": "sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14199,6 +14200,7 @@ "version": "3.0.1", "resolved": "https://registry.npmjs.org/hast-util-is-body-ok-link/-/hast-util-is-body-ok-link-3.0.1.tgz", "integrity": "sha512-0qpnzOBLztXHbHQenVB8uNuxTnm/QBFUOmdOSsEn7GnBtyY07+ENTWVFBAnXd/zEgd9/SUG3lRY7hSIBWRgGpQ==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14211,6 +14213,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14223,6 +14226,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/hast-util-minify-whitespace/-/hast-util-minify-whitespace-1.0.1.tgz", "integrity": "sha512-L96fPOVpnclQE0xzdWb/D12VT5FabA7SnZOUMtL1DbXmYiHJMXZvFkIZfiMmTCNJHUeO2K9UYNXoVyfz+QHuOw==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0", "hast-util-embedded": "^3.0.0", diff --git a/package.json b/package.json index a4aa7ff682..64880109f7 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "lint:code": "eslint .", "lint:code:fix": "eslint . 
/**
 * Post-build step that re-indents `build/llms.txt` so that the hierarchy of the
 * documentation links is visible from the indentation alone.
 *
 * It is wired into the `postbuild` npm script in package.json, where it runs right
 * after the join step:
 *
 *     node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs
 */
import fs from 'node:fs/promises';
import path from 'node:path';

const BUILD_DIR = path.resolve('build');
const LLMS_FILE = path.join(BUILD_DIR, 'llms.txt');

/** Number of spaces that make up one indentation step. */
const INDENT_LEVEL = 2;

/** Deepest indentation (in spaces) a link can receive, however deep its URL is. */
const MAX_LINK_INDENT = INDENT_LEVEL * 4;

/** URL paths treated as top-level pages even though they are not at the root. */
const MAIN_SECTIONS = ['/api.md', '/api/v2.md'];

// Trailing slashes are stripped so that `${BASE_URL}/` never contains a double slash;
// otherwise a value such as "https://docs.apify.com/" would match no link at all.
const BASE_URL = (process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com').replace(/\/+$/, '');

/** Marker that identifies a markdown link pointing into the documentation site. */
const DOC_LINK_PREFIX = `](${BASE_URL}/`;

// Captures the URL of a markdown link: "- [Text](https://docs.apify.com/api/v2.md)"
// → "https://docs.apify.com/api/v2.md". Compiled once instead of once per line.
const DOC_LINK_REGEX = new RegExp(`\\]\\((${escapeRegExp(BASE_URL)}/[^)]+)\\)`);

/**
 * Escapes every character that has a special meaning in a regular expression.
 *
 * @param {string} text - Literal text to embed in a pattern.
 * @returns {string} The text with regex metacharacters backslash-escaped.
 */
function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Extracts the path component of an absolute URL (no origin, query string or hash).
 *
 * @param {string} url - Absolute URL.
 * @returns {string} The URL's pathname, e.g. "/api/v2.md".
 * @throws {TypeError} When `url` is not a valid absolute URL.
 */
function extractPathFromUrl(url) {
    return new URL(url).pathname;
}

/**
 * Splits the path of a URL into its non-empty segments.
 *
 * @param {string} url - Absolute URL.
 * @returns {string[]} Path segments, e.g. ["api", "v2.md"].
 */
function getPathSegments(url) {
    return extractPathFromUrl(url).split('/').filter(Boolean);
}

/**
 * Calculates the hierarchical depth of a URL path.
 * Every segment that does not end with ".md" counts as one directory level.
 *
 * @param {string} url - Absolute URL.
 * @returns {number} Number of path segments not ending in ".md".
 */
function getUrlHierarchyDepth(url) {
    return getPathSegments(url).filter((segment) => !segment.endsWith('.md')).length;
}

/**
 * Determines if a URL is a main section page (level 0).
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True when the path has exactly one segment or is listed in MAIN_SECTIONS.
 */
function isMainSectionPage(url) {
    // A single segment means the page sits directly under the site root.
    if (getPathSegments(url).length === 1) {
        return true;
    }

    // Special cases for the main API pages, which are nested but still top-level.
    return MAIN_SECTIONS.includes(extractPathFromUrl(url));
}

/**
 * Determines the indentation for a documentation link based on its URL hierarchy.
 *
 * @param {string} url - Absolute URL of the link target.
 * @returns {number} Number of spaces: 0 for main section pages, otherwise one
 *   INDENT_LEVEL per directory level, capped at MAX_LINK_INDENT.
 */
function getLinkIndentation(url) {
    if (isMainSectionPage(url)) {
        return 0;
    }

    return Math.min(getUrlHierarchyDepth(url) * INDENT_LEVEL, MAX_LINK_INDENT);
}

/**
 * Determines the indentation for a line based on its content type and URL.
 *
 * @param {string} line - The line with surrounding whitespace already trimmed.
 * @param {number} lineIndex - Position of the line within the document.
 * @param {string[]} allLines - Lines whose leading whitespace is inherited by plain
 *   content; only the entry at `lineIndex - 1` is read.
 * @returns {number} Number of spaces to put in front of the line.
 */
function getIndentationLevel(line, lineIndex, allLines) {
    if (line.startsWith('# ') || line.startsWith('## ')) {
        return 0;
    }

    if (line.startsWith('### ')) {
        return INDENT_LEVEL;
    }

    if (line.startsWith('#### ')) {
        return INDENT_LEVEL * 2;
    }

    // Markdown list items that link into the documentation site.
    if (line.startsWith('- [') && line.includes(DOC_LINK_PREFIX)) {
        const urlMatch = line.match(DOC_LINK_REGEX);
        // The prefix is present but no complete link target could be extracted.
        return urlMatch ? getLinkIndentation(urlMatch[1]) : INDENT_LEVEL;
    }

    // Any other content on the very first line has nothing to inherit from.
    if (lineIndex <= 0) {
        return INDENT_LEVEL;
    }

    // Any other content keeps the indentation of the line before it.
    const previousLine = allLines[lineIndex - 1] ?? '';
    return previousLine.match(/^\s*/)[0].length;
}

/**
 * Applies hierarchical indentation to content based on URL structure and content type.
 * Existing leading/trailing whitespace of every line is discarded, so the result does
 * not depend on how the input was indented and re-running on the output is a no-op.
 *
 * @param {string} content - Full text, lines separated by "\n".
 * @returns {string} The re-indented text; blank lines become empty lines.
 */
function indentContent(content) {
    const indentedLines = [];

    for (const [index, rawLine] of content.split('\n').entries()) {
        const trimmedLine = rawLine.trim();

        if (!trimmedLine) {
            // Preserve empty lines (without indentation).
            indentedLines.push('');
            continue;
        }

        // Pass the lines produced so far: plain content must inherit the indentation
        // that was just computed for the previous line, not the whitespace the raw
        // input happened to carry.
        const indent = getIndentationLevel(trimmedLine, index, indentedLines);
        indentedLines.push(' '.repeat(indent) + trimmedLine);
    }

    return indentedLines.join('\n');
}

/**
 * Main function to indent the LLMs file.
 * Reads the file, applies indentation, and writes it back in place.
 * A missing file is logged and skipped; any other failure is logged and terminates
 * the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function indentLlmsFile() {
    try {
        // readFile itself rejects with ENOENT, so no separate existence check is needed.
        const content = await fs.readFile(LLMS_FILE, 'utf8');
        await fs.writeFile(LLMS_FILE, indentContent(content), 'utf8');
        console.log('Successfully indented llms.txt file');
    } catch (error) {
        if (error?.code === 'ENOENT') {
            console.log('llms.txt file not found, skipping indentation');
        } else {
            console.error('Error indenting llms.txt file:', error);
            process.exit(1);
        }
    }
}

await indentLlmsFile();