From a9839c239ba57e5b25936f0cc6fb205a01d95fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Biro=C5=A1?= Date: Thu, 11 Sep 2025 11:25:32 +0200 Subject: [PATCH 1/4] feat: Indent llms.txt file --- package-lock.json | 4 ++ package.json | 2 +- scripts/indentLlmsFile.mjs | 124 +++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 scripts/indentLlmsFile.mjs diff --git a/package-lock.json b/package-lock.json index 5ad99931e5..a510ab32f0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -14187,6 +14187,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-has-property/-/hast-util-has-property-3.0.0.tgz", "integrity": "sha512-MNilsvEKLFpV604hwfhVStK0usFY/QmM5zX16bo7EjnAEGofr5YyI37kzopBlZJkHD4t887i+q/C8/tr5Q94cA==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14199,6 +14200,7 @@ "version": "3.0.1", "resolved": "https://registry.npmjs.org/hast-util-is-body-ok-link/-/hast-util-is-body-ok-link-3.0.1.tgz", "integrity": "sha512-0qpnzOBLztXHbHQenVB8uNuxTnm/QBFUOmdOSsEn7GnBtyY07+ENTWVFBAnXd/zEgd9/SUG3lRY7hSIBWRgGpQ==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14211,6 +14213,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0" }, @@ -14223,6 +14226,7 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/hast-util-minify-whitespace/-/hast-util-minify-whitespace-1.0.1.tgz", "integrity": "sha512-L96fPOVpnclQE0xzdWb/D12VT5FabA7SnZOUMtL1DbXmYiHJMXZvFkIZfiMmTCNJHUeO2K9UYNXoVyfz+QHuOw==", + "license": "MIT", "dependencies": { "@types/hast": "^3.0.0", "hast-util-embedded": "^3.0.0", diff --git a/package.json b/package.json index a4aa7ff682..64880109f7 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,7 @@ "lint:code": "eslint .", "lint:code:fix": "eslint . --fix", "postinstall": "patch-package", - "postbuild": "node ./scripts/joinLlmsFiles.mjs" + "postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs" }, "devDependencies": { "@apify/eslint-config": "^1.0.0", diff --git a/scripts/indentLlmsFile.mjs b/scripts/indentLlmsFile.mjs new file mode 100644 index 0000000000..caf85cbdcf --- /dev/null +++ b/scripts/indentLlmsFile.mjs @@ -0,0 +1,124 @@ +import fs from 'node:fs/promises'; +import path from 'node:path'; + +const BUILD_DIR = path.resolve('build'); +const LLMS_FILE = path.join(BUILD_DIR, 'llms.txt'); + +const INDENT_LEVEL = 2; + +// Paths that should be indented at the first level +const INDENTED_PATHS = ['/api/v2/', '/academy/', '/platform/', '/legal/']; + +// Main API pages that should have no indentation +const MAIN_API_PAGES = ['/api.md', '/api/v2.md']; + +/** + * Calculates the depth of a URL by counting non-file path segments. + */ +function getUrlDepth(url) { + const baseUrl = url.replace('https://docs.apify.com', ''); + const urlSegments = baseUrl.split('/').filter((segment) => segment && segment !== ''); + const nonFileSegments = urlSegments.filter((segment) => !segment.endsWith('.md')); + return nonFileSegments.length; +} + +/** + * Determines the indentation level for a documentation link based on its URL. + */ +function getLinkIndentation(url) { + if (MAIN_API_PAGES.some((page) => url.includes(page))) { + return 0; + } + + if (INDENTED_PATHS.some((item) => url.includes(item))) { + return INDENT_LEVEL; + } + + // Default based on URL depth + const depth = getUrlDepth(url); + return Math.min(depth * INDENT_LEVEL, INDENT_LEVEL * 3); +} + +/** + * Determines the indentation level for a line based on its content type. + */ +function getIndentationLevel(line, lineIndex, allLines) { + if (line.startsWith('# ') || line.startsWith('## ')) { + return 0; // Main title or section title - no indent + } + + if (line.startsWith('### ')) { + return INDENT_LEVEL; // Subsection title - 1 level indent + } + + if (line.startsWith('#### ')) { + return INDENT_LEVEL * 2; // Sub-subsection title - 2 level indent + } + + if (line.startsWith('- [') && line.includes('](https://docs.apify.com/')) { + const urlMatch = line.match(/\]\((https:\/\/docs\.apify\.com\/[^)]+)\)/); + if (!urlMatch) { + return INDENT_LEVEL; // Fallback if URL parsing fails + } + return getLinkIndentation(urlMatch[1]); + } + + if (lineIndex > 0) { + // Other content - use same indent as previous line + const prevIndentMatch = allLines[lineIndex - 1].match(/^(\s*)/); + return prevIndentMatch ? prevIndentMatch[1].length : INDENT_LEVEL; + } + + return INDENT_LEVEL; +} + +/** + * Applies hierarchical indentation to content based on URL structure and content type. + */ +function indentContent(content) { + const lines = content.split('\n'); + const indentedLines = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i]; + const trimmedLine = line.trim(); + + // Skip empty lines + if (!trimmedLine) { + indentedLines.push(''); + continue; + } + + const indent = getIndentationLevel(trimmedLine, i, lines); + const indentStr = ' '.repeat(indent); + indentedLines.push(indentStr + trimmedLine); + } + + return indentedLines.join('\n'); +} + +/** + * Main function to indent the LLMs file. + * Reads the file, applies indentation, and writes it back. + */ +async function indentLlmsFile() { + try { + await fs.access(LLMS_FILE); + const content = await fs.readFile(LLMS_FILE, 'utf8'); + const indentedContent = indentContent(content); + await fs.writeFile(LLMS_FILE, indentedContent, 'utf8'); + console.log('Successfully indented llms.txt file'); + } catch (error) { + if (error.code === 'ENOENT') { + console.log('llms.txt file not found, skipping indentation'); + } else { + console.error('Error indenting llms.txt file:', error); + process.exit(1); + } + } +} + +indentLlmsFile().catch((err) => { + console.error('Failed to indent LLMs files:', err); + process.exit(1); +}); From 4681c2dd3cf6754909cab25aae6222460939d584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Biro=C5=A1?= Date: Thu, 11 Sep 2025 13:45:24 +0200 Subject: [PATCH 2/4] fix: Pick the base url from env --- scripts/indentLlmsFile.mjs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/indentLlmsFile.mjs b/scripts/indentLlmsFile.mjs index caf85cbdcf..685bc61e05 100644 --- a/scripts/indentLlmsFile.mjs +++ b/scripts/indentLlmsFile.mjs @@ -12,11 +12,13 @@ const INDENTED_PATHS = ['/api/v2/', '/academy/', '/platform/', '/legal/']; // Main API pages that should have no indentation const MAIN_API_PAGES = ['/api.md', '/api/v2.md']; +const BASE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com'; +console.log('debug: BASE_URL', BASE_URL); /** * Calculates the depth of a URL by counting non-file path segments. */ function getUrlDepth(url) { - const baseUrl = url.replace('https://docs.apify.com', ''); + const baseUrl = url.replace(BASE_URL, ''); const urlSegments = baseUrl.split('/').filter((segment) => segment && segment !== ''); const nonFileSegments = urlSegments.filter((segment) => !segment.endsWith('.md')); return nonFileSegments.length; @@ -55,8 +57,8 @@ function getIndentationLevel(line, lineIndex, allLines) { return INDENT_LEVEL * 2; // Sub-subsection title - 2 level indent } - if (line.startsWith('- [') && line.includes('](https://docs.apify.com/')) { - const urlMatch = line.match(/\]\((https:\/\/docs\.apify\.com\/[^)]+)\)/); + if (line.startsWith('- [') && line.includes(`](${BASE_URL}/`)) { + const urlMatch = line.match(new RegExp(`\\]\\((${BASE_URL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/[^)]+)\\)`)); if (!urlMatch) { return INDENT_LEVEL; // Fallback if URL parsing fails } From 685cb23fb2be5c665c90997239c74255e6078f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Biro=C5=A1?= Date: Tue, 16 Sep 2025 09:59:20 +0200 Subject: [PATCH 3/4] feat: improve the indentation --- scripts/indentLlmsFile.mjs | 84 +++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/scripts/indentLlmsFile.mjs b/scripts/indentLlmsFile.mjs index 685bc61e05..b3a7515d09 100644 --- a/scripts/indentLlmsFile.mjs +++ b/scripts/indentLlmsFile.mjs @@ -6,47 +6,81 @@ const LLMS_FILE = path.join(BUILD_DIR, 'llms.txt'); const INDENT_LEVEL = 2; -// Paths that should be indented at the first level -const INDENTED_PATHS = ['/api/v2/', '/academy/', '/platform/', '/legal/']; - -// Main API pages that should have no indentation -const MAIN_API_PAGES = ['/api.md', '/api/v2.md']; +const MAIN_SECTIONS = ['/api.md', '/api/v2.md']; const BASE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com'; -console.log('debug: BASE_URL', BASE_URL); + +/** + * Extracts the path from a URL, removing the base URL and query parameters + */ +function extractPathFromUrl(url) { + const urlObj = new URL(url); + return urlObj.pathname; +} + /** - * Calculates the depth of a URL by counting non-file path segments. + * Calculates the hierarchical depth of a URL path. + * This counts directory levels, not including the filename. */ -function getUrlDepth(url) { - const baseUrl = url.replace(BASE_URL, ''); - const urlSegments = baseUrl.split('/').filter((segment) => segment && segment !== ''); - const nonFileSegments = urlSegments.filter((segment) => !segment.endsWith('.md')); +function getUrlHierarchyDepth(url) { + const urlPath = extractPathFromUrl(url); + const segments = urlPath.split('/').filter((segment) => segment && segment !== ''); + + // Remove the .md file extension to count only directory levels + const nonFileSegments = segments.filter((segment) => !segment.endsWith('.md')); + return nonFileSegments.length; } /** - * Determines the indentation level for a documentation link based on its URL. + * Determines if a URL is a main section page (level 0) + */ +function isMainSectionPage(url) { + const urlPath = extractPathFromUrl(url); + const segments = urlPath.split('/').filter((segment) => segment && segment !== ''); + + // Main pages are those with only one segment (the .md file) + // or specific known main pages + if (segments.length === 1) { + return true; + } + + // Special cases for main API pages + if (MAIN_SECTIONS.includes(urlPath)) { + return true; + } + + return false; +} + +/** + * Determines the indentation level for a documentation link based on its URL hierarchy. */ function getLinkIndentation(url) { - if (MAIN_API_PAGES.some((page) => url.includes(page))) { + // Main section pages get no indentation + if (isMainSectionPage(url)) { return 0; } - if (INDENTED_PATHS.some((item) => url.includes(item))) { - return INDENT_LEVEL; - } + // Calculate hierarchy depth + const depth = getUrlHierarchyDepth(url); - // Default based on URL depth - const depth = getUrlDepth(url); - return Math.min(depth * INDENT_LEVEL, INDENT_LEVEL * 3); + // The first level after main sections gets 1 level of indentation + // Each subsequent level gets another level of indentation + return Math.min(depth * INDENT_LEVEL, INDENT_LEVEL * 4); // Cap at 4 levels } /** - * Determines the indentation level for a line based on its content type. + * Determines the indentation level for a line based on its content type and URL. */ function getIndentationLevel(line, lineIndex, allLines) { - if (line.startsWith('# ') || line.startsWith('## ')) { - return 0; // Main title or section title - no indent + // Handle markdown headers + if (line.startsWith('# ')) { + return 0; // Main title - no indent + } + + if (line.startsWith('## ')) { + return 0; // Section title - no indent } if (line.startsWith('### ')) { @@ -57,6 +91,7 @@ function getIndentationLevel(line, lineIndex, allLines) { return INDENT_LEVEL * 2; // Sub-subsection title - 2 level indent } + // Handle markdown links with URLs if (line.startsWith('- [') && line.includes(`](${BASE_URL}/`)) { const urlMatch = line.match(new RegExp(`\\]\\((${BASE_URL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/[^)]+)\\)`)); if (!urlMatch) { @@ -65,9 +100,10 @@ function getIndentationLevel(line, lineIndex, allLines) { return getLinkIndentation(urlMatch[1]); } + // For other content, use the same indentation as the previous line if (lineIndex > 0) { - // Other content - use same indent as previous line - const prevIndentMatch = allLines[lineIndex - 1].match(/^(\s*)/); + const prevLine = allLines[lineIndex - 1]; + const prevIndentMatch = prevLine.match(/^(\s*)/); return prevIndentMatch ? prevIndentMatch[1].length : INDENT_LEVEL; } From 9dd82e86740199b23e9d74c16070139cedd9ff62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20Biro=C5=A1?= Date: Thu, 18 Sep 2025 10:45:39 +0200 Subject: [PATCH 4/4] fix: code cleanup --- scripts/indentLlmsFile.mjs | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/scripts/indentLlmsFile.mjs b/scripts/indentLlmsFile.mjs index b3a7515d09..6a527b1b86 100644 --- a/scripts/indentLlmsFile.mjs +++ b/scripts/indentLlmsFile.mjs @@ -25,8 +25,6 @@ function extractPathFromUrl(url) { function getUrlHierarchyDepth(url) { const urlPath = extractPathFromUrl(url); const segments = urlPath.split('/').filter((segment) => segment && segment !== ''); - - // Remove the .md file extension to count only directory levels const nonFileSegments = segments.filter((segment) => !segment.endsWith('.md')); return nonFileSegments.length; @@ -40,7 +38,6 @@ function isMainSectionPage(url) { const segments = urlPath.split('/').filter((segment) => segment && segment !== ''); // Main pages are those with only one segment (the .md file) - // or specific known main pages if (segments.length === 1) { return true; } @@ -62,40 +59,36 @@ function getLinkIndentation(url) { return 0; } - // Calculate hierarchy depth const depth = getUrlHierarchyDepth(url); // The first level after main sections gets 1 level of indentation // Each subsequent level gets another level of indentation - return Math.min(depth * INDENT_LEVEL, INDENT_LEVEL * 4); // Cap at 4 levels + return Math.min(depth * INDENT_LEVEL, INDENT_LEVEL * 4); } /** * Determines the indentation level for a line based on its content type and URL. */ function getIndentationLevel(line, lineIndex, allLines) { - // Handle markdown headers - if (line.startsWith('# ')) { - return 0; // Main title - no indent - } - - if (line.startsWith('## ')) { - return 0; // Section title - no indent + if (line.startsWith('# ') || line.startsWith('## ')) { + return 0; } if (line.startsWith('### ')) { - return INDENT_LEVEL; // Subsection title - 1 level indent + return INDENT_LEVEL; } if (line.startsWith('#### ')) { - return INDENT_LEVEL * 2; // Sub-subsection title - 2 level indent + return INDENT_LEVEL * 2; } // Handle markdown links with URLs if (line.startsWith('- [') && line.includes(`](${BASE_URL}/`)) { + // Extract URL from markdown link format: - [Link Text](https://docs.apify.com/path/to/page) + // Example: "- [API Reference](https://docs.apify.com/api/v2)" → extracts "https://docs.apify.com/api/v2" const urlMatch = line.match(new RegExp(`\\]\\((${BASE_URL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/[^)]+)\\)`)); if (!urlMatch) { - return INDENT_LEVEL; // Fallback if URL parsing fails + return INDENT_LEVEL; } return getLinkIndentation(urlMatch[1]); } @@ -121,7 +114,7 @@ function indentContent(content) { const line = lines[i]; const trimmedLine = line.trim(); - // Skip empty lines + // Preserve empty lines (add them without indentation) if (!trimmedLine) { indentedLines.push(''); continue; @@ -156,7 +149,4 @@ async function indentLlmsFile() { } } -indentLlmsFile().catch((err) => { - console.error('Failed to indent LLMs files:', err); - process.exit(1); -}); +await indentLlmsFile();