-
Notifications
You must be signed in to change notification settings - Fork 130
feat: Improve llms.txt file indentation #1918
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
import fs from 'node:fs/promises'; | ||
import path from 'node:path'; | ||
|
||
/** Build output directory, resolved against the current working directory. */
const BUILD_DIR = path.resolve('build');

/** Path of the llms.txt file inside the build output directory. */
const LLMS_FILE = path.join(BUILD_DIR, 'llms.txt');

/** Number of spaces that make up one indentation step. */
const INDENT_LEVEL = 2;

/**
 * URL paths that are always treated as main section pages.
 * Frozen so the shared list cannot be mutated by accident.
 */
const MAIN_SECTIONS = Object.freeze(['/api.md', '/api/v2.md']);

/**
 * Absolute base URL that documentation links are expected to start with.
 * `||` (not `??`) is deliberate: an empty environment variable also falls back to the default.
 */
const BASE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com';
|
||
/**
 * Returns the path component of an absolute URL.
 *
 * The origin, query string and fragment are not part of the result.
 *
 * @param {string} url - Absolute URL, e.g. `https://docs.apify.com/api/v2.md?x=1`.
 * @returns {string} The URL's pathname, e.g. `/api/v2.md`.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
function extractPathFromUrl(url) {
    const { pathname } = new URL(url);
    return pathname;
}
|
||
/**
 * Calculates the hierarchical depth of a URL path.
 *
 * The depth is the number of non-empty path segments that do not end with `.md`,
 * i.e. directory levels are counted and Markdown file names are not.
 *
 * @param {string} url - Absolute URL of a documentation page.
 * @returns {number} Number of directory-like segments in the URL path.
 */
function getUrlHierarchyDepth(url) {
    return extractPathFromUrl(url)
        .split('/')
        // Empty segments come from the leading, trailing or doubled slashes.
        .filter((segment) => segment !== '' && !segment.endsWith('.md'))
        .length;
}
|
||
/**
 * Determines whether a URL points to a main section page (indentation level 0).
 *
 * A page is a main section page when its path is listed in `MAIN_SECTIONS`
 * or when its path consists of exactly one non-empty segment.
 *
 * @param {string} url - Absolute URL of a documentation page.
 * @returns {boolean} `true` for main section pages, `false` otherwise.
 */
function isMainSectionPage(url) {
    const urlPath = extractPathFromUrl(url);

    // Explicitly listed pages are main sections regardless of how deep their path is.
    if (MAIN_SECTIONS.includes(urlPath)) {
        return true;
    }

    // Otherwise only single-segment paths (e.g. `/api.md`) qualify.
    const segmentCount = urlPath.split('/').filter((segment) => segment !== '').length;
    return segmentCount === 1;
}
|
||
/**
 * Determines how many spaces a documentation link is indented by, based on its URL hierarchy.
 *
 * Main section pages are not indented. Every other page gets `INDENT_LEVEL` spaces
 * per directory level of its URL path, capped at four indentation steps.
 *
 * @param {string} url - Absolute URL of the linked documentation page.
 * @returns {number} Number of leading spaces for the link line.
 */
function getLinkIndentation(url) {
    if (isMainSectionPage(url)) {
        return 0;
    }

    // Upper bound on nesting so deeply nested pages do not drift ever further right.
    const MAX_INDENT_STEPS = 4;
    const steps = Math.min(getUrlHierarchyDepth(url), MAX_INDENT_STEPS);

    return steps * INDENT_LEVEL;
}
|
||
/**
 * Determines how many spaces a line is indented by, based on its content type and URL.
 *
 * - `# ` and `## ` headings: 0 spaces.
 * - `### ` headings: one indentation step; `#### ` headings: two steps.
 * - List items linking to a `BASE_URL` page: derived from the link's URL hierarchy.
 * - Anything else: the leading-whitespace width of the line before it in `allLines`,
 *   or one indentation step when there is no previous line.
 *
 * @param {string} line - The line to classify; the prefix checks expect it without leading whitespace.
 * @param {number} lineIndex - Index of `line` within `allLines`.
 * @param {string[]} allLines - Lines of the document; only the entry at `lineIndex - 1` is read.
 * @returns {number} Number of leading spaces for the line.
 */
function getIndentationLevel(line, lineIndex, allLines) {
    if (line.startsWith('# ') || line.startsWith('## ')) {
        return 0;
    }

    if (line.startsWith('### ')) {
        return INDENT_LEVEL;
    }

    if (line.startsWith('#### ')) {
        return INDENT_LEVEL * 2;
    }

    // Markdown list item linking into the docs: - [Link Text](https://docs.apify.com/path/to/page)
    if (line.startsWith('- [') && line.includes(`](${BASE_URL}/`)) {
        // Escape regex metacharacters so the base URL is matched literally.
        const escapedBaseUrl = BASE_URL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        // Captures everything between "](" and the next ")", e.g. "https://docs.apify.com/api/v2".
        const urlMatch = line.match(new RegExp(`\\]\\((${escapedBaseUrl}/[^)]+)\\)`));

        return urlMatch ? getLinkIndentation(urlMatch[1]) : INDENT_LEVEL;
    }

    // Other content follows the previous line. /^\s*/ always matches (possibly the
    // empty string), so no fallback is needed here.
    if (lineIndex > 0) {
        return allLines[lineIndex - 1].match(/^\s*/)[0].length;
    }

    return INDENT_LEVEL;
}
|
||
/**
 * Applies hierarchical indentation to content based on URL structure and content type.
 *
 * Every non-empty line is trimmed and re-indented with the number of spaces returned by
 * `getIndentationLevel`. Empty (or whitespace-only) lines are emitted as empty lines.
 *
 * @param {string} content - Full text to re-indent, with `\n` line separators.
 * @returns {string} The re-indented text, joined with `\n`.
 */
function indentContent(content) {
    const indentedLines = [];

    for (const [index, rawLine] of content.split('\n').entries()) {
        const trimmedLine = rawLine.trim();

        // Preserve empty lines (add them without indentation)
        if (!trimmedLine) {
            indentedLines.push('');
            continue;
        }

        // Pass the already-indented output rather than the raw input: lines that inherit
        // the previous line's indentation must see the indentation that was just computed,
        // not whatever the input happened to contain. One entry is pushed per input line,
        // so `indentedLines[index - 1]` always exists for `index > 0`.
        const indent = getIndentationLevel(trimmedLine, index, indentedLines);
        indentedLines.push(' '.repeat(indent) + trimmedLine);
    }

    return indentedLines.join('\n');
}
|
||
/**
 * Main function to indent the LLMs file.
 *
 * Reads `LLMS_FILE`, applies indentation, and writes the result back to the same path.
 * A missing file is not an error: a message is logged and nothing is written.
 * Any other failure is logged and terminates the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function indentLlmsFile() {
    try {
        // No separate existence check: readFile itself rejects with ENOENT when the
        // file is missing, which avoids a check-then-use race.
        const content = await fs.readFile(LLMS_FILE, 'utf8');
        await fs.writeFile(LLMS_FILE, indentContent(content), 'utf8');
        console.log('Successfully indented llms.txt file');
    } catch (error) {
        if (error.code === 'ENOENT') {
            console.log('llms.txt file not found, skipping indentation');
            return;
        }

        console.error('Error indenting llms.txt file:', error);
        process.exit(1);
    }
}
|
||
// Script entry point: run the indentation step when this module is executed.
await indentLlmsFile();
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.