diff --git a/.github/workflows/sync-docs-full.yml b/.github/workflows/sync-docs-full.yml
index 906e8999..59f719d8 100644
--- a/.github/workflows/sync-docs-full.yml
+++ b/.github/workflows/sync-docs-full.yml
@@ -9,50 +9,20 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Get all MDX files and prepare payload
-        id: files
+      - name: Collect and validate files
         run: |
-          # First find all MDX files recursively
-          echo "Finding all MDX files..."
-          find content -type f -name "*.mdx" | sed 's|^content/||' > mdx_files.txt
-          echo "Found files:"
-          cat mdx_files.txt
+          set -euo pipefail
+          ./bin/collect-all-files.sh | \
+            ./bin/validate-files.sh > all-files.txt
 
-          # Create the changed array by processing each file through jq
-          echo "Processing files..."
-          jq -n --slurpfile paths <(
-            while IFS= read -r path; do
-              [ -z "$path" ] && continue
-              if [ -f "content/$path" ]; then
-                echo "Processing: content/$path"
-                jq -n \
-                  --arg path "$path" \
-                  --arg content "$(base64 -w0 < "content/$path")" \
-                  '{path: $path, content: $content}'
-              fi
-            done < mdx_files.txt | jq -s '.'
-          ) \
-          --slurpfile removed <(cat mdx_files.txt | jq -R . | jq -s .) \
-          --arg repo "$GITHUB_REPOSITORY" \
-          '{
-            repo: $repo,
-            changed: ($paths | .[0] // []),
-            removed: ($removed | .[0] // [])
-          }' > payload.json
-
-          # Show debug info
-          echo "Payload structure (without contents):"
-          jq 'del(.changed[].content)' payload.json
+          echo "Files to sync:"
+          cat all-files.txt
 
-      - name: Send to Agentuity
+      - name: Build and send payload
+        env:
+          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
         run: |
-          echo "About to sync these files:"
-          jq -r '.changed[].path' payload.json
-          echo -e "\nWill first remove these paths:"
-          jq -r '.removed[]' payload.json
-
-          # Uncomment to actually send
-          curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
-            -X POST \
-            -H "Content-Type: application/json" \
-            -d @payload.json
+          set -euo pipefail
+          cat all-files.txt | \
+            ./bin/build-payload.sh "${{ github.repository }}" full | \
+            ./bin/send-webhook.sh "https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6" "Bearer $AGENTUITY_TOKEN"
\ No newline at end of file
diff --git a/.github/workflows/sync-docs.yml b/.github/workflows/sync-docs.yml
index 09a9491a..9e28e839 100644
--- a/.github/workflows/sync-docs.yml
+++ b/.github/workflows/sync-docs.yml
@@ -12,60 +12,24 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
-      - name: Get changed and removed files
-        id: files
+      - name: Collect and validate files
         run: |
+          set -euo pipefail
           git fetch origin ${{ github.event.before }}
+          ./bin/collect-changed-files.sh "${{ github.event.before }}" "${{ github.sha }}" | \
+            ./bin/validate-files.sh > changed-files.txt
 
-          # Get changed files (relative to content directory)
-          CHANGED_FILES=$(git diff --name-only ${{ github.event.before }} ${{ github.sha }} -- 'content/**/*.mdx' | sed 's|^content/||')
-          REMOVED_FILES=$(git diff --name-only --diff-filter=D ${{ github.event.before }} ${{ github.sha }} -- 'content/**/*.mdx' | sed 's|^content/||')
-
-          echo "Changed files: $CHANGED_FILES"
-          echo "Removed files: $REMOVED_FILES"
-
-          # Build JSON payload with file contents
-          payload=$(jq -n \
-            --arg commit "${{ github.sha }}" \
-            --arg repo "${{ github.repository }}" \
-            --argjson changed "$(
-              if [ -n "$CHANGED_FILES" ]; then
-                for f in $CHANGED_FILES; do
-                  if [ -f "content/$f" ]; then
-                    jq -n \
-                      --arg path "$f" \
-                      --arg content "$(base64 -w0 < "content/$f")" \
-                      '{path: $path, content: $content}'
-                  fi
-                done | jq -s '.'
-              else
-                echo '[]'
-              fi
-            )" \
-            --argjson removed "$(
-              if [ -n "$REMOVED_FILES" ]; then
-                printf '%s\n' $REMOVED_FILES | jq -R -s -c 'split("\n") | map(select(length > 0))'
-              else
-                echo '[]'
-              fi
-            )" \
-            '{commit: $commit, repo: $repo, changed: $changed, removed: $removed}'
-          )
-
-          echo "payload<<EOF" >> $GITHUB_OUTPUT
-          echo "$payload" >> $GITHUB_OUTPUT
-          echo "EOF" >> $GITHUB_OUTPUT
+          echo "Files to sync:"
+          cat changed-files.txt
 
-      - name: Trigger Agentuity Sync Agent
+      - name: Build and send payload
         env:
           AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
         run: |
-          echo "Sending payload to agent:"
-          echo '${{ steps.files.outputs.payload }}' | jq '.'
-
-          curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
-            -X POST \
-            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
-            -H "Content-Type: application/json" \
-            -d '${{ steps.files.outputs.payload }}'
+          set -euo pipefail
+          cat changed-files.txt | \
+            ./bin/build-payload.sh "${{ github.repository }}" incremental | \
+            ./bin/send-webhook.sh "https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6" "Bearer $AGENTUITY_TOKEN"
\ No newline at end of file
diff --git a/agent-docs/agentuity.yaml b/agent-docs/agentuity.yaml
index 6f46b42b..89d8f17b 100644
--- a/agent-docs/agentuity.yaml
+++ b/agent-docs/agentuity.yaml
@@ -75,3 +75,6 @@ agents:
     name: doc-processing
     # The description of the Agent which is editable
     description: An applicaiton that process documents
+  - id: agent_9ccc5545e93644bd9d7954e632a55a61
+    name: doc-qa
+    description: Agent that can answer questions based on dev docs as the knowledge base
diff --git a/agent-docs/src/agents/doc-processing/docs-orchestrator.ts b/agent-docs/src/agents/doc-processing/docs-orchestrator.ts
index 48a8d71b..3cdefbb7 100644
--- a/agent-docs/src/agents/doc-processing/docs-orchestrator.ts
+++ b/agent-docs/src/agents/doc-processing/docs-orchestrator.ts
@@ -1,7 +1,7 @@
 import type { AgentContext } from '@agentuity/sdk';
 import { processDoc } from './docs-processor';
-import { VECTOR_STORE_NAME } from './config';
-import type { FilePayload, SyncPayload, SyncStats } from './types';
+import { VECTOR_STORE_NAME } from '../../../../config';
+import type { SyncPayload, SyncStats } from './types';
 /**
  * Helper to remove all vectors for a given logical path from the vector store.
  */
@@ -76,7 +76,8 @@ export async function syncDocsFromPayload(ctx: AgentContext, payload: SyncPayloa
           ...chunk.metadata,
           path: logicalPath,
         };
-        await ctx.vector.upsert(VECTOR_STORE_NAME, chunk);
+        const result = await ctx.vector.upsert(VECTOR_STORE_NAME, chunk);
+        ctx.logger.info('Upserted chunk: %o', result.length);
       }
 
       processed++;
diff --git a/agent-docs/src/agents/doc-processing/docs-processor.ts b/agent-docs/src/agents/doc-processing/docs-processor.ts
index d21b136f..c568d26f 100644
--- a/agent-docs/src/agents/doc-processing/docs-processor.ts
+++ b/agent-docs/src/agents/doc-processing/docs-processor.ts
@@ -3,16 +3,8 @@ import type { VectorUpsertParams } from '@agentuity/sdk';
 import { chunkAndEnrichDoc } from './chunk-mdx';
 import { embedChunks } from './embed-chunks';
 import type { Chunk } from './chunk-mdx';
+import type { ChunkMetadata } from './types';
 
-export type ChunkMetadata = {
-  chunkIndex: number;
-  contentType: string;
-  heading: string;
-  title: string;
-  description: string;
-  text: string;
-  createdAt: string;
-};
 
 /**
  * Processes a single .mdx doc: loads, chunks, and enriches each chunk with metadata.
diff --git a/agent-docs/src/agents/doc-processing/embed-chunks.ts b/agent-docs/src/agents/doc-processing/embed-chunks.ts
index 6f2c97f5..0d508388 100644
--- a/agent-docs/src/agents/doc-processing/embed-chunks.ts
+++ b/agent-docs/src/agents/doc-processing/embed-chunks.ts
@@ -29,6 +29,6 @@ export async function embedChunks(
 
   if (!response.embeddings || response.embeddings.length !== texts.length) {
     throw new Error('Embedding API returned unexpected result.');
   }
-  }
+  return response.embeddings;
 }
\ No newline at end of file
diff --git a/agent-docs/src/agents/doc-processing/index.ts b/agent-docs/src/agents/doc-processing/index.ts
index 6bb74899..629e0773 100644
--- a/agent-docs/src/agents/doc-processing/index.ts
+++ b/agent-docs/src/agents/doc-processing/index.ts
@@ -1,6 +1,6 @@
 import type { AgentContext, AgentRequest, AgentResponse } from '@agentuity/sdk';
 import { syncDocsFromPayload } from './docs-orchestrator';
-import type { FilePayload, SyncPayload } from './types';
+import type { SyncPayload } from './types';
 
 export const welcome = () => {
   return {
diff --git a/agent-docs/src/agents/doc-processing/types.ts b/agent-docs/src/agents/doc-processing/types.ts
index 3a0815aa..bba55c9e 100644
--- a/agent-docs/src/agents/doc-processing/types.ts
+++ b/agent-docs/src/agents/doc-processing/types.ts
@@ -15,4 +15,15 @@ export interface SyncStats {
   deleted: number;
   errors: number;
   errorFiles: string[];
-}
\ No newline at end of file
+}
+
+export type ChunkMetadata = {
+  chunkIndex: number;
+  contentType: string;
+  heading: string;
+  title: string;
+  description: string;
+  text: string;
+  createdAt: string;
+  path?: string;
+};
diff --git a/agent-docs/src/agents/doc-qa/index.ts b/agent-docs/src/agents/doc-qa/index.ts
new file mode 100644
index 00000000..54351f89
--- /dev/null
+++ b/agent-docs/src/agents/doc-qa/index.ts
@@ -0,0 +1,120 @@
+import type { AgentContext, AgentRequest, AgentResponse } from '@agentuity/sdk';
+import { streamText } from 'ai';
+import { openai } from '@ai-sdk/openai';
+
+import type { ChunkMetadata } from '../doc-processing/types';
+import { VECTOR_STORE_NAME, vectorSearchNumber } from '../../../../config';
+import type { RelevantDoc } from './types';
+
+export default async function Agent(
+  req: AgentRequest,
+  resp: AgentResponse,
+  ctx: AgentContext
+) {
+  const prompt = await req.data.text();
+  const relevantDocs = await retrieveRelevantDocs(ctx, prompt);
+
+  const systemPrompt = `
+You are a developer documentation assistant. Your job is to answer user questions about the Agentuity platform as effectively and concisely as possible, adapting your style to the user's request. If the user asks for a direct answer, provide it without extra explanation. If they want an explanation, provide a clear and concise one. Use only the provided relevant documents to answer.
+
+You must not make up an answer when the provided documents do not contain one. Be direct with the user when the documentation
+does not seem to include what they are looking for; misleading them is prohibited, as it only slows them down. Feel free to
+suggest follow-up questions when their request has no answer in the documents, and mention a few related topics the
+documents do cover that may interest them.
+
+For every answer, return a valid JSON object with:
+  1. "answer": your answer to the user's question.
+  2. "documents": an array of strings, representing the path of the documents you used to answer.
+
+If you use information from a document, include it in the "documents" array. If you do not use any documents, return an empty array for "documents".
+
+User question:
+\`\`\`
+${prompt}
+\`\`\`
+
+Relevant documents:
+${JSON.stringify(relevantDocs, null, 2)}
+
+Respond ONLY with a valid JSON object as described above. In your answer, format any code as proper Markdown code blocks when the question calls for code.
+`.trim();
+
+  const llmResponse = await streamText({
+    model: openai('gpt-4o'),
+    system: systemPrompt,
+    prompt: prompt,
+    maxTokens: 2048,
+  });
+
+  return resp.stream(llmResponse.textStream);
+}
+
+async function retrieveRelevantDocs(ctx: AgentContext, prompt: string): Promise<RelevantDoc[]> {
+  const dbQuery = {
+    query: prompt,
+    limit: vectorSearchNumber
+  }
+  try {
+    const vectors = await ctx.vector.search(VECTOR_STORE_NAME, dbQuery);
+
+    const uniquePaths = new Set<string>();
+
+    vectors.forEach(vec => {
+      if (!vec.metadata) {
+        ctx.logger.warn('Vector missing metadata');
+        return;
+      }
+      const path = typeof vec.metadata.path === 'string' ? vec.metadata.path : undefined;
+      if (!path) {
+        ctx.logger.warn('Vector metadata path is not a string');
+        return;
+      }
+      uniquePaths.add(path);
+    });
+
+    const docs = await Promise.all(
+      Array.from(uniquePaths).map(async path => ({
+        path,
+        content: await retrieveDocumentBasedOnPath(ctx, path)
+      }))
+    );
+
+    return docs;
+  } catch (err) {
+    ctx.logger.error('Error retrieving relevant docs: %o', err);
+    return [];
+  }
+}
+
+async function retrieveDocumentBasedOnPath(ctx: AgentContext, path: string): Promise<string> {
+  const dbQuery = {
+    query: ' ',
+    limit: 10000,
+    metadata: {
+      path: path
+    }
+  }
+  try {
+    const vectors = await ctx.vector.search(VECTOR_STORE_NAME, dbQuery);
+
+    // Sort vectors by chunk index and concatenate text
+    const sortedVectors = vectors
+      .map(vec => {
+        const metadata = vec.metadata as ChunkMetadata;
+        return {
+          metadata,
+          index: metadata.chunkIndex
+        };
+      })
+      .sort((a, b) => a.index - b.index);
+
+    const fullText = sortedVectors
+      .map(vec => vec.metadata.text)
+      .join('\n\n');
+
+    return fullText;
+  } catch (err) {
+    ctx.logger.error('Error retrieving document by path %s: %o', path, err);
+    return '';
+  }
+}
\ No newline at end of file
diff --git a/agent-docs/src/agents/doc-qa/types.ts b/agent-docs/src/agents/doc-qa/types.ts
new file mode 100644
index 00000000..9fa227ff
--- /dev/null
+++ b/agent-docs/src/agents/doc-qa/types.ts
@@ -0,0 +1,5 @@
+export interface RelevantDoc {
+  path: string;
+  content: string;
+}
+
\ No newline at end of file
diff --git a/bin/build-payload.sh b/bin/build-payload.sh
new file mode 100755
index 00000000..23f8e021
--- /dev/null
+++ b/bin/build-payload.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+set -euo pipefail
+
+# build-payload.sh <repo_name> [mode]
+# Reads file paths from stdin, builds JSON payload
+# mode: "incremental" (default) or "full"
+
+usage() {
+  echo "Usage: $0 <repo_name> [mode]" >&2
+  echo "Example: $0 'owner/repo' incremental" >&2
+  echo "Modes: incremental (default), full" >&2
+  exit 1
+}
+
+if [ $# -lt 1 ]; then
+  usage
+fi
+
+REPO_NAME="$1"
+MODE="${2:-incremental}"
+
+echo "Building $MODE sync payload for $REPO_NAME" >&2
+
+# Read all file paths into arrays
+changed_files=()
+removed_files=()
+
+while IFS= read -r file; do
+  if [ -z "$file" ]; then
+    continue
+  fi
+
+  if [[ "$file" == REMOVED:* ]]; then
+    # Remove the REMOVED: prefix
+    removed_file="${file#REMOVED:}"
+    removed_files+=("$removed_file")
+    echo " removed: $removed_file" >&2
+  else
+    changed_files+=("$file")
+    echo " changed: $file" >&2
+  fi
+done
+
+echo "Processing ${#changed_files[@]} changed files and ${#removed_files[@]} removed files" >&2
+
+# For full mode, all files should be removed first
+if [ "$MODE" = "full" ]; then
+  # Copy changed files to removed files for full refresh
+  removed_files=("${changed_files[@]}")
+  echo "Full mode: treating all files as removed for refresh" >&2
+fi
+
+# Start building JSON
+# Note: the payload is assembled by hand, so paths are embedded verbatim; this
+# assumes doc paths contain no quotes, backslashes, or other JSON-special characters.
+echo "{"
+echo " \"repo\": \"$REPO_NAME\","
+
+# Build changed files array
+echo " \"changed\": ["
+first=true
+for file in "${changed_files[@]}"; do
+  if [ -f "content/$file" ]; then
+    if [ "$first" = true ]; then
+      first=false
+    else
+      echo ","
+    fi
+
+    # Read file content and base64 encode
+    content=$(base64 -w0 < "content/$file")
+
+    echo -n " {"
+    echo -n "\"path\": \"$file\", "
+    echo -n "\"content\": \"$content\""
+    echo -n "}"
+  fi
+done
+echo ""
+echo " ],"
+
+# Build removed files array
+echo " \"removed\": ["
+first=true
+for file in "${removed_files[@]}"; do
+  if [ "$first" = true ]; then
+    first=false
+  else
+    echo ","
+  fi
+  echo -n " \"$file\""
+done
+echo ""
+echo " ]"
+echo "}"
+
+echo "Payload build complete" >&2
\ No newline at end of file
diff --git a/bin/collect-all-files.sh b/bin/collect-all-files.sh
new file mode 100755
index 00000000..8353acdd
--- /dev/null
+++ b/bin/collect-all-files.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+set -euo pipefail
+trap "" PIPE
+
+# collect-all-files.sh
+# Outputs newline-separated list of all MDX files (relative to content/)
+
+echo "Collecting all MDX files for full sync" >&2
+
+if [ ! -d "content" ]; then
+  echo "Error: content directory not found" >&2
+  exit 1
+fi
+
+# Find all MDX files
+find content -type f -name "*.mdx" | \
+  sed 's|^content/||' | \
+  sort | \
+  while read -r file; do
+    if [ -n "$file" ] && [ -f "content/$file" ]; then
+      echo "$file"
+      echo " found: $file" >&2
+    fi
+  done
+
+# Count and report
+file_count=$(find content -type f -name "*.mdx" | wc -l)
+echo "Total files found: $file_count" >&2
\ No newline at end of file
diff --git a/bin/collect-changed-files.sh b/bin/collect-changed-files.sh
new file mode 100755
index 00000000..73cd12f3
--- /dev/null
+++ b/bin/collect-changed-files.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -euo pipefail
+
+# collect-changed-files.sh <before_commit> <after_commit>
+# Outputs newline-separated list of changed MDX files (relative to content/)
+
+usage() {
+  echo "Usage: $0 <before_commit> <after_commit>" >&2
+  echo "Example: $0 HEAD~1 HEAD" >&2
+  exit 1
+}
+
+if [ $# -ne 2 ]; then
+  usage
+fi
+
+BEFORE_COMMIT="$1"
+AFTER_COMMIT="$2"
+
+# Validate commits exist
+if ! git rev-parse --verify "$BEFORE_COMMIT" >/dev/null 2>&1; then
+  echo "Error: Invalid before commit: $BEFORE_COMMIT" >&2
+  exit 1
+fi
+
+if ! git rev-parse --verify "$AFTER_COMMIT" >/dev/null 2>&1; then
+  echo "Error: Invalid after commit: $AFTER_COMMIT" >&2
+  exit 1
+fi
+
+echo "Collecting changed files between $BEFORE_COMMIT and $AFTER_COMMIT" >&2
+
+# Get changed files (excluding deleted)
+# grep exits 1 when nothing matches, which would abort the script under
+# `set -o pipefail`; `|| true` lets an empty change set through cleanly.
+echo "Changed files:" >&2
+git diff --name-only "$BEFORE_COMMIT" "$AFTER_COMMIT" -- 'content/**/*.mdx' | \
+  { grep '^content/' || true; } | \
+  sed 's|^content/||' | \
+  while read -r file; do
+    if [ -n "$file" ] && [ -f "content/$file" ]; then
+      echo "$file"
+      echo " + $file" >&2
+    fi
+  done
+
+# Get removed files
+echo "Removed files:" >&2
+git diff --name-only --diff-filter=D "$BEFORE_COMMIT" "$AFTER_COMMIT" -- 'content/**/*.mdx' | \
+  { grep '^content/' || true; } | \
+  sed 's|^content/||' | \
+  while read -r file; do
+    if [ -n "$file" ]; then
+      echo "REMOVED:$file"
+      echo " - $file" >&2
+    fi
+  done
\ No newline at end of file
diff --git a/bin/send-webhook.sh b/bin/send-webhook.sh
new file mode 100755
index 00000000..ceda5e9e
--- /dev/null
+++ b/bin/send-webhook.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+set -euo pipefail
+
+# send-webhook.sh <webhook_url> [auth_token]
+# Reads JSON payload from stdin, sends to webhook with retries
+
+usage() {
+  echo "Usage: $0 <webhook_url> [auth_token]" >&2
+  echo "Example: $0 'https://example.com/webhook' 'Bearer token123'" >&2
+  exit 1
+}
+
+if [ $# -lt 1 ]; then
+  usage
+fi
+
+WEBHOOK_URL="$1"
+AUTH_TOKEN="${2:-}"
+MAX_RETRIES=3
+RETRY_DELAY=2
+
+echo "Sending webhook to $WEBHOOK_URL" >&2
+
+# Read payload from stdin
+payload=$(cat)
+
+if [ -z "$payload" ]; then
+  echo "Error: No payload received from stdin" >&2
+  exit 1
+fi
+
+# Validate JSON
+if ! echo "$payload" | jq . >/dev/null 2>&1; then
+  echo "Error: Invalid JSON payload" >&2
+  exit 1
+fi
+
+echo "Payload size: $(echo "$payload" | wc -c) bytes" >&2
+
+# Build curl command
+curl_args=(
+  -X POST
+  -H "Content-Type: application/json"
+  -d "$payload"
+  --fail
+  --show-error
+  --silent
+)
+
+# Add auth header if provided
+if [ -n "$AUTH_TOKEN" ]; then
+  curl_args+=(-H "Authorization: $AUTH_TOKEN")
+fi
+
+# Retry logic
+for attempt in $(seq 1 $MAX_RETRIES); do
+  echo "Attempt $attempt/$MAX_RETRIES..." >&2
+
+  if response=$(curl "${curl_args[@]}" "$WEBHOOK_URL" 2>&1); then
+    echo "Success! Response:" >&2
+    echo "$response" >&2
+    echo "$response"
+    exit 0
+  else
+    echo "Attempt $attempt failed: $response" >&2
+
+    if [ $attempt -lt $MAX_RETRIES ]; then
+      echo "Retrying in ${RETRY_DELAY}s..." >&2
+      sleep $RETRY_DELAY
+      # Exponential backoff
+      RETRY_DELAY=$((RETRY_DELAY * 2))
+    fi
+  fi
+done
+
+echo "Error: All $MAX_RETRIES attempts failed" >&2
+exit 1
\ No newline at end of file
diff --git a/bin/validate-files.sh b/bin/validate-files.sh
new file mode 100755
index 00000000..6752d38d
--- /dev/null
+++ b/bin/validate-files.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -euo pipefail
+
+# validate-files.sh
+# Reads file paths from stdin, validates they exist and are safe
+# Outputs only valid file paths
+
+echo "Validating file paths" >&2
+
+# Counters use plain arithmetic expansion rather than ((var++)): under `set -e`,
+# ((var++)) returns non-zero while var is 0 and would abort on the first file.
+valid_count=0
+invalid_count=0
+
+# Read all input into an array first
+mapfile -t files
+
+for file in "${files[@]}"; do
+  # Skip empty lines
+  if [ -z "$file" ]; then
+    continue
+  fi
+
+  # Handle REMOVED: prefix
+  if [[ "$file" == REMOVED:* ]]; then
+    echo "$file"
+    valid_count=$((valid_count + 1))
+    continue
+  fi
+
+  # Security check: prevent path traversal
+  if [[ "$file" == *".."* ]] || [[ "$file" == "/"* ]]; then
+    echo "Warning: Unsafe path detected, skipping: $file" >&2
+    invalid_count=$((invalid_count + 1))
+    continue
+  fi
+
+  # Check if file exists
+  if [ -f "content/$file" ]; then
+    echo "$file"
+    echo " ✓ $file" >&2
+    valid_count=$((valid_count + 1))
+  else
+    echo "Warning: File not found, skipping: $file" >&2
+    invalid_count=$((invalid_count + 1))
+  fi
+done
+
+echo "Validation complete: $valid_count valid, $invalid_count invalid" >&2
+
+# Exit with error if no valid files
+if [ "$valid_count" -eq 0 ]; then
+  echo "Error: No valid files found" >&2
+  exit 1
+fi
\ No newline at end of file
diff --git a/agent-docs/src/agents/doc-processing/config.ts b/config.ts
similarity index 57%
rename from agent-docs/src/agents/doc-processing/config.ts
rename to config.ts
index 214f156c..3088c1f8 100644
--- a/agent-docs/src/agents/doc-processing/config.ts
+++ b/config.ts
@@ -1 +1,2 @@
-export const VECTOR_STORE_NAME = process.env.VECTOR_STORE_NAME || 'docs';
\ No newline at end of file
+export const VECTOR_STORE_NAME = process.env.VECTOR_STORE_NAME || 'docs';
+export const vectorSearchNumber = 20;
\ No newline at end of file
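The four bin/ scripts above compose into a single pipeline, and they can be exercised locally before the workflows run them in CI. Below is a minimal dry-run sketch, assuming bash with git, jq, and curl on PATH and the repository root as the working directory; the repo slug, webhook URL, and token are placeholders, not values taken from this change:

#!/bin/bash
# Local dry run of the docs sync pipeline (sketch; placeholder repo/URL/token).
set -euo pipefail

# Full sync: list every MDX file, validate the paths, and build the JSON payload.
./bin/collect-all-files.sh \
  | ./bin/validate-files.sh \
  | ./bin/build-payload.sh "my-org/my-docs" full > /tmp/payload.json

# Inspect the payload without the base64 file contents (mirrors the debug line
# the old workflow used).
jq 'del(.changed[].content)' /tmp/payload.json

# Incremental sync between two commits, sent through the retrying webhook client.
./bin/collect-changed-files.sh HEAD~1 HEAD \
  | ./bin/validate-files.sh \
  | ./bin/build-payload.sh "my-org/my-docs" incremental \
  | ./bin/send-webhook.sh "https://example.com/webhook" "Bearer $AGENTUITY_TOKEN"

Because validate-files.sh exits non-zero when no valid paths survive, the incremental pipeline fails fast on pushes that touch no MDX files rather than sending an empty payload.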