agentuity · afterrburn · Jun 19, 2025 · Jun 14, 2025 · Jun 14, 2025 · Jun 14, 2025
diff --git a/.github/workflows/sync-docs-full.yml b/.github/workflows/sync-docs-full.yml
@@ -0,0 +1,58 @@
+name: Full Docs Sync to Vector Store
+
+on:
+  # Manual trigger only; this workflow declares no push or schedule events.
+  workflow_dispatch:
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      # Check out the repository so the later steps can read content/.
+      - uses: actions/checkout@v4
+
+      - name: Get all MDX files and prepare payload
+        id: files
+        run: |
+          # Abort on the first failing command (including inside pipelines)
+          # so a broken payload is never handed to the send step.
+          set -euo pipefail
+
+          # First find all MDX files recursively (paths relative to content/)
+          echo "Finding all MDX files..."
+          find content -type f -name "*.mdx" | sed 's|^content/||' > mdx_files.txt
+          echo "Found files:"
+          cat mdx_files.txt
+
+          # Emit one {path, content} object per file; content is base64-encoded.
+          # Progress messages go to stderr: stdout of this loop is the JSON
+          # stream, and any stray text in it makes jq reject the whole stream,
+          # which would leave `changed` empty while `removed` still lists
+          # every file.
+          echo "Processing files..."
+          while IFS= read -r path; do
+            if [ -n "$path" ] && [ -f "content/$path" ]; then
+              echo "Processing: content/$path" >&2
+              jq -n \
+                --arg path "$path" \
+                --arg content "$(base64 -w0 < "content/$path")" \
+                '{path: $path, content: $content}'
+            fi
+          done < mdx_files.txt > changed.jsonl
+
+          # Every discovered path is also listed under `removed`
+          # (one JSON string per line; --slurpfile gathers them into an array).
+          jq -R . mdx_files.txt > removed.jsonl
+
+          # Assemble the final payload; empty inputs become empty arrays.
+          jq -n \
+            --slurpfile changed changed.jsonl \
+            --slurpfile removed removed.jsonl \
+            --arg repo "$GITHUB_REPOSITORY" \
+            '{
+              repo: $repo,
+              changed: $changed,
+              removed: $removed
+            }' > payload.json
+
+          # Show debug info
+          echo "Payload structure (without contents):"
+          jq 'del(.changed[].content)' payload.json
+
+      - name: Send to Agentuity
+        env:
+          # Same credential the incremental sync workflow sends to this webhook.
+          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
+        run: |
+          echo "About to sync these files:"
+          jq -r '.changed[].path' payload.json
+          echo -e "\nWill first remove these paths:"
+          jq -r '.removed[]' payload.json
+
+          # POST the payload built by the previous step.
+          # --fail-with-body turns an HTTP error status into a failed step
+          # (while still printing the response) instead of a silent success.
+          curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
+            --fail-with-body \
+            -X POST \
+            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
+            -H "Content-Type: application/json" \
+            -d @payload.json
diff --git a/.github/workflows/sync-docs.yml b/.github/workflows/sync-docs.yml
@@ -0,0 +1,71 @@
+name: Sync Docs to Vector Store
+
+on:
+  # Run only for pushes to main that touch something under content/.
+  push:
+    branches:
+      - main
+    paths:
+      - 'content/**'
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      # Check out the pushed commit so the later steps can read content/.
+      - uses: actions/checkout@v4
+
+      - name: Get changed and removed files
+        id: files
+        env:
+          # Pass event data through the environment instead of splicing
+          # expressions into the script text.
+          BEFORE_SHA: ${{ github.event.before }}
+          HEAD_SHA: ${{ github.sha }}
+          REPO: ${{ github.repository }}
+        run: |
+          set -euo pipefail
+
+          # Make sure the pre-push commit is available locally for the diff.
+          git fetch origin "$BEFORE_SHA"
+
+          # Diff options used below:
+          #   :(glob)       makes `**/` match zero or more directories, so MDX
+          #                 files directly under content/ are included too.
+          #   --no-renames  reports a rename as delete + add, so the old path
+          #                 lands in `removed` instead of being left behind.
+          #   -z            NUL-terminates paths, so names containing spaces
+          #                 or non-ASCII characters arrive unmodified.
+          mdx_pathspec=':(glob)content/**/*.mdx'
+
+          # Changed files that still exist, as [{path, content}] with
+          # base64-encoded content and paths relative to content/.
+          # `jq -s` yields [] when the loop emits nothing.
+          changed=$(
+            git diff --name-only --no-renames -z "$BEFORE_SHA" "$HEAD_SHA" -- "$mdx_pathspec" |
+              while IFS= read -r -d '' file; do
+                if [ -f "$file" ]; then
+                  jq -n \
+                    --arg path "${file#content/}" \
+                    --arg content "$(base64 -w0 < "$file")" \
+                    '{path: $path, content: $content}'
+                fi
+              done | jq -s '.'
+          )
+
+          # Deleted files, as a JSON array of paths relative to content/.
+          removed=$(
+            git diff --name-only --no-renames --diff-filter=D -z "$BEFORE_SHA" "$HEAD_SHA" -- "$mdx_pathspec" |
+              while IFS= read -r -d '' file; do
+                jq -n --arg path "${file#content/}" '$path'
+              done | jq -s '.'
+          )
+
+          echo "Changed files: $(jq -r '[.[].path] | join(" ")' <<< "$changed")"
+          echo "Removed files: $(jq -r 'join(" ")' <<< "$removed")"
+
+          # Build JSON payload with file contents
+          payload=$(jq -n \
+            --arg commit "$HEAD_SHA" \
+            --arg repo "$REPO" \
+            --argjson changed "$changed" \
+            --argjson removed "$removed" \
+            '{commit: $commit, repo: $repo, changed: $changed, removed: $removed}')
+
+          # Publish as the multi-line step output `payload`
+          # (name<<DELIMITER ... DELIMITER syntax).
+          {
+            echo "payload<<EOF"
+            echo "$payload"
+            echo "EOF"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Trigger Agentuity Sync Agent
+        env:
+          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
+          # Hand the payload to the shell as data. Expanding the expression
+          # inside the script text would let any quote character in the JSON
+          # (for example an apostrophe in a file path) end the shell string.
+          PAYLOAD: ${{ steps.files.outputs.payload }}
+        run: |
+          echo "Sending payload to agent:"
+          printf '%s\n' "$PAYLOAD" | jq '.'
+
+          # Stream the body on stdin so it is sent verbatim, and let
+          # --fail-with-body turn an HTTP error status into a failed step.
+          printf '%s' "$PAYLOAD" | curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
+            --fail-with-body \
+            -X POST \
+            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
+            -H "Content-Type: application/json" \
+            --data-binary @-
-      - name: Trigger Agentuity Sync Agent
-        env:
-          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
-        run: |
-          echo "Sending payload to agent:"
-          echo '${{ steps.files.outputs.payload }}' | jq '.'
-          
-          curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
-            -X POST \
-            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
-            -H "Content-Type: application/json" \
-            -d '${{ steps.files.outputs.payload }}'
+      - name: Trigger Agentuity Sync Agent
+        env:
+          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
+          # Hand the payload to the shell as data. Expanding the expression
+          # inside a quoted shell string does not work for JSON: in double
+          # quotes the shell consumes every `"` of the document, and in single
+          # quotes any apostrophe ends the string.
+          PAYLOAD: ${{ steps.files.outputs.payload }}
+        run: |
+          echo "Sending payload to agent:"
+          printf '%s\n' "$PAYLOAD" | jq '.'
+
+          # Stream the body on stdin so it is sent verbatim, and let
+          # --fail-with-body turn an HTTP error status into a failed step.
+          printf '%s' "$PAYLOAD" | curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
+            --fail-with-body \
+            -X POST \
+            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
+            -H "Content-Type: application/json" \
+            --data-binary @-
-      - name: Trigger Agentuity Sync Agent
-        env:
-          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
-        run: |
-          echo "Sending payload to agent:"
-          echo '${{ steps.files.outputs.payload }}' | jq '.'
-          
-          curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
-            -X POST \
-            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
-            -H "Content-Type: application/json" \
-            -d '${{ steps.files.outputs.payload }}'
+      - name: Trigger Agentuity Sync Agent
+        env:
+          AGENTUITY_TOKEN: ${{ secrets.AGENTUITY_TOKEN }}
+          # Hand the payload to the shell as data. Expanding the expression
+          # inside a quoted shell string does not work for JSON: in double
+          # quotes the shell consumes every `"` of the document, and in single
+          # quotes any apostrophe ends the string.
+          PAYLOAD: ${{ steps.files.outputs.payload }}
+        run: |
+          echo "Sending payload to agent:"
+          printf '%s\n' "$PAYLOAD" | jq '.'
+
+          # Stream the body on stdin so it is sent verbatim, and let
+          # --fail-with-body turn an HTTP error status into a failed step.
+          printf '%s' "$PAYLOAD" | curl https://agentuity.ai/webhook/f61d5ce9d6ed85695cc992c55ccdc2a6 \
+            --fail-with-body \
+            -X POST \
+            -H "Authorization: Bearer $AGENTUITY_TOKEN" \
+            -H "Content-Type: application/json" \
+            --data-binary @-
diff --git a/agent-docs/RAG-TODO.md b/agent-docs/RAG-TODO.md
@@ -0,0 +1,54 @@
+# RAG System Implementation TODOs
+
+## 1. Document Chunking & Metadata
+- [x] Refine and test the chunking logic for MDX files.
+- [x] Implement full metadata enrichment (id, path, chunkIndex, contentType, heading, keywords) in the chunking/processing pipeline.
+- [x] Write unit tests for chunking and metadata extraction.
+
+## 2. Keyword Extraction
+- [x] Implement LLM-based keyword extraction for each chunk.
+- [x] Write tests to validate keyword extraction quality.
+- [ ] Integrate keyword extraction into the document processing pipeline.
+
+## 3. Embedding Generation
+- [x] Implement embedding function for batch processing of chunk texts (using OpenAI SDK or Agentuity vector store as appropriate).
+- [x] Integrate embedding generation into the chunk processing pipeline.
+- [ ] Write tests to ensure embeddings are generated and stored correctly.
+
+## 4. Vector Store Integration
+- [x] Set up Agentuity vector database integration.
+- [x] Store chunk content, metadata, keywords, and embeddings.
+
+## 5. Hybrid Retrieval Logic
+- [ ] Implement hybrid search (semantic + keyword boosting).
+- [ ] Write tests to ensure correct ranking and recall.
+
+## 6. Reranker Integration
+- [ ] Integrate reranker model (API or local).
+- [ ] Implement reranking step after hybrid retrieval.
+- [ ] Write tests to validate reranker improves result quality.
+
+## 7. API Layer
+- [ ] Build modular API endpoints for search and retrieval.
+- [ ] Ensure endpoints are stateless and testable.
+- [ ] Write API tests (unit and integration).
+
+## 8. UI Integration
+- [ ] Add search bar and results display to documentation site.
+- [ ] Implement keyword highlighting and breadcrumb navigation.
+- [ ] Write UI tests for search and result presentation.
+
+## 9. Monitoring & Analytics
+- [ ] Add logging for search queries and result quality.
+- [ ] Implement feedback mechanism for users to rate results.
+
+## 10. Documentation & Developer Experience
+- [ ] Document each module and its tests.
+- [ ] Provide clear setup and usage instructions.
+
+## 11. Sync/Processor Workflow Design
+- [x] Design the documentation sync workflow:
+    - [x] Primary: Trigger sync via CI/CD or GitHub Action after merges to main/deploy branch.
+    - [x] Optional: Implement a webhook endpoint for manual or CMS-triggered syncs.
+    - [x] Ensure the sync process is idempotent and efficient (only updates changed docs/chunks).
+    - [x] Plan for operational workflow implementation after core modules are complete.