Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/embedding-unit-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# GitHub Actions workflow: unit tests for the embedding-generation component.
name: Embedding Generation Unit Tests

# Trigger on pushes to main and on pull requests targeting main, but only
# when embedding-generation code (or this workflow file itself) changes.
on:
  push:
    branches: [main]
    paths:
      - 'embedding-generation/**'
      - '.github/workflows/embedding-unit-tests.yml'
  pull_request:
    branches: [main]
    paths:
      - 'embedding-generation/**'
      - '.github/workflows/embedding-unit-tests.yml'

jobs:
  test:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every run step executes from the component directory, so relative
        # paths (requirements.txt, tests/) resolve correctly.
        working-directory: embedding-generation

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest

      - name: Run unit tests
        # -v: list each test by name; --tb=short: compact tracebacks on failure.
        run: python -m pytest tests/ -v --tb=short
4 changes: 2 additions & 2 deletions embedding-generation/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ WORKDIR /embedding-data
# Copy Python scripts and dependencies
COPY generate-chunks.py .
COPY local_vectorstore_creation.py .
COPY urls-to-chunk.csv .
COPY vector-db-sources.csv .
COPY requirements.txt .

# Copy intrinsic chunks data from the cached base image
Expand All @@ -43,7 +43,7 @@ COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks
RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt

# Generate vector database
RUN python3 generate-chunks.py urls-to-chunk.csv && \
RUN python3 generate-chunks.py vector-db-sources.csv && \
python3 local_vectorstore_creation.py && \
rm -f embeddings_*.txt

Expand Down
198 changes: 171 additions & 27 deletions embedding-generation/generate-chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,22 +96,103 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
# Global list guarding against duplicate entries produced by cross-platform
# learning paths (the same learning path can appear under several platforms).
cross_platform_lps_dont_duplicate = []

# Global tracking state for vector-db-sources.csv.
# Set of URLs already present in the CSV, used for fast deduplication checks.
known_source_urls = set()
# All source entries, both pre-existing and newly discovered during this run.
# Each entry is a dict: {site_name, license_type, display_name, url, keywords}
all_sources = []

# Raise the csv module's field size limit (default is 131,072 bytes) so very
# large chunk fields don't raise an error when parsed.
csv.field_size_limit(10**9)  # 1,000,000,000 — well within 64-bit range, avoids Python OverflowError


def load_existing_sources(csv_file):
    """Populate the module-level source registry from an existing CSV file.

    Resets ``known_source_urls`` and ``all_sources``, then reads each row of
    *csv_file* (if it exists), recording every row with a non-empty URL so
    later discovery can skip duplicates.
    """
    global known_source_urls, all_sources

    known_source_urls = set()
    all_sources = []

    if not os.path.exists(csv_file):
        print(f"Sources file '{csv_file}' does not exist. Starting fresh.")
        return

    with open(csv_file, 'r', newline='', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            source_url = record.get('URL', '').strip()
            if not source_url:
                continue  # rows without a URL cannot be deduplicated; skip them
            known_source_urls.add(source_url)
            all_sources.append({
                'site_name': record.get('Site Name', ''),
                'license_type': record.get('License Type', ''),
                'display_name': record.get('Display Name', ''),
                'url': source_url,
                'keywords': record.get('Keywords', '')
            })

    print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")


def register_source(site_name, license_type, display_name, url, keywords):
    """Record a source URL in the global registry, skipping duplicates.

    *keywords* may be a pre-formatted string or an iterable of strings that
    will be joined with '; '. Returns True when the URL was newly added,
    False when it was already known.
    """
    global known_source_urls, all_sources

    # Strip surrounding whitespace so equivalent URLs compare equal.
    url = url.strip()
    if url in known_source_urls:
        return False

    keyword_str = keywords if isinstance(keywords, str) else '; '.join(keywords)
    known_source_urls.add(url)
    all_sources.append({
        'site_name': site_name,
        'license_type': license_type,
        'display_name': display_name,
        'url': url,
        'keywords': keyword_str
    })
    print(f"[NEW SOURCE] {display_name}: {url}")
    return True


def save_sources_csv(csv_file):
    """Persist the full source registry (existing + newly discovered) to *csv_file*."""
    header = ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']
    # Column order in the output file mirrors the header above.
    field_order = ('site_name', 'license_type', 'display_name', 'url', 'keywords')

    with open(csv_file, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(
            [entry[field] for field in field_order] for entry in all_sources
        )

    print(f"Saved {len(all_sources)} sources to '{csv_file}'")

class Chunk:
def __init__(self, title, url, uuid, keywords, content):
    """Create a chunk of site content destined for the vector database.

    title    -- human-readable chunk title
    url      -- canonical URL the chunk was extracted from
    uuid     -- unique identifier string for this chunk
    keywords -- list of keyword strings (normalized by formatKeywords)
    content  -- the chunk's text body
    """
    self.title = title
    self.url = url
    self.uuid = uuid
    self.content = content

    # Translate keyword list into a comma-separated, lowercase string.
    self.keywords = self.formatKeywords(keywords)


def formatKeywords(self,keywords):
return ', '.join(keywords).lower().strip()
def formatKeywords(self, keywords):
    """Normalize a keyword list into one lowercase, comma-separated string."""
    stripped = [word.strip() for word in keywords]
    return ', '.join(stripped).lower()

# Used to dump into a yaml file without difficulty
def toDict(self):
Expand Down Expand Up @@ -180,9 +261,20 @@ def createTextSnippet(main_row):
keywords.append(c.replace('tag-license-','').replace('tag-category-',''))


package_url = f"{url}?package={package_name_urlized}"

# Register this ecosystem dashboard entry as a source
register_source(
site_name='Ecosystem Dashboard',
license_type='Arm Proprietary',
display_name=f'Ecosystem Dashboard - {package_name}',
url=package_url,
keywords=keywords
)

chunk = Chunk(
title = f"Ecosystem Dashboard - {package_name}",
url = f"{url}?package={package_name_urlized}",
url = package_url,
uuid = str(uuid.uuid4()),
keywords = keywords,
content = text_snippet
Expand Down Expand Up @@ -352,6 +444,27 @@ def chunkizeLearningPath(relative_url, title, keywords):

response = http_session.get(url, timeout=60)
soup = BeautifulSoup(response.text, 'html.parser')

# Get learning path title and keywords once for registration
lp_title_elem = soup.find(id='learning-path-title')
if lp_title_elem:
lp_title = lp_title_elem.get_text()
ads_tags = soup.findAll('ads-tag')
lp_keywords = []
for tag in ads_tags:
keyword = tag.get_text().strip()
if keyword not in lp_keywords:
lp_keywords.append(keyword)

# Register this learning path as a source
register_source(
site_name='Learning Paths',
license_type='CC4.0',
display_name=f'Learning Path - {lp_title}',
url=url,
keywords=lp_keywords
)

for link in soup.find_all(class_='inner-learning-path-navbar-element'):
#Ignore mobile links
if 'content-individual-a-mobile' not in link.get('class', []):
Expand Down Expand Up @@ -392,11 +505,24 @@ def chunkizeLearningPath(relative_url, title, keywords):
ig_soup = BeautifulSoup(ig_response.text, 'html.parser')

# obtain title of Install Guide
title = 'Install Guide - '+ ig_soup.find(id='install-guide-title').get_text()
ig_title_elem = ig_soup.find(id='install-guide-title')
if not ig_title_elem:
continue
ig_title = ig_title_elem.get_text()
title = 'Install Guide - '+ ig_title


# Obtain keywords of learning path
keywords = [ig_soup.find(id='install-guide-title').get_text(), 'install','build', 'download']
keywords = [ig_title, 'install','build', 'download']

# Register this install guide as a source
register_source(
site_name='Install Guides',
license_type='CC4.0',
display_name=title,
url=ig_url,
keywords=keywords
)

# Processing to check for multi-install
multi_install_guides = ig_soup.find_all(class_='multi-install-card')
Expand Down Expand Up @@ -447,27 +573,28 @@ def createLearningPathChunks():


def readInCSV(csv_file):
    """Read the sources CSV file and return its columns for processing.

    Uses csv.DictReader so quoted fields containing commas are parsed
    correctly (the naive ``line.split(',')`` approach broke on them).

    Returns a tuple ``(csv_dict, count)`` where ``csv_dict`` maps
    'urls', 'focus', and 'source_names' to parallel lists taken from the
    'URL', 'Keywords', and 'Display Name' columns respectively, and
    ``count`` is the number of data rows read. If *csv_file* does not
    exist, returns empty lists and 0.
    """
    csv_dict = {
        'urls': [],
        'focus': [],
        'source_names': []
    }

    if not os.path.exists(csv_file):
        return csv_dict, 0

    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            csv_dict['urls'].append(row.get('URL', ''))
            csv_dict['focus'].append(row.get('Keywords', ''))
            csv_dict['source_names'].append(row.get('Display Name', ''))

    return csv_dict, len(csv_dict['urls'])


def getMarkdownGitHubURLsFromPage(url):
Expand Down Expand Up @@ -707,9 +834,22 @@ def main():
ensure_intrinsic_chunks_from_s3()

# Argparse inputs
parser = argparse.ArgumentParser(description="Turn a Learning Path URL into suburls in GitHub")
parser.add_argument("csv_file", help="Path to the CSV file that lists all Learning Paths to chunk.")
parser = argparse.ArgumentParser(
description="Generate text chunks from Arm documentation sources for vector database ingestion. "
"Discovers learning paths, install guides, and ecosystem dashboard entries, "
"then updates the sources CSV with any new entries found."
)
parser.add_argument(
"sources_file",
help="Path to vector-db-sources.csv. This file is read for existing sources "
"(to avoid duplicates) and WILL BE OVERWRITTEN with the combined list "
"of existing + newly discovered sources."
)
args = parser.parse_args()
sources_file = args.sources_file

# Load existing sources from vector-db-sources.csv (for deduplication)
load_existing_sources(sources_file)

# 0) Initialize files
os.makedirs(yaml_dir, exist_ok=True) # create if doesn't exist
Expand All @@ -729,9 +869,9 @@ def main():
#createIntrinsicsDatabaseChunks()

# 1) Get URLs and details from CSV
csv_dict, csv_length = readInCSV(args.csv_file)
csv_dict, csv_length = readInCSV(sources_file)

print(f'Starting to loop over CSV file {args.csv_file} ......')
print(f'Starting to loop over CSV file {sources_file} ......')
for i in range(csv_length):
url = csv_dict['urls'][i]
source_name = csv_dict['source_names'][i]
Expand Down Expand Up @@ -759,6 +899,10 @@ def main():
chunk = createChunk(text_snippet, WEBSITE_url, keywords, source_name)
chunkSaveAndTrack(url,chunk)

# Save updated sources CSV with all discovered sources
save_sources_csv(sources_file)
print(f"\n=== Source tracking complete ===")
print(f"Total sources in {sources_file}: {len(all_sources)}")


if __name__ == "__main__":
Expand Down
13 changes: 13 additions & 0 deletions embedding-generation/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright © 2026, Arm Limited and Contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading
Loading