Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/embedding-unit-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# GitHub Actions workflow: unit tests for the embedding-generation component.
name: Embedding Generation Unit Tests

# Trigger on pushes to main and on pull requests targeting main, but only
# when embedding-generation code (or this workflow file itself) changes.
on:
  push:
    branches: [main]
    paths:
      - 'embedding-generation/**'
      - '.github/workflows/embedding-unit-tests.yml'
  pull_request:
    branches: [main]
    paths:
      - 'embedding-generation/**'
      - '.github/workflows/embedding-unit-tests.yml'

jobs:
  test:
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every run step executes from the component directory, so relative
        # paths (requirements.txt, tests/) resolve correctly.
        working-directory: embedding-generation

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest

      - name: Run unit tests
        # -v: list each test by name; --tb=short: compact tracebacks on failure.
        run: python -m pytest tests/ -v --tb=short
4 changes: 2 additions & 2 deletions embedding-generation/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ WORKDIR /embedding-data
# Copy Python scripts and dependencies
COPY generate-chunks.py .
COPY local_vectorstore_creation.py .
COPY urls-to-chunk.csv .
COPY vector-db-sources.csv .
COPY requirements.txt .

# Copy intrinsic chunks data from the cached base image
Expand All @@ -43,7 +43,7 @@ COPY --from=intrinsic-chunks /embedding-data/intrinsic_chunks ./intrinsic_chunks
RUN pip3 install --no-cache-dir --break-system-packages -r requirements.txt

# Generate vector database
RUN python3 generate-chunks.py urls-to-chunk.csv && \
RUN python3 generate-chunks.py vector-db-sources.csv && \
python3 local_vectorstore_creation.py && \
rm -f embeddings_*.txt

Expand Down
198 changes: 171 additions & 27 deletions embedding-generation/generate-chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,22 +96,103 @@ def ensure_intrinsic_chunks_from_s3(local_folder='intrinsic_chunks',
# Global list guarding against duplicate entries produced by cross-platform
# learning paths (the same learning path can appear under several platforms).
cross_platform_lps_dont_duplicate = []

# Global tracking state for vector-db-sources.csv.
# Set of URLs already present in the CSV, used for fast deduplication checks.
known_source_urls = set()
# All source entries, both pre-existing and newly discovered during this run.
# Each entry is a dict: {site_name, license_type, display_name, url, keywords}
all_sources = []

# Raise the csv module's field size limit (default is 131,072 bytes) so very
# large chunk fields don't raise an error when parsed.
csv.field_size_limit(10**9)  # 1,000,000,000 — well within 64-bit range, avoids Python OverflowError


def load_existing_sources(csv_file):
    """Populate the module-level source registry from an existing CSV file.

    Resets ``known_source_urls`` and ``all_sources``, then reads each row of
    *csv_file* (if it exists), recording every row with a non-empty URL so
    later discovery can skip duplicates.
    """
    global known_source_urls, all_sources

    known_source_urls = set()
    all_sources = []

    if not os.path.exists(csv_file):
        print(f"Sources file '{csv_file}' does not exist. Starting fresh.")
        return

    with open(csv_file, 'r', newline='', encoding='utf-8') as fh:
        for record in csv.DictReader(fh):
            source_url = record.get('URL', '').strip()
            if not source_url:
                continue  # rows without a URL cannot be deduplicated; skip them
            known_source_urls.add(source_url)
            all_sources.append({
                'site_name': record.get('Site Name', ''),
                'license_type': record.get('License Type', ''),
                'display_name': record.get('Display Name', ''),
                'url': source_url,
                'keywords': record.get('Keywords', '')
            })

    print(f"Loaded {len(all_sources)} existing sources from '{csv_file}'")


def register_source(site_name, license_type, display_name, url, keywords):
    """Record a source URL in the global registry, skipping duplicates.

    *keywords* may be a pre-formatted string or an iterable of strings that
    will be joined with '; '. Returns True when the URL was newly added,
    False when it was already known.
    """
    global known_source_urls, all_sources

    # Strip surrounding whitespace so equivalent URLs compare equal.
    url = url.strip()
    if url in known_source_urls:
        return False

    keyword_str = keywords if isinstance(keywords, str) else '; '.join(keywords)
    known_source_urls.add(url)
    all_sources.append({
        'site_name': site_name,
        'license_type': license_type,
        'display_name': display_name,
        'url': url,
        'keywords': keyword_str
    })
    print(f"[NEW SOURCE] {display_name}: {url}")
    return True


def save_sources_csv(csv_file):
    """Persist the full source registry (existing + newly discovered) to *csv_file*."""
    header = ['Site Name', 'License Type', 'Display Name', 'URL', 'Keywords']
    # Column order in the output file mirrors the header above.
    field_order = ('site_name', 'license_type', 'display_name', 'url', 'keywords')

    with open(csv_file, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh)
        writer.writerow(header)
        writer.writerows(
            [entry[field] for field in field_order] for entry in all_sources
        )

    print(f"Saved {len(all_sources)} sources to '{csv_file}'")

class Chunk:
def __init__(self, title, url, uuid, keywords, content):
    """Create a chunk of site content destined for the vector database.

    title    -- human-readable chunk title
    url      -- canonical URL the chunk was extracted from
    uuid     -- unique identifier string for this chunk
    keywords -- list of keyword strings (normalized by formatKeywords)
    content  -- the chunk's text body
    """
    self.title = title
    self.url = url
    self.uuid = uuid
    self.content = content

    # Translate keyword list into a comma-separated, lowercase string.
    self.keywords = self.formatKeywords(keywords)


def formatKeywords(self,keywords):
return ', '.join(keywords).lower().strip()
def formatKeywords(self, keywords):
    """Normalize a keyword list into one lowercase, comma-separated string."""
    stripped = [word.strip() for word in keywords]
    return ', '.join(stripped).lower()

# Used to dump into a yaml file without difficulty
def toDict(self):
Expand Down Expand Up @@ -180,9 +261,20 @@ def createTextSnippet(main_row):
keywords.append(c.replace('tag-license-','').replace('tag-category-',''))


package_url = f"{url}?package={package_name_urlized}"

# Register this ecosystem dashboard entry as a source
register_source(
site_name='Ecosystem Dashboard',
license_type='Arm Proprietary',
display_name=f'Ecosystem Dashboard - {package_name}',
url=package_url,
keywords=keywords
)

chunk = Chunk(
title = f"Ecosystem Dashboard - {package_name}",
url = f"{url}?package={package_name_urlized}",
url = package_url,
uuid = str(uuid.uuid4()),
keywords = keywords,
content = text_snippet
Expand Down Expand Up @@ -352,6 +444,27 @@ def chunkizeLearningPath(relative_url, title, keywords):

response = http_session.get(url, timeout=60)
soup = BeautifulSoup(response.text, 'html.parser')

# Get learning path title and keywords once for registration
lp_title_elem = soup.find(id='learning-path-title')
if lp_title_elem:
lp_title = lp_title_elem.get_text()
ads_tags = soup.findAll('ads-tag')
lp_keywords = []
for tag in ads_tags:
keyword = tag.get_text().strip()
if keyword not in lp_keywords:
lp_keywords.append(keyword)

# Register this learning path as a source
register_source(
site_name='Learning Paths',
license_type='CC4.0',
display_name=f'Learning Path - {lp_title}',
url=url,
keywords=lp_keywords
)

for link in soup.find_all(class_='inner-learning-path-navbar-element'):
#Ignore mobile links
if 'content-individual-a-mobile' not in link.get('class', []):
Expand Down Expand Up @@ -392,11 +505,24 @@ def chunkizeLearningPath(relative_url, title, keywords):
ig_soup = BeautifulSoup(ig_response.text, 'html.parser')

# obtain title of Install Guide
title = 'Install Guide - '+ ig_soup.find(id='install-guide-title').get_text()
ig_title_elem = ig_soup.find(id='install-guide-title')
if not ig_title_elem:
continue
ig_title = ig_title_elem.get_text()
title = 'Install Guide - '+ ig_title


# Obtain keywords of learning path
keywords = [ig_soup.find(id='install-guide-title').get_text(), 'install','build', 'download']
keywords = [ig_title, 'install','build', 'download']

# Register this install guide as a source
register_source(
site_name='Install Guides',
license_type='CC4.0',
display_name=title,
url=ig_url,
keywords=keywords
)

# Processing to check for multi-install
multi_install_guides = ig_soup.find_all(class_='multi-install-card')
Expand Down Expand Up @@ -447,27 +573,28 @@ def createLearningPathChunks():


def readInCSV(csv_file):
    """Read the sources CSV file and return its columns for processing.

    Uses csv.DictReader so quoted fields containing commas are parsed
    correctly (the naive ``line.split(',')`` approach broke on them).

    Returns a tuple ``(csv_dict, count)`` where ``csv_dict`` maps
    'urls', 'focus', and 'source_names' to parallel lists taken from the
    'URL', 'Keywords', and 'Display Name' columns respectively, and
    ``count`` is the number of data rows read. If *csv_file* does not
    exist, returns empty lists and 0.
    """
    csv_dict = {
        'urls': [],
        'focus': [],
        'source_names': []
    }

    if not os.path.exists(csv_file):
        return csv_dict, 0

    with open(csv_file, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            csv_dict['urls'].append(row.get('URL', ''))
            csv_dict['focus'].append(row.get('Keywords', ''))
            csv_dict['source_names'].append(row.get('Display Name', ''))

    return csv_dict, len(csv_dict['urls'])


def getMarkdownGitHubURLsFromPage(url):
Expand Down Expand Up @@ -707,9 +834,22 @@ def main():
ensure_intrinsic_chunks_from_s3()

# Argparse inputs
parser = argparse.ArgumentParser(description="Turn a Learning Path URL into suburls in GitHub")
parser.add_argument("csv_file", help="Path to the CSV file that lists all Learning Paths to chunk.")
parser = argparse.ArgumentParser(
description="Generate text chunks from Arm documentation sources for vector database ingestion. "
"Discovers learning paths, install guides, and ecosystem dashboard entries, "
"then updates the sources CSV with any new entries found."
)
parser.add_argument(
"sources_file",
help="Path to vector-db-sources.csv. This file is read for existing sources "
"(to avoid duplicates) and WILL BE OVERWRITTEN with the combined list "
"of existing + newly discovered sources."
)
args = parser.parse_args()
sources_file = args.sources_file

# Load existing sources from vector-db-sources.csv (for deduplication)
load_existing_sources(sources_file)

# 0) Initialize files
os.makedirs(yaml_dir, exist_ok=True) # create if doesn't exist
Expand All @@ -729,9 +869,9 @@ def main():
#createIntrinsicsDatabaseChunks()

# 1) Get URLs and details from CSV
csv_dict, csv_length = readInCSV(args.csv_file)
csv_dict, csv_length = readInCSV(sources_file)

print(f'Starting to loop over CSV file {args.csv_file} ......')
print(f'Starting to loop over CSV file {sources_file} ......')
for i in range(csv_length):
url = csv_dict['urls'][i]
source_name = csv_dict['source_names'][i]
Expand Down Expand Up @@ -759,6 +899,10 @@ def main():
chunk = createChunk(text_snippet, WEBSITE_url, keywords, source_name)
chunkSaveAndTrack(url,chunk)

# Save updated sources CSV with all discovered sources
save_sources_csv(sources_file)
print(f"\n=== Source tracking complete ===")
print(f"Total sources in {sources_file}: {len(all_sources)}")


if __name__ == "__main__":
Expand Down
13 changes: 13 additions & 0 deletions embedding-generation/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright © 2026, Arm Limited and Contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading
Loading