# Create Vector DBs

In [8]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

import pandas as pd
import re
from uuid import uuid4

In [9]:
import os
print("Current working directory:", os.getcwd())


Current working directory: /home/anra240/URLupload


In [10]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

In [11]:
def load_urls_from_csv_files(csv_paths, column_name="Permalink"):
    all_urls = []
    for path in csv_paths:
        df = pd.read_csv(path)
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' not found in {path}")
        urls = df[column_name].dropna().tolist()
        all_urls.extend(urls)
    return all_urls

def load_pages_from_url_list(url_list):
    loader = WebBaseLoader(url_list)
    docs = loader.load()
    for doc in docs:
        doc.page_content = re.sub(r"\n{3,}", "\n\n", doc.page_content)
    return docs

def create_vectorstore(documents, database_loc, embedding="all-mpnet-base-v2"):
    embedding_model = HuggingFaceEmbeddings(model_name=embedding)
    vector_store = Chroma(embedding_function=embedding_model,
                          persist_directory=database_loc)
    uuids = [str(uuid4()) for _ in range(len(documents))]
    vector_store.add_documents(documents, ids=uuids)
    print(f"All documents have been processed and stored at {database_loc}.")

## Create a Knowledge Base DB

In [12]:
kb_db_loc = "Vectorstore/kb_only"

In [13]:
# Load URLs from the Article CSV export
kb_csv_files = ["/home/anra240/URLupload/Article-Export-2025-Apr-29-181530.csv"]
kb_urls = load_urls_from_csv_files(kb_csv_files, column_name="Permalink")

# Load documents from URLs
kb_pages = load_pages_from_url_list(kb_urls)

# Deduplicate documents
source_urls = set()
titles = set()
kb_deduped = []

for doc in kb_pages:
    src = doc.metadata.get('source')
    title = doc.metadata.get('title')
    if "Page not found" not in title and "Log In" not in title:
        kb_deduped.append(doc)
        print(f"adding {src}: {title}")
    else:
        print(f"Found duplicates: {src}: {title}")

print(len(kb_deduped))
create_vectorstore(kb_deduped, kb_db_loc, embedding="all-mpnet-base-v2")

Found duplicates: https://learn.fabric-testbed.net/?post_type=ht_kb&p=97: Page not found – FABRIC Knowledge Base
Found duplicates: https://learn.fabric-testbed.net/?post_type=ht_kb&p=103: Page not found – FABRIC Knowledge Base
Found duplicates: https://learn.fabric-testbed.net/?post_type=ht_kb&p=112: Page not found – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/knowledge-base/fabrictestbed-slice_manager/: Slice Manager – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/knowledge-base/slice-editor/: Slice Editor – FABRIC Knowledge Base
Found duplicates: https://learn.fabric-testbed.net/?post_type=ht_kb&p=197: Page not found – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/knowledge-base/fabric-testbed-release-1-0/: FABRIC Testbed Release 1.0 – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/knowledge-base/quick-start-guide/: Quick Start Guide – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/knowledge-base/install-

## Create a Forum DB

In [14]:
forum_db_loc = "Vectorstore/forum_only"

In [15]:
# Load URLs from the Forum CSV export
forum_csv_files = ["/home/anra240/URLupload/Topic-Export-2025-Apr-29-182021.csv"]
forum_urls = load_urls_from_csv_files(forum_csv_files, column_name="Permalink")

# Load documents from URLs
forum_pages = load_pages_from_url_list(forum_urls)

# Deduplicate documents
source_urls = set()
titles = set()
forum_deduped = []

for doc in forum_pages:
    src = doc.metadata.get('source')
    title = doc.metadata.get('title')
    if "Page not found" not in title and "Log In" not in title:
        forum_deduped.append(doc)
        print(f"adding {src}: {title}")
    else:
        print(f"Found duplicates: {src}: {title}")

print(len(forum_deduped))
create_vectorstore(forum_deduped, forum_db_loc, embedding="all-mpnet-base-v2")

adding https://learn.fabric-testbed.net/forums/topic/ai-ml-security-contests/: AI/ML security contests – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/map-gui-notebooks/: Map/GUI Notebooks – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/question-about-resource-map-notebook/: Question about Resource Map notebook – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/questions-about-node-ports-and-bandwidth-between-sites/: Questions about node ports and bandwidth between sites – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/important-note-fabric-is-open-for-beta-testing-only/: IMPORTANT NOTE: FABRIC is open for beta testing only – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/maps-visual-slice-editor/: Maps: Visual Slice Editor – FABRIC Knowledge Base
adding https://learn.fabric-testbed.net/forums/topic/maps-display-existing-resources/: Maps: Display E

## Create a combined DB

In [16]:
kb_csv_files = ["/home/anra240/URLupload/Article-Export-2025-Apr-29-181530.csv"]
forum_csv_files = ["/home/anra240/URLupload/Topic-Export-2025-Apr-29-182021.csv"]

kb_urls = load_urls_from_csv_files(kb_csv_files, column_name="Permalink")
forum_urls = load_urls_from_csv_files(forum_csv_files, column_name="Permalink")

kb_pages = load_pages_from_url_list(kb_urls)
forum_pages = load_pages_from_url_list(forum_urls)

def filter_docs(pages):
    return [doc for doc in pages if "Page not found" not in doc.metadata.get("title", "") and "Log In" not in doc.metadata.get("title", "")]

kb_deduped = filter_docs(kb_pages)
forum_deduped = filter_docs(forum_pages)

combined_docs = kb_deduped + forum_deduped
combined_db_loc = "Vectorstore/kb_forum_combined"

create_vectorstore(combined_docs, combined_db_loc, embedding="all-mpnet-base-v2")

All documents have been processed and stored at Vectorstore/kb_forum_combined.
