# Create Vector DBs

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document

import pandas as pd
import re
from uuid import uuid4

In [None]:
import os
# Show the working directory: the vector store locations used in this
# notebook ("Vectorstore/...") are relative paths and resolve against it.
print("Current working directory:", os.getcwd())


In [None]:
def bs4_extractor(html: str) -> str:
    """Return the text of an HTML document with blank-line runs collapsed.

    Args:
        html: Raw HTML markup.

    Returns:
        The parsed document's text in which every run of two or more
        newlines is reduced to exactly two, with leading and trailing
        whitespace stripped.
    """
    # Imported locally because the notebook's import cell never brings
    # BeautifulSoup into scope; without this the call raises NameError.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

In [None]:
def load_urls_from_csv_files(csv_paths, column_name="Permalink"):
    """Gather the non-null values of one column across several CSV files.

    Args:
        csv_paths: Iterable of CSV file paths, read in the given order.
        column_name: Header of the column holding the URLs.

    Returns:
        A flat list of the column's values, file by file, with missing
        (NaN) cells dropped.

    Raises:
        ValueError: If a file does not contain ``column_name``.
    """
    collected = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)
        if column_name in frame.columns:
            collected += frame[column_name].dropna().tolist()
            continue
        # Fail on the first file that lacks the column rather than
        # silently returning a partial URL list.
        raise ValueError(f"Column '{column_name}' not found in {csv_path}")
    return collected

def load_pages_from_url_list(url_list):
    """Load every URL with WebBaseLoader and tidy the resulting page text.

    Args:
        url_list: URLs handed to ``WebBaseLoader``.

    Returns:
        The documents returned by the loader, each with runs of three or
        more newlines in ``page_content`` collapsed to two.
    """
    pages = WebBaseLoader(url_list).load()
    blank_run = re.compile(r"\n{3,}")
    for page in pages:
        # Rewritten in place on the loaded document objects.
        page.page_content = blank_run.sub("\n\n", page.page_content)
    return pages

def create_vectorstore(documents, database_loc, embedding="all-mpnet-base-v2"):
    """Embed ``documents`` and add them to a Chroma store at ``database_loc``.

    Args:
        documents: Documents to store; must support ``len()``.
        database_loc: Directory passed to Chroma as ``persist_directory``.
        embedding: Model name passed to ``HuggingFaceEmbeddings``.

    Returns:
        None. A confirmation line is printed once the documents are added.
    """
    store = Chroma(
        embedding_function=HuggingFaceEmbeddings(model_name=embedding),
        persist_directory=database_loc,
    )
    # One fresh random UUID per document is passed as its id.
    doc_count = len(documents)
    doc_ids = [str(uuid4()) for _ in range(doc_count)]
    store.add_documents(documents, ids=doc_ids)
    print(f"All documents have been processed and stored at {database_loc}.")

## Create a Knowledge Base DB

In [None]:
# Directory (relative to the working directory) in which the
# knowledge-base-only Chroma store is persisted.
kb_db_loc = "Vectorstore/kb_only"

In [None]:
# Load URLs from the Article CSV export
kb_csv_files = ["/home/anra240/URLupload/Article-Export-2025-Apr-29-181530.csv"]
kb_urls = load_urls_from_csv_files(kb_csv_files, column_name="Permalink")

# Load documents from URLs
kb_pages = load_pages_from_url_list(kb_urls)

# Drop error/login pages and deduplicate documents by source URL.
# Titles are not used as a dedup key: distinct pages can share a title.
source_urls = set()
kb_deduped = []

for i, doc in enumerate(kb_pages):
    src = doc.metadata.get('source')
    # The metadata may lack a title; fall back to "" so the substring
    # checks below cannot raise TypeError on None.
    title = doc.metadata.get('title') or ""

    if "Page not found" in title or "Log In" in title:
        print(f"Skipping error/login page: {src}: {title}")
        continue
    if src in source_urls:
        print(f"Found duplicates: {src}: {title}")
        continue

    source_urls.add(src)
    # Keep only the url/title metadata. The id is given as a string;
    # create_vectorstore passes its own UUIDs as ids when storing.
    document = Document(page_content=doc.page_content,
                        metadata={"url": src, "title": title}, id=str(i))
    kb_deduped.append(document)
    print(f"adding {src}: {title}")

print(len(kb_deduped))
create_vectorstore(kb_deduped, kb_db_loc, embedding="all-mpnet-base-v2")

## Create a Forum DB

In [None]:
# Directory (relative to the working directory) in which the
# forum-only Chroma store is persisted.
forum_db_loc = "Vectorstore/forum_only"

In [None]:
# Load URLs from the Forum CSV export
forum_csv_files = ["/home/anra240/URLupload/Topic-Export-2025-Apr-29-182021.csv"]
forum_urls = load_urls_from_csv_files(forum_csv_files, column_name="Permalink")

# Load documents from URLs
forum_pages = load_pages_from_url_list(forum_urls)

# Drop error/login pages and deduplicate documents by source URL.
# Titles are not used as a dedup key: distinct topics can share a title.
source_urls = set()
forum_deduped = []

for i, doc in enumerate(forum_pages):
    src = doc.metadata.get('source')
    # The metadata may lack a title; fall back to "" so the substring
    # checks below cannot raise TypeError on None.
    title = doc.metadata.get('title') or ""

    if "Page not found" in title or "Log In" in title:
        print(f"Skipping error/login page: {src}: {title}")
        continue
    if src in source_urls:
        print(f"Found duplicates: {src}: {title}")
        continue

    source_urls.add(src)
    # Keep only the url/title metadata. The id is given as a string;
    # create_vectorstore passes its own UUIDs as ids when storing.
    document = Document(page_content=doc.page_content,
                        metadata={"url": src, "title": title}, id=str(i))
    forum_deduped.append(document)
    print(f"adding {src}: {title}")

print(len(forum_deduped))
create_vectorstore(forum_deduped, forum_db_loc, embedding="all-mpnet-base-v2")

## Create a Combined DB

In [None]:
# CSV exports listing the knowledge-base article URLs and forum topic URLs
kb_csv_files = ["/home/anra240/URLupload/Article-Export-2025-Apr-29-181530.csv"]
forum_csv_files = ["/home/anra240/URLupload/Topic-Export-2025-Apr-29-182021.csv"]

# Read the "Permalink" column of each export
kb_urls = load_urls_from_csv_files(kb_csv_files, column_name="Permalink")
forum_urls = load_urls_from_csv_files(forum_csv_files, column_name="Permalink")

# Fetch the pages again here, so this cell does not depend on the
# KB-only or forum-only cells having been run first.
kb_pages = load_pages_from_url_list(kb_urls)
forum_pages = load_pages_from_url_list(forum_urls)

def filter_docs(pages):
    """Return the pages whose title marks neither an error nor a login page.

    A page is dropped when its ``metadata["title"]`` contains
    "Page not found" or "Log In"; a page with no title entry is kept.
    Input order is preserved.
    """
    blocked_markers = ("Page not found", "Log In")
    kept = []
    for page in pages:
        page_title = page.metadata.get("title", "")
        if any(marker in page_title for marker in blocked_markers):
            continue
        kept.append(page)
    return kept

# Destination directory of the combined store
combined_db_loc = "Vectorstore/kb_forum_combined"

# Drop error/login pages from each source, then merge them with the
# knowledge-base pages first and the forum pages second.
kb_deduped = filter_docs(kb_pages)
forum_deduped = filter_docs(forum_pages)
combined_docs = [*kb_deduped, *forum_deduped]

create_vectorstore(combined_docs, combined_db_loc, embedding="all-mpnet-base-v2")