''' 
This script runs on Google Colab and performs the following tasks:
1. Load and preprocess Vietnamese legal documents in HTML format.
2. Use BeautifulSoup for parsing since UnstructuredHTMLLoader and BSHTMLLoader did not return desired results.
3. Tokenize Vietnamese text using PyVi's ViTokenizer because a single Vietnamese word can span several
   space-separated syllables, and the bkai bi-encoder requires word-segmented input before embedding.
4. Embed documents using HuggingFace's Vietnamese Bi-Encoder model.
5. Store and persist embeddings in ChromaDB, saving the database to Google Drive.
'''

In [None]:
# Install required libraries (Colab/IPython shell commands; `-q` = quiet, `-U` = upgrade)
# langchain-huggingface provides the HuggingFaceEmbeddings wrapper imported in this notebook.
!pip install -qU langchain-huggingface
# langchain-chroma provides the Chroma vector store; require at least version 0.1.2.
!pip install -qU "langchain-chroma>=0.1.2"
# pyvi provides ViTokenizer, used for Vietnamese word segmentation.
!pip install pyvi
# NOTE(review): `langchain` and `bs4` are imported by this notebook but not installed here --
# presumably they are preinstalled on the Colab image; confirm before running elsewhere.

In [None]:
# Import necessary modules
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from pyvi import ViTokenizer
import os
from bs4 import BeautifulSoup
import re
from langchain.schema import Document

In [None]:
# Define folder path containing HTML files
folder_path = "/content/vbpl"
# os.listdir() returns entries in arbitrary order; sort the names so documents
# (and therefore chunks) are produced in the same order on every run.
html_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".html"))

documents = []  # Parsed documents, one Document per HTML file

# Process each HTML file: parse -> plain text -> collapse newlines -> word-segment
for file_name in html_files:
    file_path = os.path.join(folder_path, file_name)

    # Only parsing needs the open handle, so the file is closed before the
    # (slower) text clean-up and tokenization steps run.
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")  # Parse HTML content

    text_content = soup.get_text()  # Extract plain text
    text_content = re.sub(r'\n+', '\n', text_content)  # Collapse runs of newlines into one

    # According to bkai-foundation-models/vietnamese-bi-encoder, the input text
    # has to be word-segmented before embedding, hence the ViTokenizer pass.
    text_content = ViTokenizer.tokenize(text_content)

    # Create a Document object; the source path is kept as metadata so a chunk
    # can be traced back to the file it came from.
    doc = Document(page_content=text_content, metadata={"file_path": file_path})
    documents.append(doc)


# Split documents into smaller chunks for embedding.
# chunk_size / chunk_overlap are measured in characters, not model tokens.
# NOTE(review): the embedding model presumably has a much smaller maximum input
# length than 2000 characters of text, in which case the tail of each chunk
# would be truncated at embedding time -- confirm against the model card and
# consider lowering chunk_size.
splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)
split_docs = splitter.split_documents(documents)

In [None]:
# Directory on the Colab VM that Chroma uses as its persistence location
chroma_db_path = "/content/vectordb"

# Vietnamese bi-encoder that turns each (word-segmented) chunk into a vector
embeddings = HuggingFaceEmbeddings(
    model_name='bkai-foundation-models/vietnamese-bi-encoder',
)

# Create the Chroma vector store backed by that directory, then embed and
# insert every chunk produced by the splitter
vector_db = Chroma(
    embedding_function=embeddings,
    persist_directory=chroma_db_path,
)
vector_db.add_documents(split_docs)

In [None]:
# Mount Google Drive to save the database
from google.colab import drive
drive.mount('/content/drive')

# Move the zipped database to Google Drive
!zip -r vectordb.zip vectordb
!mv vectordb.zip /content/drive/MyDrive/