In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import csv
import os

# Pre-process

In [None]:
def raw2cache(raw_path, cache_path):
    """Convert a raw one-field-per-line dump into a CSV cache sorted by name length.

    The raw file is read as UTF-8. Every line is stripped and blank lines are
    dropped; each run of six remaining lines becomes one record whose columns
    are, in order, the entries of ``headers`` below. The records are written
    to ``cache_path`` as CSV (header row first), ordered by ascending length
    of the "Tên thủ tục" column. The sort is stable, so records whose names
    have equal length keep their input order.

    Args:
        raw_path: Path of the raw text file to read.
        cache_path: Path of the CSV file to create or overwrite.

    Notes:
        * An empty (or all-blank) raw file yields a cache containing only the
          header row.
        * If the number of non-blank lines is not a multiple of six, the final
          short record is padded with empty strings so every row has six
          columns.
    """
    headers = ["STT", "Mã chuẩn", "Tên thủ tục", "Lĩnh vực", "Cơ quan thực hiện", "Mức độ"]
    width = len(headers)
    name_col = headers.index("Tên thủ tục")

    # Step 1: collect the non-blank, stripped lines and group them into records.
    with open(raw_path, 'r', encoding='utf-8') as f_raw:
        stripped = (line.strip() for line in f_raw)
        fields = [text for text in stripped if text]
    records = [fields[i:i + width] for i in range(0, len(fields), width)]

    # Only the last chunk can be short; pad it so the sort key below always
    # has a string to measure and the CSV stays rectangular.
    if records and len(records[-1]) < width:
        records[-1].extend([""] * (width - len(records[-1])))

    # Step 2: order by name length in memory and write the cache exactly once.
    records.sort(key=lambda record: len(record[name_col]))
    with open(cache_path, 'w', newline='', encoding='utf-8') as f_cache:
        writer = csv.writer(f_cache)
        writer.writerow(headers)
        writer.writerows(records)

# Rebuild the CSV cache from the raw dump before anything reads it.
raw2cache(raw_path="url/raw", cache_path="url/cache")

In [None]:
# Instantiate the two sentence-embedding models whose .encode() is used later
# in this notebook to vectorize the procedure names.
# NOTE(review): the identifiers look like Hugging Face Hub model ids, so the
# first run presumably downloads weights — confirm network/cache expectations.
model_e5 = SentenceTransformer("onelevelstudio/M-E5-BASE")
model_mpnet = SentenceTransformer("onelevelstudio/M-MPNET-BASE")

In [None]:
# Read the cached CSV back as a list of dict rows keyed by the header row.
with open('url/cache', mode='r', newline='', encoding='utf-8') as f:
    thutucs = [*csv.DictReader(f)]

# Procedure names only, in the same order as the cache rows.
tenthutucs = [thutuc["Tên thủ tục"] for thutuc in thutucs]

In [None]:
# Option 1: Re-vectorize embeddings
# Encode every procedure name with both models and persist the results.
embs_e5    = model_e5.encode(tenthutucs)
embs_mpnet = model_mpnet.encode(tenthutucs)
# np.save appends ".npy" when given a path without that suffix, so the files
# land at "url/embs_e5.npy" / "url/embs_mpnet.npy" first ...
np.save("url/embs_e5", embs_e5)
np.save("url/embs_mpnet", embs_mpnet)
# ... and are then moved to the extension-less names that Option 2 loads.
# os.replace overwrites an existing destination on every platform; os.rename
# raises FileExistsError on Windows when the target already exists, which
# broke re-running this cell there.
os.replace("url/embs_e5.npy", "url/embs_e5")
os.replace("url/embs_mpnet.npy", "url/embs_mpnet")

# Option 2: Load pre-vectorized embeddings
# Reads back the files written by Option 1 (from this run or an earlier one).
embs_e5 = np.load("url/embs_e5")
embs_mpnet = np.load("url/embs_mpnet")

# Main Process