# 1. raw -> cache

In [1]:
import csv

# Step 1
def extract_and_save_to_csv(input_file, output_file):
    """Convert a one-value-per-line text dump into a CSV file.

    Blank (whitespace-only) lines in ``input_file`` are dropped and the
    remaining lines are stripped and grouped, in order, into records of
    ``len(headers)`` values each. The records are written to ``output_file``
    below a fixed header row.

    A trailing record with fewer values than there are columns is padded with
    empty strings so that every CSV row has the same width; otherwise
    ``csv.DictReader`` would hand back ``None`` for the missing columns.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 CSV file to create or overwrite.
    """
    headers = ["STT", "Mã chuẩn", "Tên thủ tục", "Lĩnh vực", "Cơ quan thực hiện", "Mức độ"]
    # One record per header-sized run of lines (instead of a bare magic 6).
    record_size = len(headers)

    with open(input_file, 'r', encoding='utf-8') as file:
        # Strip each line once and keep only the non-empty results.
        lines = [text for text in (line.strip() for line in file) if text]

    items = [lines[i:i + record_size] for i in range(0, len(lines), record_size)]
    if items and len(items[-1]) < record_size:
        # Only the last chunk can be short; pad it to a full-width row.
        items[-1].extend([""] * (record_size - len(items[-1])))

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        writer.writerows(items)
extract_and_save_to_csv("raw", "cache")

# Step 2: rewrite the cache with its rows sorted by ascending length of the
# "Tên thủ tục" column.
with open('cache', mode='r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    # Take the column names from the header row rather than from the first
    # data row, so a cache that holds only the header does not raise IndexError.
    fieldnames = reader.fieldnames
    # `or ""`: DictReader fills columns missing from a short row with None,
    # which len() cannot handle.
    thutucs = sorted(reader, key=lambda e: len(e["Tên thủ tục"] or ""))
with open('cache', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(thutucs)

# 2. cache -> vectors

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import csv
import os

# Multilingual sentence-embedding model shared by the encoding and
# similarity cells further down.
model_1 = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Re-read the cached rows; the "Tên thủ tục" column supplies the texts to embed.
with open('cache', mode='r', newline='', encoding='utf-8') as f:
    thutucs = [row for row in csv.DictReader(f)]
sentences = list(map(lambda row: row["Tên thủ tục"], thutucs))

### 2.1. Pre-calculate embeddings

In [None]:
vectors_1 = model_1.encode(sentences)
# Save through an open binary handle: np.save appends ".npy" only when given a
# string/path target, so this writes the extension-less "vectors" file
# directly. It also removes the os.rename step, which raises on Windows when
# "vectors" already exists (i.e. whenever this cell is re-run).
with open("vectors", "wb") as vectors_file:
    np.save(vectors_file, vectors_1)

### 2.2. Load and use embeddings

In [None]:
# Load the pre-computed embeddings from the extension-less "vectors" file in
# the current working directory.
vectors_1 = np.load("vectors")

In [None]:
# Sample user questions used to eyeball retrieval quality.
test_questions = [
"sắp khởi nghiệp cần giấy tờ gì?",
"tôi muốn mua đất thì cần làm gì?",
"tôi sắp cưới vợ thì phải làm như nào?",
"tôi sắp lập gia đình thì cần làm gì?",
"vợ tôi sắp sinh con thủ tục nào?",
"thủ tục xây nhà cấp 3, 4?",
"phúc khảo bài thi thpt?",
]

TOP_K = 5  # number of best-matching sentences printed per question

for question in test_questions:
    print("-"*100)
    print(f"> {question}")
    q_emb = model_1.encode(question)
    # Row 0 holds the scores of the single query against every stored vector.
    similarities = model_1.similarity(q_emb, vectors_1)[0]
    # Ask the tensor for its largest entries (returned in descending order)
    # instead of sorting every index in Python with tensor-valued keys.
    # NOTE(review): relies on `similarity` returning a torch tensor - confirm
    # for the installed sentence-transformers version.
    top_idx = similarities.topk(min(TOP_K, len(similarities))).indices.tolist()
    for idx in top_idx:
        print(sentences[idx])