In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import csv
import os

# The two sentence-embedding models whose retrieval results are compared below.
model_1 = SentenceTransformer("intfloat/multilingual-e5-base")
model_2 = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Materialise every CSV row while the file is still open: DictReader is lazy,
# so it must be fully consumed before the `with` block closes the handle.
with open('cache', mode='r', newline='', encoding='utf-8') as csv_file:
    reader = csv.DictReader(csv_file)
    thutucs = [row for row in reader]

# The procedure-name column is the text that gets embedded and searched.
sentences = []
for thutuc in thutucs:
    sentences.append(thutuc["Tên thủ tục"])

ValueError: I/O operation on closed file.

# Pre-calculate embeddings

In [None]:
# Embed every procedure name once with each model so later cells can reuse the
# vectors instead of re-encoding the whole corpus.
# NOTE(review): the E5 model family is documented as expecting "query: " /
# "passage: " prefixes on its inputs; none are added here -- confirm against
# the model card whether that is intended.
vectors_1 = model_1.encode(sentences)
vectors_2 = model_2.encode(sentences)

# np.save appends ".npy" when given a string path but writes to an open file
# object as-is. Saving through a handle therefore produces the extension-less
# "vectors_1" / "vectors_2" files directly, with no follow-up os.rename (which
# raises FileExistsError on Windows when the target already exists, breaking
# re-runs of this cell).
for path, vectors in (("vectors_1", vectors_1), ("vectors_2", vectors_2)):
    with open(path, "wb") as out_file:
        np.save(out_file, vectors)

# Load and use embeddings

In [None]:
# Reload the cached embedding matrices from the extension-less files
# "vectors_1" and "vectors_2" in the working directory.
vectors_1, vectors_2 = (np.load(path) for path in ("vectors_1", "vectors_2"))

In [None]:
# Free-form Vietnamese questions used to eyeball retrieval quality.
test_questions = [
    "sắp khởi nghiệp cần giấy tờ gì?",
    "tôi muốn mua đất thì cần làm gì?",
    "tôi sắp cưới vợ thì phải làm như nào?",
    "tôi sắp lập gia đình thì cần làm gì?",
    "vợ tôi sắp sinh con thủ tục nào?",
    "thủ tục xây nhà cấp 3, 4?",
    "phúc khảo bài thi thpt?",
]


def print_top_matches(model, corpus_vectors, questions, corpus_sentences, top_k=5):
    """Print the `top_k` corpus sentences most similar to each question.

    Args:
        model: Object exposing `encode(text)` and `similarity(a, b)`; the
            models created in the setup cell are used here.
        corpus_vectors: Pre-computed embeddings, one per entry of
            `corpus_sentences` and in the same order.
        corpus_sentences: Texts to print for the best-scoring indices.
        questions: Iterable of query strings.
        top_k: Maximum number of matches printed per question.
    """
    for question in questions:
        print("-"*100)
        print(f"> {question}")
        q_emb = model.encode(question)
        # similarity() yields a (1, n_corpus) score matrix for a single
        # query; row 0 holds that query's score against every sentence.
        scores = model.similarity(q_emb, corpus_vectors)[0]
        # Clamp k: topk raises if asked for more items than exist.
        k = min(top_k, len(scores))
        # topk returns indices ordered from highest to lowest score.
        for idx in scores.topk(k).indices.tolist():
            print(corpus_sentences[idx])


# Same questions against both models, separated by a "=" rule for comparison.
print_top_matches(model_1, vectors_1, test_questions, sentences)

print("="*100)

print_top_matches(model_2, vectors_2, test_questions, sentences)