In [1]:
# import json
# with open("static/DATA.json", mode="r", encoding="utf-8") as f:
#     DATA = json.load(f)
# ldg_names = [e["name"] for e in DATA["DVC_TTHC_LamDong"]["data"]]

In [2]:
# Sample corpus (_TEST_PASSAGES) and sample queries (_TEST_QUERIES) used by the example cells below.
# NOTE(review): the literals look mis-encoded (mojibake) in this export - confirm the notebook is stored as UTF-8.
_TEST_PASSAGES = ["Th·ªß t·ª•c th√†nh l·∫≠p c√¥ng ty t∆∞ nh√¢n", "Th·ªß t·ª•c ƒëƒÉng k√Ω k·∫øt h√¥n", "Th·ªß t·ª•c chuy·ªÉn nh∆∞·ª£ng quy·ªÅn s·ª≠ d·ª•ng ƒë·∫•t", "Th·ªß t·ª•c ƒë·∫•u th·∫ßu ƒë·∫•t x√¢y d·ª±ng", "Th·ªß t·ª•c c·∫•p l·∫°i l√Ω l·ªãch t∆∞ ph√°p", "Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh trung h·ªçc ph·ªï th√¥ng", "Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh trung h·ªçc c∆° s·ªü", "Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh ti·ªÉu h·ªçc", "Th·ªß t·ª•c ƒëƒÉng k√Ω l·∫°i k·∫øt h√¥n", "Th·ªß t·ª•c ƒëƒÉng k√Ω k·∫øt h√¥n c√≥ y·∫øu t·ªë n∆∞·ªõc ngo√†i", "Th·ªß t·ª•c l√†m gi·∫•y khai sinh", "Th·ªß t·ª•c th√†nh l·∫≠p c√¥ng ty tr√°ch nhi·ªám h·ªØu h·∫°n 1 th√†nh vi√™n", "Th·ªß t·ª•c th√†nh l·∫≠p c√¥ng ty tr√°ch nhi·ªám h·ªØu h·∫°n 2 th√†nh vi√™n tr·ªü l√™n", "Th·ªß t·ª•c t·ªë c√°o t·∫°i c·∫•p x√£", "Th·ªß t·ª•c t·ªë c√°o t·∫°i c·∫•p t·ªânh"]
_TEST_QUERIES = ["Chuy·ªÉn Tr∆∞·ªùng", "Chuyen Truong", "Khai Sinh", "Ch√°u mu·ªën chuy·ªÉn tr∆∞·ªùng c·∫•p 3 th√¨ c·∫ßn ph·∫£i l√†m g√¨?", "T√¥i mu·ªën m·ªü c√¥ng ty th√¨ th·ªß t·ª•c g√¨?"]

-----

In [3]:
from huggingface_hub import hf_hub_download as HF_Download
from tokenizers import Tokenizer as STL_Tokenizer
from rank_bm25 import BM25Okapi as BM25_Retriever
import onnxruntime as ort
import numpy as np
import json
import os
from pkg.NLPT.NLPT import Process_NLPT_Tokenize, Process_NLPT_Normalize
# Create the "_hyse" folder up front; the search engines below keep their cache files under it.
os.makedirs("_hyse", exist_ok=True)

def dict2json(dict, jsonpath):
    """Write `dict` to `jsonpath` as pretty-printed UTF-8 JSON (best effort).

    Args:
        dict: JSON-serialisable object to store (the name shadows the builtin
            but is kept because it is part of the signature).
        jsonpath: Destination file path; the file is created or overwritten.

    Returns:
        None. Any failure is reported on stdout instead of being raised.
    """
    # Non-ASCII characters are written verbatim; nested levels use 4 spaces.
    dump_options = {"ensure_ascii": False, "indent": 4}
    try:
        with open(jsonpath, mode="w", encoding="utf-8") as out_file:
            json.dump(dict, out_file, **dump_options)
    except Exception as er:
        print(f"‚ö†Ô∏è dict2json > Error: {er}")

def json2dict(jsonpath):
    """Load a UTF-8 JSON file and return its parsed content (best effort).

    Args:
        jsonpath: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or an empty dict when the file cannot be
        opened or parsed (the error is printed to stdout, not raised).
    """
    try:
        with open(jsonpath, mode="r", encoding="utf-8") as in_file:
            return json.load(in_file)
    except Exception as er:
        print(f"‚ö†Ô∏è json2dict > Error: {er}")
    # Reached only after a failure: fall back to an empty mapping.
    return {}

class SentenceTransformerLite:
    """Lightweight sentence encoder: a `tokenizers` tokenizer plus an ONNX Runtime session.

    Attributes:
        STL_model: `ort.InferenceSession` created from the repo's "onnx/model.onnx".
        STL_tokenizer: tokenizer loaded from the same repo, with padding and
            truncation (512 tokens) enabled.
    """

    # Init: model_path -> model + tokenizer
    def __init__(self, model_path="onelevelstudio/ML-E5-0.3B"):
        """Download the ONNX model and tokenizer of `model_path` and load them.

        Args:
            model_path: Hugging Face Hub repo id.

        Raises:
            ValueError: if the model or the tokenizer cannot be loaded; the
                original exception is attached as `__cause__`.
        """
        try:
            # Model (ONNX)
            # Optional companion file - presumably only present when the export keeps
            # its weights outside model.onnx; a failure here is deliberately ignored.
            # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate.
            try:
                HF_Download(repo_id=model_path, filename="onnx/model.onnx_data")
            except Exception:
                pass
            STL_model = ort.InferenceSession(HF_Download(repo_id=model_path, filename="onnx/model.onnx"))
            # Tokenizer
            STL_tokenizer = STL_Tokenizer.from_pretrained(model_path)
            # NOTE(review): pad_id=1 assumes "<pad>" has id 1 in this tokenizer - confirm per model.
            STL_tokenizer.enable_padding(pad_id=1, pad_token="<pad>")
            STL_tokenizer.enable_truncation(max_length=512)
        except Exception as er:
            raise ValueError(f"‚ö†Ô∏è > SentenceTransformerLite > init > Error: {er}") from er
        # Return
        self.STL_model = STL_model
        self.STL_tokenizer = STL_tokenizer

    # Encode: Text(s) -> Embedding(s)
    def encode(self, inputtexts):
        """Encode one string or a non-empty list of strings into unit-length embeddings.

        Args:
            inputtexts: A `str` (treated as a one-element batch) or a `list` of `str`.

        Returns:
            A NumPy array with one L2-normalised row per input text.

        Raises:
            ValueError: if `inputtexts` is an empty list, or is neither a
                string nor a list of strings.
        """
        # Ensure inputtexts is a list of strings
        if isinstance(inputtexts, list) and all(isinstance(e, str) for e in inputtexts):
            if len(inputtexts) == 0:
                raise ValueError("‚ö†Ô∏è > SentenceTransformerLite > encode > inputtexts = empty list []")
        elif isinstance(inputtexts, str):
            inputtexts = [inputtexts]
        else:
            raise ValueError("‚ö†Ô∏è > SentenceTransformerLite > encode > inputtexts != string or list of strings")
        # Tokenize
        inputs = self.STL_tokenizer.encode_batch(inputtexts, is_pretokenized=False)
        inputs_ids = np.array([e.ids for e in inputs], dtype=np.int64)
        inputs_msk = np.array([e.attention_mask for e in inputs], dtype=np.int64)
        # Encode: the first model output is used as per-token vectors
        # (assumes shape (batch, tokens, dim) - the pooling below relies on it).
        token_vectors = self.STL_model.run(None, {"input_ids": inputs_ids, "attention_mask": inputs_msk})[0]
        # Pooling: mask-weighted mean over the token axis, so padding does not contribute.
        mask = np.expand_dims(inputs_msk, axis=-1)
        token_counts = np.maximum(np.sum(inputs_msk, axis=1, keepdims=True), 1e-9)
        embeddings = np.sum(token_vectors * mask, axis=1) / token_counts
        # Normalize: unit L2 norm per row (guarded against division by zero).
        embeddings = embeddings / np.maximum(np.linalg.norm(embeddings, axis=1, keepdims=True), 1e-9)
        # Return
        return embeddings

In [4]:
class HYSE_EngineSemantic:
    """Embedding-based search engine with an on-disk cache under "_hyse/".

    Documents are encoded with `SentenceTransformerLite`; queries are ranked by
    the dot product between query and document embeddings.

    Example:
        engine_semantic = HYSE_EngineSemantic()
        engine_semantic.update(_TEST_PASSAGES)
        engine_semantic.search(_TEST_QUERIES)
    """

    def __init__(self, name="hyse001_sem1", modelpath="onelevelstudio/ML-E5-0.3B"):
        """Load the encoder and, when a usable cache exists, the cached index.

        Args:
            name: Prefix of the cache files ("_hyse/{name}_docs.json", "_hyse/{name}_embs.npy").
            modelpath: Model repo id passed to `SentenceTransformerLite`.
        """
        self.name = name
        self.modelpath = modelpath
        self.savepath_docs = f"_hyse/{name}_docs.json"
        self.savepath_embs = f"_hyse/{name}_embs.npy"
        self.model = SentenceTransformerLite(modelpath)
        # ----------
        self.docs = []
        self.embs = []
        if os.path.exists(self.savepath_docs) and os.path.exists(self.savepath_embs):
            # json2dict() returns {} on read errors, so never index ["docs"] blindly.
            payload = json2dict(self.savepath_docs)
            cached_docs = payload.get("docs", []) if isinstance(payload, dict) else []
            cached_embs = np.load(self.savepath_embs)
            # Accept the cache only when docs and embeddings line up one-to-one;
            # otherwise start empty so the next update() rebuilds both files.
            if len(cached_docs) > 0 and cached_embs.ndim == 2 and cached_embs.shape[0] == len(cached_docs):
                self.docs = cached_docs
                self.embs = cached_embs

    def update(self, new_docs):
        """Re-index `new_docs` and refresh the cache files; no-op when the documents are unchanged.

        Args:
            new_docs: Documents to index (a list of strings; a single string is
                treated as a one-document corpus). An empty list clears the index.
        """
        if isinstance(new_docs, str):
            new_docs = [new_docs]
        # Private copy: if the caller later mutates its own list, the equality
        # check below must still detect the change.
        new_docs = list(new_docs)
        if self.docs == new_docs:
            return
        # Encode before touching state so a failed encode leaves the old index intact.
        new_embs = self.model.encode(new_docs) if new_docs else []
        self.docs = new_docs
        self.embs = new_embs
        dict2json({"docs": self.docs}, self.savepath_docs)  # Save docs as file
        np.save(self.savepath_embs, self.embs)              # Save embs as file

    def search(self, new_queries, top=5):
        """Return the best-matching documents for each query.

        Args:
            new_queries: A query string or a list of query strings.
            top: Maximum number of hits per query.

        Returns:
            One list per query of {"index", "doc", "score"} dicts, best first;
            "score" is the dot product rounded to 3 decimals. With an empty
            index every query gets an empty list.
        """
        if len(self.docs) == 0:
            n_queries = 1 if isinstance(new_queries, str) else len(new_queries)
            return [[] for _ in range(n_queries)]
        embs_queries = self.model.encode(new_queries)
        # -----
        similarities = embs_queries @ self.embs.T
        limit = min(top, len(self.docs))
        results = []
        for sim in similarities:
            # Stable descending sort: ties keep the original document order.
            # .tolist() yields plain ints so results stay JSON-serialisable.
            order = np.argsort(-sim, kind="stable")[:limit].tolist()
            results.append([{"index": idx, "doc": self.docs[idx], "score": round(float(sim[idx]), 3)} for idx in order])
        # -----
        return results

class HYSE_EngineLexical:
    """BM25 (lexical) search engine with an on-disk cache under "_hyse/".

    Documents and queries are tokenised with `Process_NLPT_Tokenize` and scored
    with `BM25_Retriever` (rank_bm25's `BM25Okapi`).

    Example:
        engine_lexical = HYSE_EngineLexical()
        engine_lexical.update(_TEST_PASSAGES)
        engine_lexical.search(_TEST_QUERIES)
    """

    def __init__(self, name="hyse001_lex1"):
        """Restore the cached index when a usable cache exists.

        Args:
            name: Prefix of the cache files ("_hyse/{name}_docs.json", "_hyse/{name}_embs.json").
        """
        self.name = name
        self.savepath_docs = f"_hyse/{name}_docs.json"
        self.savepath_embs = f"_hyse/{name}_embs.json"
        # ----------
        self.docs = []
        self.embs = []      # tokenised documents (one token list per doc)
        self.model = None   # BM25 index; None while nothing is indexed
        if os.path.exists(self.savepath_docs) and os.path.exists(self.savepath_embs):
            # json2dict() returns {} on read errors, so never index the keys blindly.
            payload_docs = json2dict(self.savepath_docs)
            payload_embs = json2dict(self.savepath_embs)
            cached_docs = payload_docs.get("docs", []) if isinstance(payload_docs, dict) else []
            cached_embs = payload_embs.get("embs", []) if isinstance(payload_embs, dict) else []
            # Accept the cache only when it is non-empty and consistent
            # (an empty corpus cannot be indexed by BM25 - see update()).
            if len(cached_docs) > 0 and len(cached_docs) == len(cached_embs):
                self.docs = cached_docs
                self.embs = cached_embs
                self.model = BM25_Retriever(self.embs)

    def update(self, new_docs):
        """Re-index `new_docs` and refresh the cache files; no-op when the documents are unchanged.

        Args:
            new_docs: Documents to index (a list of strings; a single string is
                treated as a one-document corpus). An empty list clears the index.
        """
        if isinstance(new_docs, str):
            new_docs = [new_docs]
        # Private copy: if the caller later mutates its own list, the equality
        # check below must still detect the change.
        new_docs = list(new_docs)
        if self.docs == new_docs:
            return
        # Build everything first so a failure leaves the old index intact.
        new_embs = [Process_NLPT_Tokenize(e) for e in new_docs]
        # NOTE(review): rank_bm25 divides by the corpus size while building the
        # index, so an empty corpus is kept as "no model" instead.
        new_model = BM25_Retriever(new_embs) if new_embs else None
        self.docs = new_docs
        self.embs = new_embs
        self.model = new_model
        dict2json({"docs": self.docs}, self.savepath_docs)  # Save docs as file
        dict2json({"embs": self.embs}, self.savepath_embs)  # Save embs as file

    def search(self, new_queries, top=5):
        """Return the best-matching documents for each query.

        Args:
            new_queries: A query string or a list of query strings.
            top: Maximum number of hits per query.

        Returns:
            One list per query of {"index", "doc", "score"} dicts, best first;
            "score" is the BM25 score rounded to 3 decimals. With an empty
            index every query gets an empty list.
        """
        # A bare string is one query, not a sequence of one-character queries.
        if isinstance(new_queries, str):
            new_queries = [new_queries]
        if self.model is None or len(self.docs) == 0:
            return [[] for _ in new_queries]
        # -----
        results = []
        for query in new_queries:
            # Score once and rank from the same array; this is the ordering
            # rank_bm25's get_top_n() uses (argsort, reversed, truncated).
            scores = self.model.get_scores(Process_NLPT_Tokenize(query))
            order = np.argsort(scores)[::-1][:top].tolist()
            results.append([{"index": idx, "doc": self.docs[idx], "score": round(float(scores[idx]), 3)} for idx in order])
        # -----
        return results

class HYSE_EngineExactMatch:
    """Substring ("exact match") search engine with an on-disk cache under "_hyse/".

    A query matches a document when the query text is contained in the document
    text, first case-insensitively and, if that yields nothing, after both
    sides went through `Process_NLPT_Normalize`.

    Example:
        engine_exactmatch = HYSE_EngineExactMatch()
        engine_exactmatch.update(_TEST_PASSAGES)
        engine_exactmatch.search(_TEST_QUERIES)
    """

    def __init__(self, name="hyse001_exa1"):
        """Restore the cached documents when the cache file exists.

        Args:
            name: Prefix of the cache file ("_hyse/{name}_docs.json").
        """
        self.name = name
        self.savepath_docs = f"_hyse/{name}_docs.json"
        # ----------
        self.docs = []
        if os.path.exists(self.savepath_docs):
            # json2dict() returns {} on read errors, so never index ["docs"] blindly.
            payload = json2dict(self.savepath_docs)
            if isinstance(payload, dict):
                self.docs = payload.get("docs", [])

    def update(self, new_docs):
        """Replace the documents and refresh the cache file; no-op when unchanged.

        Args:
            new_docs: Documents to index (a list of strings; a single string is
                treated as a one-document corpus).
        """
        if isinstance(new_docs, str):
            new_docs = [new_docs]
        # Private copy: if the caller later mutates its own list, the equality
        # check below must still detect the change.
        new_docs = list(new_docs)
        if self.docs == new_docs:
            return
        self.docs = new_docs
        dict2json({"docs": self.docs}, self.savepath_docs)  # Save docs as file

    def search(self, new_queries):
        """Return every document that contains each query.

        Args:
            new_queries: A query string or a list of query strings.

        Returns:
            One list per query of {"index", "doc", "score"} dicts in document
            order. "score" is len(trimmed query) / len(trimmed document),
            rounded to 3 decimals. Blank queries yield an empty list.
        """
        # A bare string is one query, not a sequence of one-character queries.
        if isinstance(new_queries, str):
            new_queries = [new_queries]
        # Document-side preprocessing is hoisted out of the per-query loop.
        docs_trimmed = [d.strip() for d in self.docs]
        docs_lowered = [d.lower() for d in docs_trimmed]
        docs_normalized = None  # built lazily, only if some query needs the fallback
        results = []
        for q in new_queries:
            q_trimmed = q.strip()
            needle = q_trimmed.lower()
            if not needle:
                # "" is a substring of everything; a blank query must not match all docs.
                results.append([])
                continue
            # Exact match with diacritics
            idxs = [i for i, d in enumerate(docs_lowered) if needle in d]
            if len(idxs) == 0:
                # Exact match without diacritics
                if docs_normalized is None:
                    docs_normalized = [Process_NLPT_Normalize(d) for d in self.docs]
                needle_normalized = Process_NLPT_Normalize(q)
                if needle_normalized:
                    idxs = [i for i, d in enumerate(docs_normalized) if needle_normalized in d]
            hits = []
            for i in idxs:
                doc_len = len(docs_trimmed[i])
                # Trimmed lengths keep the score consistent with the trimmed matching;
                # the guard avoids a ZeroDivisionError on an empty document.
                score = round(len(q_trimmed) / doc_len, 3) if doc_len else 0.0
                hits.append({"index": i, "doc": self.docs[i], "score": score})
            results.append(hits)
        # -----
        return results

In [5]:
class HYSE_EngineHybrid:
    """Bundle of four search engines that always index the same documents.

    Engines: two semantic engines (different embedding models), one lexical
    (BM25) engine and one exact-match engine. `update()` fans out to all of
    them; `search()` queries all of them.
    """

    def __init__(self, name="HYSE1"):
        """Create the four sub-engines and bring them to a common, searchable state.

        Args:
            name: Prefix used to derive each sub-engine's cache name.
        """
        self.search_engine_1 = HYSE_EngineSemantic(name=f"{name}_SEM1", modelpath="onelevelstudio/ML-E5-0.3B")
        self.search_engine_2 = HYSE_EngineSemantic(name=f"{name}_SEM2", modelpath="onelevelstudio/MPNET-0.3B")
        self.search_engine_3 = HYSE_EngineLexical(name=f"{name}_LEX1")
        self.search_engine_4 = HYSE_EngineExactMatch(name=f"{name}_EXA1")
        # Each sub-engine restores its own cache on construction. If they all
        # restored the same non-empty corpus, keep it: indexing the placeholder
        # here would overwrite every cache file and force a full re-encode on
        # the next update().
        cached_docs = self.search_engine_1.docs
        if len(cached_docs) > 0 and all(engine.docs == cached_docs for engine in self._engines()):
            self.docs = list(cached_docs)
        else:
            # Placeholder corpus - presumably so the engines are searchable
            # before the first real update(); it also re-syncs diverged caches.
            self.docs = ["‚ú®"]
            self.update(self.docs)

    def _engines(self):
        """Return the four sub-engines in their fixed order (1..4)."""
        return (self.search_engine_1, self.search_engine_2, self.search_engine_3, self.search_engine_4)

    def update(self, new_docs):
        """Index `new_docs` in every sub-engine.

        Args:
            new_docs: List of document strings.
        """
        # Private copy: the sub-engines compare against what they stored, so
        # they must not share a list the caller can mutate afterwards.
        self.docs = list(new_docs)
        for engine in self._engines():
            engine.update(self.docs)

    def search(self, new_queries):
        """Run `new_queries` through every sub-engine and print each engine's results.

        Args:
            new_queries: List of query strings.

        Returns:
            None - results are only printed, one line per engine.
        """
        # TODO(review): results are printed, not returned or fused; callers cannot use them yet.
        # All searches run before anything is printed, so a failing engine prints nothing.
        per_engine_results = [engine.search(new_queries) for engine in self._engines()]
        for engine_results in per_engine_results:
            print(engine_results)

In [6]:
# ----- Example -----
# Build the hybrid engine, index the sample passages, then run the sample queries.
# search() prints one result list per underlying engine (semantic x2, lexical, exact match).
hyse_engine = HYSE_EngineHybrid()
hyse_engine.update(_TEST_PASSAGES)
hyse_engine.search(_TEST_QUERIES)
# -------------------

[[{'index': 7, 'doc': 'Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh ti·ªÉu h·ªçc', 'score': 0.894}, {'index': 5, 'doc': 'Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh trung h·ªçc ph·ªï th√¥ng', 'score': 0.884}, {'index': 6, 'doc': 'Th·ªß t·ª•c chuy·ªÉn tr∆∞·ªùng cho h·ªçc sinh trung h·ªçc c∆° s·ªü', 'score': 0.88}, {'index': 2, 'doc': 'Th·ªß t·ª•c chuy·ªÉn nh∆∞·ª£ng quy·ªÅn s·ª≠ d·ª•ng ƒë·∫•t', 'score': 0.867}, {'index': 10, 'doc': 'Th·ªß t·ª•c l√†m gi·∫•y khai sinh', 'score': 0.821}], [{'index': 10, 'doc': 'Th·ªß t·ª•c l√†m gi·∫•y khai sinh', 'score': 0.777}, {'index': 13, 'doc': 'Th·ªß t·ª•c t·ªë c√°o t·∫°i c·∫•p x√£', 'score': 0.773}, {'index': 1, 'doc': 'Th·ªß t·ª•c ƒëƒÉng k√Ω k·∫øt h√¥n', 'score': 0.773}, {'index': 14, 'doc': 'Th·ªß t·ª•c t·ªë c√°o t·∫°i c·∫•p t·ªânh', 'score': 0.768}, {'index': 0, 'doc': 'Th·ªß t·ª•c th√†nh l·∫≠p c√¥ng ty t∆∞ nh√¢n', 'score': 0.768}], [{'index': 10, 'doc': 'Th·ªß t·ª•c l√†m gi·∫•y khai sinh', 'score': 0.841}, {'index': 1, 'doc': 'Th·ªß t·ª•c ƒëƒ