In [None]:
import pandas as pd
import pickle
import re
from ckip_transformers.nlp import CkipWordSegmenter
from rank_bm25 import BM25Okapi
from joblib import dump, load

# Word segmentation and stop-word filtering.
# Shared default segmenter, built on first use so the model is loaded only once.
_DEFAULT_WS_DRIVER = None


def _get_default_ws_driver():
    """Return the shared CKIP segmenter, constructing it on the first call."""
    global _DEFAULT_WS_DRIVER
    if _DEFAULT_WS_DRIVER is None:
        _DEFAULT_WS_DRIVER = CkipWordSegmenter(model="bert-base", device=0)
    return _DEFAULT_WS_DRIVER


def split_words(text: str, stop_words, ws_driver=None):
    """Segment *text* into words and remove punctuation and stop words.

    Args:
        text: Raw text to tokenize.
        stop_words: Container of words to discard (tested with ``in``).
        ws_driver: Optional callable that takes a list of strings and returns
            one token list per input string. When omitted, a shared
            ``CkipWordSegmenter(model="bert-base", device=0)`` is used.

    Returns:
        List of cleaned, non-empty tokens that are not in *stop_words*.
        Blank or empty input yields an empty list without running the
        segmenter.
    """
    if not text or not text.strip():
        return []
    if ws_driver is None:
        ws_driver = _get_default_ws_driver()

    # Commas, periods, newlines and spaces all become the full stop "。".
    text = re.sub(r"[,\.\r\n ]", "。", text)
    tokens = ws_driver([text])[0]

    words = []
    for token in tokens:
        # Strip everything that is neither a word character nor whitespace.
        cleaned = re.sub(r'[^\w\s]', '', token).strip()
        # Pure-punctuation tokens clean down to "" and must not be returned,
        # otherwise callers see a non-empty token list made of empty strings.
        if cleaned and cleaned not in stop_words:
            words.append(cleaned)
    return words


# Load a saved BM25 model.
def load_bm25_model(filename):
    """Deserialize a BM25 model from *filename* using joblib and return it.

    Prints a confirmation message naming the file once loading succeeds.

    NOTE(review): joblib deserialization is pickle-based, so only load model
    files from a trusted source.
    """
    model = load(filename)
    print(f"BM25 模型已從 {filename} 載入")
    return model

# BM25 search.
def BM25_search(bm25, query, stop_words, top_k, df):
    """Return the original text of the best-scoring articles for *query*.

    Args:
        bm25: Fitted BM25 index exposing ``get_scores`` and ``get_top_n``.
            Its corpus is assumed to be in the same row order as *df*
            (``get_top_n`` is given one entry per row) — verify against the
            code that built the index.
        query: Raw query string; tokenized with ``split_words``.
        stop_words: Words removed from the query before scoring.
        top_k: Maximum number of articles to return.
        df: DataFrame with an ``ArticleContent`` column.

    Returns:
        ``(original_top_n, doc_scores)``: the ``ArticleContent`` values of the
        top-ranked rows, and the BM25 score of every document as a list.
        Both are empty lists when no query tokens remain after filtering.
    """
    query_words = split_words(query, stop_words)
    print("切割後進行bm25:", query_words)

    if not query_words:
        return [], []
    doc_scores = bm25.get_scores(query_words).tolist()
    # get_top_n picks entries of the supplied list by rank position, so handing
    # it the original text returns the right row directly. Mapping segmented
    # text back to rows by equality is ambiguous when two rows share the same
    # segmented text and costs a full column scan per hit.
    original_top_n = bm25.get_top_n(query_words, df['ArticleContent'].tolist(), n=top_k)
    print(original_top_n)
    return original_top_n, doc_scores

if __name__ == "__main__":
    # Locations of the CSV data and the stop-word list.
    csv_path = r"C:\PYTHON\自主學習\csv_json\刑法_去除停用詞.csv"  # replace with the actual CSV path
    stop_words_path = "C:/PYTHON/自主學習/csv_json/stop_words.txt"

    # One stop word per line, collected into a set for membership tests.
    with open(stop_words_path, mode="r", encoding="utf-8") as stop_file:
        stop_words = set(stop_file.read().splitlines())

    df = pd.read_csv(csv_path)

    # Restore the previously saved BM25 model.
    bm25 = load_bm25_model("bm25_model.joblib")

    # Run a sample query, asking for the 3 most relevant results.
    query = "請問刑法第一百條是什麼內容"
    top_k = 3
    original_top_n, doc_scores = BM25_search(
        bm25=bm25,
        query=query,
        stop_words=stop_words,
        top_k=top_k,
        df=df,
    )

    # Display the results.
    print("搜尋結果:", original_top_n)
    print("文檔分數:", doc_scores)
