### 日文

In [None]:
pip install sudachipy sudachidict_full sudachidict_core


In [3]:
import pandas as pd

# Source CSV and the destination for the filtered subset.
input_file = 'cleaned_comments_new.csv'
output_file = 'cleaned_comments_th.csv'

# Read every row, then keep only those whose "language" column equals 'th'.
df = pd.read_csv(input_file)
language_mask = df['language'] == 'th'
df_filtered = df.loc[language_mask]

# Write the subset without the DataFrame index column and report the path.
df_filtered.to_csv(output_file, index=False)
print(f"已成功過濾並輸出為 {output_file}")

已成功過濾並輸出為 cleaned_comments_th.csv


In [None]:
import torch
import pandas as pd
import json
from FlagEmbedding import BGEM3FlagModel
from transformers import AutoTokenizer
from collections import defaultdict
import stopwordsiso
import gc


from sudachipy import Dictionary
# SudachiPy tokenizer created from the default Dictionary() configuration.
tokenizer_ja = Dictionary().create() 

# Load the subword tokenizer and the BGE-M3 model.
# NOTE(review): ids produced by this tokenizer are later used as keys into the
# BGE-M3 lexical weights, which presumably relies on both sharing one
# vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True, use_cuda=True, device=0)

# Stopwords for language code "ja".
stopwords = stopwordsiso.stopwords("ja")

# SudachiPy-guided subword merging (works on words that are already segmented)
def merge_subwords_ja(tokens, weights, sudachi_words):
    """Merge subword tokens into SudachiPy words and sum their weights.

    Walks ``tokens`` left to right. For each word in ``sudachi_words`` it
    consumes subword tokens until their combined character length reaches the
    length of the word, and assigns the summed weight of the consumed tokens
    to that word.

    Args:
        tokens: Subword token strings for one text, in order. The marker "▁"
            and the special tokens "<s>", "</s>" and "<pad>" are removed
            before a token's length is measured.
        weights: Per-token weights, index-aligned with ``tokens``.
        sudachi_words: Word surface strings for the same text, in order.

    Returns:
        A ``(merged_tokens, merged_weights)`` pair of equal-length lists.
        Whitespace-only words and words for which no non-empty subword token
        was left to consume are omitted.
    """
    special_markers = ("▁", "<s>", "</s>", "<pad>")
    merged_tokens, merged_weights = [], []
    idx = 0
    n_tokens = len(tokens)

    for word in sudachi_words:
        target_length = len(word.strip())
        if target_length == 0:
            # Whitespace surfaces have no subword counterpart once "▁" is
            # stripped; letting them consume a token would shift every
            # following word onto the wrong weights.
            continue

        current_weight = 0.0
        current_length = 0
        matched = False

        while current_length < target_length and idx < n_tokens:
            token = tokens[idx]
            for marker in special_markers:
                token = token.replace(marker, "")
            token = token.strip()

            # A token that is empty after stripping still contributes its
            # weight (as before) but adds no length and does not count as a
            # match for the word.
            current_weight += weights[idx]
            if token:
                current_length += len(token)
                matched = True
            idx += 1

        # Only emit words that were actually backed by subword text, so that
        # words past the end of the token sequence do not show up with 0.0.
        if matched:
            merged_tokens.append(word)
            merged_weights.append(current_weight)

    return merged_tokens, merged_weights

# Load the location metadata and build a location_id -> gmap_location lookup.
with open("../RAW_DATA/E_838_location_info.json", "r", encoding="utf-8") as f:
    location_info = json.load(f)

location_mapping = {}
for loc in location_info:
    location_mapping[loc["location_id"]] = loc["gmap_location"]

# Nested accumulator: outer key -> token -> summed weight (missing entries start at 0.0).
location_topics_weight = defaultdict(lambda: defaultdict(float))

# Process the comments CSV chunk by chunk; within a chunk the subword
# tokenizer, the BGE-M3 model and SudachiPy each run over all comments.
chunksize = 10000
reader = pd.read_csv("cleaned_comments_run.csv", chunksize=chunksize)

chunk_index = 0

for df_chunk in reader:
    # Rows without a comment cannot be tokenized or encoded.
    df_chunk = df_chunk.dropna(subset=["comments"])
    print(f"正在處理第 {chunk_index+1} 個chunk，共 {len(df_chunk)} 筆評論")

    if df_chunk.empty:
        # Nothing left in this chunk; do not hand empty batches to the
        # tokenizer or the model.
        chunk_index += 1
        continue

    comments = df_chunk["comments"].tolist()
    location_ids = df_chunk["location_id"].tolist()
    languages = df_chunk["language"].tolist()

    # Subword ids for every comment; used below to look up lexical weights.
    tokenized_inputs = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")

    # Sparse (lexical) weights from BGE-M3 for the whole chunk.
    with torch.no_grad():
        outputs = model.encode(comments, return_sparse=True, batch_size=128)

    sparse_weights_list = outputs["lexical_weights"]
    input_ids_list = tokenized_inputs["input_ids"].tolist()

    # SudachiPy word surfaces for every comment.
    sudachi_words_batch = [[m.surface() for m in tokenizer_ja.tokenize(text)] for text in comments]

    # Accumulate merged word weights per (location_id, language).
    for location_id, language, sparse_weights, input_ids, sudachi_words in zip(
        location_ids, languages, sparse_weights_list, input_ids_list, sudachi_words_batch
    ):
        tokens_str = tokenizer.convert_ids_to_tokens(input_ids)

        # Weights are looked up by stringified token id; ids without an entry get 0.0.
        lexical_weights = [sparse_weights.get(str(token_id), 0.0) for token_id in input_ids]

        # sudachi_words is already a list of surfaces, so no further tokenization here.
        merged_tokens, merged_weights = merge_subwords_ja(tokens_str, lexical_weights, sudachi_words)

        key = (location_id, language)
        for token, weight in zip(merged_tokens, merged_weights):
            token = token.lower().strip()
            if len(token) < 2 or token in stopwords:
                continue
            location_topics_weight[key][token] += weight

    # Release per-chunk data. `outputs` and `tokenized_inputs` must go too:
    # `outputs` still references the lexical weights and `tokenized_inputs`
    # holds the padded id tensor, so deleting only the derived names would
    # keep them alive until the next iteration rebinds them.
    del df_chunk, comments, location_ids, languages, sparse_weights_list, input_ids_list, sudachi_words_batch
    del tokenized_inputs, outputs
    gc.collect()
    chunk_index += 1
    print(f"完成第{chunk_index}個chunk處理與記憶體清理。")

# Once every chunk has been accumulated, write the 30 highest-weighted tokens
# of each (location_id, language) pair to an Excel sheet.
rows = []
for location_key, token_weights in location_topics_weight.items():
    loc_id, lang = location_key

    # Highest weight first; only the first 30 entries are reported.
    ranked = sorted(token_weights.items(), key=lambda pair: pair[1], reverse=True)
    formatted = [f"{token}:{weight:.2f}" for token, weight in ranked[:30]]

    rows.append(
        dict(
            location_id=loc_id,
            # Fall back to the raw id when the mapping has no entry for it.
            gmap_location=location_mapping.get(loc_id, loc_id),
            language=lang,
            top_30_tokens=", ".join(formatted),
        )
    )

df_output = pd.DataFrame(rows)
df_output.to_excel("location_topics_top30_ja.xlsx", index=False)
print("處理完成，已儲存 location_topics_top30_ja.xlsx")


### 泰文

In [None]:
pip install pythainlp

In [4]:
from pythainlp.tokenize import word_tokenize

In [None]:
import torch
import pandas as pd
import json
from FlagEmbedding import BGEM3FlagModel
from transformers import AutoTokenizer
from collections import defaultdict
import stopwordsiso
import gc

# Load the subword tokenizer and the BGE-M3 model.
# NOTE(review): ids produced by this tokenizer are later used as keys into the
# BGE-M3 lexical weights, which presumably relies on both sharing one
# vocabulary — confirm.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True, use_cuda=True, device=0)


# Stopwords for language code "th".
stopwords = stopwordsiso.stopwords("th")

# PyThaiNLP-guided subword merging
def merge_subwords_th(tokens, weights, text):
    """Merge subword tokens into PyThaiNLP words and sum their weights.

    Walks ``tokens`` left to right. For each word it consumes non-empty
    subword tokens until their combined character length reaches the length
    of the word, and assigns the summed weight of those tokens to the word.

    Args:
        tokens: Subword token strings for one text, in order. The marker "▁"
            and the special tokens "<s>", "</s>" and "<pad>" are removed
            before a token is measured; tokens that end up empty are skipped.
        weights: Per-token weights, index-aligned with ``tokens``.
        text: Either the raw text (``str``), which is segmented with
            ``word_tokenize(..., engine="newmm")``, or a sequence of words
            that has already been segmented, which is used as given.

    Returns:
        A ``(merged_tokens, merged_weights)`` pair of equal-length lists.
        Whitespace-only words are dropped. A word for which no subword token
        could be consumed is omitted and a warning is printed.
    """
    if isinstance(text, str):
        words = word_tokenize(text, engine="newmm")
    else:
        # Already segmented by the caller: use the words directly instead of
        # joining them with spaces and segmenting the altered text again.
        words = text
    # Drop whitespace-only entries and trim the rest.
    words = [w.strip() for w in words if w.strip()]

    special_markers = ("▁", "<s>", "</s>", "<pad>")
    merged_tokens, merged_weights = [], []
    idx = 0
    n_tokens = len(tokens)

    for word in words:
        current_weight = 0.0
        current_length = 0
        success = False  # becomes True once any token is counted for this word

        while current_length < len(word) and idx < n_tokens:
            token = tokens[idx]
            for marker in special_markers:
                token = token.replace(marker, "")
            token = token.strip()

            if not token:
                # Marker-only / special token: carries no text, skip it.
                idx += 1
                continue

            current_weight += weights[idx]
            current_length += len(token)
            idx += 1
            success = True

        if success:
            merged_tokens.append(word)
            merged_weights.append(current_weight)
        else:
            # Only reached when no token at all could be matched to the word.
            print(f"merge_subwords_th() 警告：未成功合併詞彙 - {word}")

    return merged_tokens, merged_weights

# Load the location metadata and build a location_id -> gmap_location lookup.
with open("../RAW_DATA/E_838_location_info.json", "r", encoding="utf-8") as f:
    location_info = json.load(f)

location_mapping = {}
for loc in location_info:
    location_mapping[loc["location_id"]] = loc["gmap_location"]

# Nested accumulator: outer key -> token -> summed weight (missing entries start at 0.0).
location_topics_weight = defaultdict(lambda: defaultdict(float))

# Process the comments CSV chunk by chunk; within a chunk the subword
# tokenizer, the BGE-M3 model and PyThaiNLP each run over all comments.
chunksize = 10000
reader = pd.read_csv("cleaned_comments_run.csv", chunksize=chunksize)

chunk_index = 0

for df_chunk in reader:
    # Rows without a comment cannot be tokenized or encoded.
    df_chunk = df_chunk.dropna(subset=["comments"])
    print(f"正在處理第 {chunk_index+1} 個chunk，共 {len(df_chunk)} 筆評論")

    if df_chunk.empty:
        # Nothing left in this chunk; do not hand empty batches to the
        # tokenizer or the model.
        chunk_index += 1
        continue

    comments = df_chunk["comments"].tolist()
    location_ids = df_chunk["location_id"].tolist()
    languages = df_chunk["language"].tolist()

    # Subword ids for every comment; used below to look up lexical weights.
    tokenized_inputs = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")

    # Sparse (lexical) weights from BGE-M3 for the whole chunk.
    with torch.no_grad():
        outputs = model.encode(comments, return_sparse=True, batch_size=128)

    sparse_weights_list = outputs["lexical_weights"]
    input_ids_list = tokenized_inputs["input_ids"].tolist()

    # PyThaiNLP (newmm) word segmentation for every comment.
    thai_words_batch = [word_tokenize(text, engine="newmm") for text in comments]

    # Accumulate merged word weights per (location_id, language).
    for location_id, language, sparse_weights, input_ids, thai_words in zip(
        location_ids, languages, sparse_weights_list, input_ids_list, thai_words_batch
    ):
        tokens_str = tokenizer.convert_ids_to_tokens(input_ids)

        # Weights are looked up by stringified token id; ids without an entry get 0.0.
        lexical_weights = [sparse_weights.get(str(token_id), 0.0) for token_id in input_ids]

        # Let the PyThaiNLP segmentation guide how subwords are merged.
        merged_tokens, merged_weights = merge_subwords_th(tokens_str, lexical_weights, thai_words)

        key = (location_id, language)
        for token, weight in zip(merged_tokens, merged_weights):
            token = token.lower().strip()
            if len(token) < 2 or token in stopwords:
                continue
            location_topics_weight[key][token] += weight

        if not location_topics_weight[key]:
            # Placeholder so that this key never ends up with an empty token table.
            location_topics_weight[key]["(無關鍵詞)"] = 0.0

    # Release per-chunk data. `outputs` and `tokenized_inputs` must go too:
    # `outputs` still references the lexical weights and `tokenized_inputs`
    # holds the padded id tensor, so deleting only the derived names would
    # keep them alive until the next iteration rebinds them.
    del df_chunk, comments, location_ids, languages, sparse_weights_list, input_ids_list, thai_words_batch
    del tokenized_inputs, outputs
    gc.collect()
    chunk_index += 1
    print(f"完成第{chunk_index}個chunk處理與記憶體清理。")

# Once every chunk has been accumulated, write the 30 highest-weighted tokens
# of each (location_id, language) pair to an Excel sheet.
# The path is held in one variable so the completion message always names the
# file that was actually written.
output_xlsx = "location_topics_top30_th_.xlsx"

rows = []
for (loc_id, lang), token_weights in location_topics_weight.items():
    # Highest weight first; only the first 30 entries are reported.
    top30 = sorted(token_weights.items(), key=lambda x: x[1], reverse=True)[:30]
    top_words_str = ", ".join(f"{token}:{weight:.2f}" for token, weight in top30)
    rows.append({
        "location_id": loc_id,
        # Fall back to the raw id when the mapping has no entry for it.
        "gmap_location": location_mapping.get(loc_id, loc_id),
        "language": lang,
        "top_30_tokens": top_words_str
    })

df_output = pd.DataFrame(rows)
df_output.to_excel(output_xlsx, index=False)
print(f"處理完成，已儲存 {output_xlsx}")
