### 利用語意模型來分析評論主題(類似詞頻)

### 多語系停用詞套件 `stopwords-iso`
`https://github.com/stopwords-iso/stopwords-iso`

In [None]:
pip install stopwordsiso

In [None]:
import torch
import pandas as pd
import json
from FlagEmbedding import BGEM3FlagModel
from transformers import AutoTokenizer
from collections import defaultdict
import stopwordsiso 
import gc

# Load the XLM-RoBERTa tokenizer and the BGE-M3 model used for sparse weights.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
    use_cuda=True,
    device=0,
)

# Build one stop-word set that is the union of the lists for every language
# code below; tokens found in this set are skipped when weights are summed.
langs = ["zh", "en", "ja", "ko", "th", "vi", "id", "fr", "de", "it", "es"]
stopwords = set().union(*(stopwordsiso.stopwords(code) for code in langs))

# Merge SentencePiece sub-word pieces back into whole words.
def merge_subwords(
    tokens,
    weights,
    special_tokens=frozenset({"<s>", "</s>", "<pad>", "<unk>", "<mask>"}),
):
    """Join sub-word pieces into words and sum their weights.

    A piece that starts with "▁" opens a new word; every other piece is
    appended to the word currently being built and its weight is added to
    that word's total.

    Special tokens are dropped and close the current word. They used to be
    emitted as the pseudo-words "s>" and "/s>", which are two characters long
    and therefore survive a minimum-length-2 filter.

    Args:
        tokens: Sequence of token strings.
        weights: Sequence of numeric weights, paired with ``tokens`` by
            position (pairs are formed with ``zip``, so extra items in the
            longer sequence are ignored).
        special_tokens: Token strings to discard. Defaults to the usual
            XLM-RoBERTa special tokens.

    Returns:
        A ``(merged_tokens, merged_weights)`` tuple of two equally long lists.
    """
    merged_tokens, merged_weights = [], []
    current_token, current_weight = "", 0.0

    for token, weight in zip(tokens, weights):
        if token in special_tokens:
            # A special token ends the word in progress and contributes nothing.
            if current_token:
                merged_tokens.append(current_token)
                merged_weights.append(current_weight)
            current_token, current_weight = "", 0.0
            continue

        if token.startswith("▁"):
            # Word boundary: emit the finished word, then start a new one.
            if current_token:
                merged_tokens.append(current_token)
                merged_weights.append(current_weight)
            current_token = token.lstrip("▁")
            current_weight = weight
        else:
            # Continuation piece of the current word.
            current_token += token
            current_weight += weight

    if current_token:
        merged_tokens.append(current_token)
        merged_weights.append(current_weight)
    return merged_tokens, merged_weights

# Read the location metadata and index each entry's "gmap_location" by its
# "location_id".
with open("../RAW_DATA/E_838_location_info.json", "r", encoding="utf-8") as f:
    location_info = json.load(f)

location_mapping = {}
for entry in location_info:
    location_mapping[entry["location_id"]] = entry["gmap_location"]

# Final result: outer key -> token -> accumulated weight (missing tokens
# start at 0.0).
location_topics_weight = defaultdict(lambda: defaultdict(float))

# Read the comments CSV in chunks so only one chunk is held in memory at a time.
chunksize = 10000  # rows per chunk; adjust to the available memory
reader = pd.read_csv("cleaned_comments_new.csv", chunksize=chunksize)

# Holds the number of chunks processed so far (stays 0 if the file is empty).
chunk_index = 0

for chunk_index, df_chunk in enumerate(reader, start=1):
    df_chunk = df_chunk.dropna(subset=["comments"])
    print(f"正在處理第 {chunk_index} 個chunk，共 {len(df_chunk)} 筆評論")

    # Walk the three needed columns in lockstep instead of building a Series
    # per row with iterrows().
    for comment, location_id, language in zip(
        df_chunk["comments"], df_chunk["location_id"], df_chunk["language"]
    ):
        # Sparse (lexical) weights from the model, keyed by token id as a string.
        with torch.no_grad():
            output = model.encode([comment], return_sparse=True)
        sparse_weights = output["lexical_weights"][0]

        # Tokenize the same comment to recover the token order and strings.
        tokens = tokenizer(comment, return_tensors="pt", truncation=True)
        input_ids = tokens["input_ids"][0].tolist()
        tokens_str = tokenizer.convert_ids_to_tokens(input_ids)

        # Weight of every token occurrence; ids absent from the sparse output get 0.0.
        lexical_weights = [sparse_weights.get(str(token_id), 0.0) for token_id in input_ids]

        # Merge sub-word pieces into words.
        merged_tokens, merged_weights = merge_subwords(tokens_str, lexical_weights)

        # Accumulate per (location, language). location_topics_weight is a
        # defaultdict, so indexing creates the inner dict when the key is new.
        key = (location_id, language)
        token_totals = location_topics_weight[key]

        for token, weight in zip(merged_tokens, merged_weights):
            token = token.lower().strip()
            if len(token) < 2 or token in stopwords:
                continue
            token_totals[token] += weight

    # Release the chunk before the next one is read.
    del df_chunk
    gc.collect()

# After all chunks are done, write the 30 highest-weighted tokens of every
# (location, language) group to an Excel file.
rows = []
for (loc_id, lang), token_weights in location_topics_weight.items():
    ranked = sorted(token_weights.items(), key=lambda pair: pair[1], reverse=True)
    formatted = [f"{token}:{weight:.2f}" for token, weight in ranked[:30]]
    row = {
        "location_id": loc_id,
        # Fall back to the raw id when the location has no mapped name.
        "gmap_location": location_mapping.get(loc_id, loc_id),
        "language": lang,
        "top_30_tokens": ", ".join(formatted),
    }
    rows.append(row)

df_output = pd.DataFrame(rows)
df_output.to_excel("location_topics_top30_fixed_final.xlsx", index=False)
print("location_topics_top30_fixed_final.xlsx已完成。")


In [10]:
# Encode one sample comment and show the type and content of the sparse
# output returned under "lexical_weights".
comment = "空氣很好，風景漂亮。"
output = model.encode([comment], return_sparse=True)

for label, value in (
    ("lexical_weights型別:", type(output["lexical_weights"])),
    ("lexical_weights內容:", output["lexical_weights"]),
):
    print(label, value)


lexical_weights型別: <class 'list'>
lexical_weights內容: [defaultdict(<class 'int'>, {'6': 0.1702, '90552': 0.302, '29787': 0.2357, '4': 0.1472, '80791': 0.2461, '80179': 0.269, '30': 0.1812})]


In [None]:
import pandas as pd

# Input and output CSV paths.
input_file = 'cleaned_comments_new.csv'
output_file = 'test.csv'

# Location ids to keep; append more ids to this list to widen the selection.
wanted_location_ids = ['0x345d451e2a77aef7:0xf079d1cfca8d55d5']

# Load all comments and keep only rows whose location_id is in the list.
df = pd.read_csv(input_file)
keep_mask = df['location_id'].isin(wanted_location_ids)
filtered_df = df[keep_mask]

# Save the selected rows as a new CSV file.
filtered_df.to_csv(output_file, index=False)

print(f"篩選後的資料已儲存為 {output_file}")


### 除了中文以外，效果非常好！另外嘗試在子詞合併時，安裝並使用 ckip-transformers 的斷詞結果作為參考字典，來進行 subwords 的合併

In [None]:
import pandas as pd

input_file = 'cleaned_comments_new.csv'
output_file = 'cleaned_comments_run.csv'

# Load the full comments CSV.
df = pd.read_csv(input_file)

# Keep rows that match both the language code and the location id.
# NOTE(review): this selects 'th' rows, while the cell that consumes
# cleaned_comments_run.csv uses a Chinese word segmenter and Chinese stop
# words — confirm the intended language code.
is_target_language = df['language'] == 'th'
is_target_location = df['location_id'] == '0x3442ac3acd404a7d:0x5d6d7018397a09c1'
df_filtered = df[is_target_language & is_target_location]

# Write the selection out for the next step.
df_filtered.to_csv(output_file, index=False)

print(f"已成功過濾並輸出為 {output_file}")


In [None]:
pip install ckip-transformers


In [None]:
from ckip_transformers.nlp import CkipWordSegmenter
# Create the CKIP word segmenter (model "bert-base", device index 0).
ws_driver = CkipWordSegmenter(
    model="bert-base",
    device=0,
)

In [None]:
import torch
import pandas as pd
import json
from FlagEmbedding import BGEM3FlagModel
from transformers import AutoTokenizer
from collections import defaultdict
import stopwordsiso
import gc
from ckip_transformers.nlp import CkipWordSegmenter

# Load the tokenizer, the BGE-M3 model and the CKIP word segmenter.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
    use_cuda=True,
    device=0,
)
ws_driver = CkipWordSegmenter(
    model="bert-base",
    device=0,
)

# Chinese stop words only; tokens in this collection are skipped when weights are summed.
stopwords = stopwordsiso.stopwords("zh")

# CKIP-guided sub-word merging (used on batched tokenizer output).
def merge_subwords(tokens, weights, ckip_words):
    """Assign tokenizer-piece weights to CKIP words by character alignment.

    Pieces are consumed left to right. For each word, pieces are taken until
    the pieces' characters cover the word's non-whitespace characters; a
    piece's weight is credited to the word in which the piece begins.

    Compared with a plain per-word length count, this version keeps the
    alignment intact in three situations:

    * A piece that runs past the end of the current word: the extra
      characters are carried over and counted against the following words
      instead of being discarded (which would shift every later word).
    * Whitespace inside ``ckip_words``: it is ignored when counting
      characters, because "▁" markers are removed from the pieces and so the
      pieces carry no characters for it.
    * ``<s>``, ``</s>`` and ``<pad>``: they are consumed but cover no
      characters.

    NOTE(review): ``<unk>`` hides an unknown amount of source text; it is
    counted as one character here as a best effort — confirm this suits the
    data.

    Args:
        tokens: Token strings for one comment, in order.
        weights: Numeric weight for each entry of ``tokens`` (same length).
        ckip_words: The comment's words as segmented by CKIP, in order.

    Returns:
        ``(merged_tokens, merged_weights)``: ``merged_tokens`` holds every
        entry of ``ckip_words`` in order, and ``merged_weights`` the summed
        weight credited to each. Words that no piece begins in get 0.0.
    """
    zero_width = ("<s>", "</s>", "<pad>")
    merged_tokens, merged_weights = [], []
    idx = 0
    surplus = 0  # characters of already-consumed pieces not yet matched to a word
    total = len(tokens)

    for word in ckip_words:
        needed = len("".join(word.split()))  # non-whitespace characters only
        current_weight = 0.0

        # First use up characters left over from a piece that spanned words.
        covered = min(surplus, needed)
        surplus -= covered

        while covered < needed and idx < total:
            piece = tokens[idx]
            current_weight += weights[idx]
            idx += 1
            if piece in zero_width:
                continue
            if piece == "<unk>":
                covered += 1
            else:
                covered += len(piece.replace("▁", ""))

        if covered > needed:
            # The last piece extends into the next word(s); remember the overshoot.
            surplus += covered - needed

        merged_tokens.append(word)
        merged_weights.append(current_weight)

    return merged_tokens, merged_weights

# Read the location metadata and index each entry's "gmap_location" by its
# "location_id".
with open("../RAW_DATA/E_838_location_info.json", "r", encoding="utf-8") as f:
    location_info = json.load(f)

location_mapping = {}
for entry in location_info:
    location_mapping[entry["location_id"]] = entry["gmap_location"]

# Accumulator: outer key -> token -> summed weight (missing tokens start at 0.0).
location_topics_weight = defaultdict(lambda: defaultdict(float))

# Process the comments chunk by chunk; within a chunk the tokenizer, the
# BGE-M3 model and the CKIP segmenter are each called once on the whole batch.
chunksize = 2000
reader = pd.read_csv("cleaned_comments_run.csv", chunksize=chunksize)

# Holds the number of chunks processed so far (stays 0 if the file is empty).
chunk_index = 0

for chunk_index, df_chunk in enumerate(reader, start=1):
    df_chunk = df_chunk.dropna(subset=["comments"])
    print(f"正在處理第 {chunk_index} 個chunk，共 {len(df_chunk)} 筆評論")

    comments = df_chunk["comments"].tolist()
    location_ids = df_chunk["location_id"].tolist()
    languages = df_chunk["language"].tolist()

    # A chunk can be empty after dropna; skip the model calls in that case so
    # they are never handed an empty batch.
    if comments:
        # Tokenize first to obtain the token ids in order for every comment.
        tokenized_inputs = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")

        # Sparse (lexical) weights for the whole chunk.
        with torch.no_grad():
            outputs = model.encode(comments, return_sparse=True, batch_size=128)

        sparse_weights_list = outputs["lexical_weights"]
        input_ids_list = tokenized_inputs["input_ids"].tolist()

        # CKIP word segmentation for the whole chunk.
        ckip_words_batch = ws_driver(comments, batch_size=128)

        # Handle the comments one by one, walking all per-comment lists in lockstep.
        per_comment = zip(
            sparse_weights_list, input_ids_list, ckip_words_batch, location_ids, languages
        )
        for sparse_weights, input_ids, ckip_words, location_id, language in per_comment:
            tokens_str = tokenizer.convert_ids_to_tokens(input_ids)

            # Ids absent from the sparse output get a weight of 0.0.
            lexical_weights = [sparse_weights.get(str(token_id), 0.0) for token_id in input_ids]

            # Use the CKIP words to decide how the sub-word pieces are merged.
            merged_tokens, merged_weights = merge_subwords(tokens_str, lexical_weights, ckip_words)

            key = (location_id, language)
            for token, weight in zip(merged_tokens, merged_weights):
                token = token.lower().strip()
                if len(token) < 2 or token in stopwords:
                    continue
                location_topics_weight[key][token] += weight

        del tokenized_inputs, outputs, sparse_weights_list, input_ids_list, ckip_words_batch

    # Release the chunk's data before the next chunk is read.
    del df_chunk, comments, location_ids, languages
    gc.collect()
    print(f"完成第{chunk_index}個chunk處理與記憶體清理。")

# After all chunks are done, write the 30 highest-weighted tokens of every
# (location, language) group to an Excel file.
rows = []
for (loc_id, lang), token_weights in location_topics_weight.items():
    ranked = sorted(token_weights.items(), key=lambda pair: pair[1], reverse=True)
    formatted = [f"{token}:{weight:.2f}" for token, weight in ranked[:30]]
    row = {
        "location_id": loc_id,
        # Fall back to the raw id when the location has no mapped name.
        "gmap_location": location_mapping.get(loc_id, loc_id),
        "language": lang,
        "top_30_tokens": ", ".join(formatted),
    }
    rows.append(row)

df_output = pd.DataFrame(rows)
df_output.to_excel("location_topics_top30_chinese.xlsx", index=False)
print("處理完成，已儲存 location_topics_top30_chinese.xlsx")
