In [7]:
import pandas as pd
import re

# 定義清整函式
def clean_text(text):
    if isinstance(text, str):
        text = text.replace(',', ' ')  
        text = text.replace('\n', ' ')  
        text = re.sub(r'\s+', ' ', text).strip()  
    return text

# 讀取 CSV
csv_file = "cleaned_comments_cleaned.csv"

# 讀取 CSV，僅保留必要欄位
df = pd.read_csv(csv_file, usecols=['gmap_location','location_id','review_id', 'score','date','comments','language','translated_comments'])

# 清整 comments 和 translated_comments 欄位
df['comments'] = df['comments'].apply(clean_text)
df['translated_comments'] = df['translated_comments'].apply(clean_text)

# 移除空評論
df = df.dropna(subset=['comments', 'language'])

# 目標語言清單
target_languages = ["zh-Hant", "en", "ja", "ko", "th", "vi", "id"]

# 過濾非目標語言的評論
df_filtered = df[df["language"].isin(target_languages)]

# 移除過短的評論（少於 10 個字）
df_filtered = df_filtered[df_filtered["comments"].str.len() >= 10]

# 顯示前 5 筆，確認資料格式
print(df_filtered.head())

# 儲存過濾後的 CSV
df_filtered.to_csv("cleaned_comments_target.csv", index=False)
print(f"有效評論數量: {len(df_filtered)}")


  gmap_location                            location_id  \
0       白楊步道水簾洞  0x346885034084ae89:0x59a8b1eac5731c29   
1       白楊步道水簾洞  0x346885034084ae89:0x59a8b1eac5731c29   
2       白楊步道水簾洞  0x346885034084ae89:0x59a8b1eac5731c29   
3       白楊步道水簾洞  0x346885034084ae89:0x59a8b1eac5731c29   
4       白楊步道水簾洞  0x346885034084ae89:0x59a8b1eac5731c29   

                              review_id  score        date  \
0   ChZDSUhNMG9nS0VJQ0FnTURBM3RueUd3EAE      3  2025-02-09   
1   ChZDSUhNMG9nS0VJQ0FnSURQbDdIOWZREAE      1  2024-12-06   
2   ChZDSUhNMG9nS0VJQ0FnSURINzR1MUN3EAE      5  2024-09-21   
3  ChdDSUhNMG9nS0VJQ0FnSURIdV9pYnFBRRAB      5  2024-09-20   
4   ChZDSUhNMG9nS0VJQ0FnSURIanJHb2ZnEAE      5  2024-09-15   

                                            comments language  \
0                                  在0403的地震就崩壞了，過不去了  zh-Hant   
1  給一日遊，請先做功課喔， 113/12/5天氣晴，查網路：蘇花，中橫順暢， 想著沿途，中橫施...  zh-Hant   
2  太魯閣水簾洞：大自然鬼斧神工的傑作 太魯閣，這片鬼斧神工的峽谷，處處是令人驚嘆的自然景觀。其...  zh-Hant   
3                 

In [30]:
import pandas as pd

# 設定檔案路徑
input_file = 'cleaned_comments_target.csv'
output_file = 'test13.csv'

# 讀取 CSV 檔案
df = pd.read_csv(input_file)

# 指定篩選條件 (location_id 為 XXX 或空白)
filtered_df = df[df['location_id'].isin(
    ['0x346e048bc0d739c7:0xd3e6e842bfa0ecfd',
     #'XXX',
    ]
    )]

# 將篩選後的資料存儲為新的 CSV 檔案
filtered_df.to_csv(output_file, index=False)

print(f"篩選後的資料已儲存為 {output_file}")

篩選後的資料已儲存為 test13.csv


In [None]:
import torch
import pandas as pd
import gc
from collections import defaultdict
from ckip_transformers.nlp import CkipWordSegmenter
from transformers import pipeline

# 確保 GPU 可用
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 初始化 CKIP 斷詞器 (使用 GPU)
ws_driver = CkipWordSegmenter(model="bert-base", device=0)

# 初始化 Sentiment 分析模型 (多語言)
sentiment_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-xlm-roberta-base-sentiment", device=0 if device == "cuda" else -1)

# 讀取 CSV
df = pd.read_csv("test13.csv")

# 讀取地點對應資訊
location_info = pd.read_json("../RAW_DATA/E_838_location_info.json")
location_mapping = {loc["location_id"]: loc["gmap_location"] for _, loc in location_info.iterrows()}

# 只保留有評論的資料 (避免 `NaN`)
df = df.dropna(subset=["comments"])

# **🔹 確保所有評論都是 `str`**
df["processed_comments"] = df.apply(lambda row: str(row["translated_comments"]) if row["language"] != "zh-Hant" else str(row["comments"]), axis=1)

# **🔹 剔除 `NaN` 或無效值**
df["processed_comments"] = df["processed_comments"].fillna("").astype(str)

# **🔹 過濾空白評論**
df = df[df["processed_comments"].str.strip() != ""]

# **🔹 批量 Tokenization**
processed_comments = df["processed_comments"].tolist()
tokenized_results = ws_driver(processed_comments)  

# 檢查長度是否一致
if len(tokenized_results) != len(df):
    print(f"⚠️ Warning: Tokenized results ({len(tokenized_results)}) ≠ DataFrame rows ({len(df)})")
    df = df.iloc[:len(tokenized_results)]  # 確保 DataFrame 長度對齊

# **🔹 Sentiment Analysis 批量處理**
flat_tokens = [token for tokens in tokenized_results for token in tokens]
sentiment_results = sentiment_pipeline(flat_tokens, batch_size=32, truncation=True)

# **🔹 組織 Sentiment 結果**
negative_words = set(token for token, result in zip(flat_tokens, sentiment_results) if result["label"] == "negative")

# **🔹 建立負面詞頻統計**
negative_word_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))  
review_map = defaultdict(lambda: defaultdict(list))

for idx, (row, tokens) in enumerate(zip(df.itertuples(index=False), tokenized_results)):
    location_id = row.location_id
    gmap_location = location_mapping.get(location_id, "未知地點")
    location_key = f"{location_id} | {gmap_location}"
    lang = row.language
    review_id = row.review_id
    processed_comment = row.processed_comments  # 修正評論欄位

    # 統計負面詞
    for token in tokens:
        if token in negative_words:
            negative_word_counts[location_key][lang][token] += 1
            review_map[location_key][lang].append((token, review_id, processed_comment))

# **🔹 產生 DataFrame**
rows = []
for location, lang_data in negative_word_counts.items():
    for lang, word_counts in lang_data.items():
        sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:20]  # 取前 20 個負面詞
        total_reviews = df[df["language"] == lang].shape[0]  # 總評論數
        for word, count in sorted_words:
            review_info = review_map[location][lang]
            review_id, comment = next((rev_id, text) for token, rev_id, text in review_info if token == word)

            rows.append({
                "location_id": location.split(" | ")[0],
                "gmap_location": location.split(" | ")[1],
                "language": lang,
                "word": word,
                "count": count,
                "percentage": f"{(count / total_reviews * 100):.2f}%" if total_reviews > 0 else "0%",
                "review_id": review_id,
                "processed_comments": comment
            })

df_output = pd.DataFrame(rows)

# **🔹 輸出 CSV**
df_output.to_csv("negative_words_with_reviews.csv", index=False, encoding="utf-8-sig")
print("負面詞分析已輸出：negative_words_with_reviews.csv")


Using device: cuda


Device set to use cuda:0
Tokenization: 100%|██████████| 10705/10705 [00:00<00:00, 17102.65it/s]
Inference: 100%|██████████| 42/42 [08:58<00:00, 12.81s/it]
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


📄 負面詞分析已輸出：negative_words_with_reviews.csv
