# In [None]:
from datasets import load_dataset
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Build a Hugging Face token-classification ("ner") pipeline backed by an
# XLM-R checkpoint. The checkpoint name indicates XLM-RoBERTa large fine-tuned
# on the English CoNLL-03 data; the same pipeline object is reused below for
# both the English and the Chinese sentences.
# No aggregation_strategy is passed, so the pipeline's default applies.
ner_pipeline = pipeline(
    task="ner",
    model="xlm-roberta-large-finetuned-conll03-english",
)

# Load the IWSLT 2017 English-Chinese translation data with the datasets
# library, keeping only the first 2000 training pairs. The limit is expressed
# as a slice in the split string ('train[:2000]'); with a plain 'train' split
# every pair of the training set would be returned and fed to the NER loops.
# NOTE: trust_remote_code=True allows the dataset's own loading script to run.
dataset = load_dataset(
    'iwslt2017',
    'iwslt2017-en-zh',
    split='train[:2000]',
    trust_remote_code=True,
)

# Split each translation pair into parallel English / Chinese sentence lists.
# Both lists are filled in the same pass, so index i in one list corresponds
# to index i in the other.
english_sentences = []
chinese_sentences = []
for record in dataset:
    pair = record['translation']
    english_sentences.append(pair['en'])
    chinese_sentences.append(pair['zh'])

# Run named-entity recognition on every English sentence, one pipeline call
# per sentence, with a tqdm progress bar. Each list element is whatever the
# pipeline returned for the sentence at the same index.
english_ner_results = [
    ner_pipeline(text)
    for text in tqdm(english_sentences, desc="Processing English NER")
]

# Run named-entity recognition on every Chinese sentence using the same
# pipeline object as for English, one call per sentence, with a tqdm progress
# bar. Each list element is the pipeline output for the sentence at that index.
chinese_ner_results = [
    ner_pipeline(text)
    for text in tqdm(chinese_sentences, desc="Processing Chinese NER")
]

# Collect the sentences and their NER outputs into one DataFrame for easy
# inspection: one row per sentence pair, one column per list.
ner_columns = {
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results,
}
ner_df = pd.DataFrame(ner_columns)

# Save the NER tagging results to a CSV file. The row index is omitted and the
# file is written with the 'utf-8-sig' encoding (UTF-8 with a leading BOM).
output_csv_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_tagged.csv"
ner_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
