In [None]:
import ast

import pandas as pd
from datasets import load_dataset
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Full paths to the Stanford NER jar and to the English / Chinese CRF models.
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.conll.4class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

# Initialise one NER tagger per language.
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# Load the training split of the IWSLT 2017 English-Chinese translation dataset
# with the datasets package.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to be
# executed on this machine; keep it only for sources you trust.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# Extract the English and Chinese sentences.
# The dataset is walked once and the 'translation' records are reused for both
# languages, instead of iterating over the whole dataset twice.
translation_pairs = [example['translation'] for example in dataset]
english_sentences = [pair['en'] for pair in translation_pairs]
chinese_sentences = [pair['zh'] for pair in translation_pairs]

# Run named-entity recognition on the English and Chinese sentences.
#
# StanfordNERTagger.tag() launches a Java process (and reloads the CRF model)
# on every call, so tagging one sentence per call is prohibitively slow for a
# corpus of this size. tag_sents() tags a whole list of sentences with a single
# Java invocation, so the sentences are sent to the tagger in batches.
NER_BATCH_SIZE = 1000


def _chinese_characters(sentence):
    """Split a Chinese sentence into single-character tokens, skipping whitespace.

    NLTK writes the tokens to the tagger's input separated by spaces, so a
    whitespace-only token would disappear from the tagger output and shift the
    tags of every following sentence in the same batch.
    """
    return [char for char in sentence if not char.isspace()]


def _tag_in_batches(tagger, sentences, tokenize, desc, batch_size=NER_BATCH_SIZE):
    """Tokenize and tag ``sentences``, calling ``tagger.tag_sents`` once per batch.

    Args:
        tagger: object providing ``tag_sents(token_lists)`` and ``tag(tokens)``.
        sentences: list of raw sentence strings.
        tokenize: callable turning one sentence into a list of tokens.
        desc: label displayed on the tqdm progress bar.
        batch_size: number of sentences passed to each ``tag_sents`` call.

    Returns:
        A list holding, for every input sentence and in the same order, the
        list of ``(token, tag)`` pairs produced by the tagger.
    """
    tagged_sentences = []
    with tqdm(total=len(sentences), desc=desc) as progress:
        for start in range(0, len(sentences), batch_size):
            batch = [tokenize(sentence) for sentence in sentences[start:start + batch_size]]
            tagged_batch = tagger.tag_sents(batch)
            # Safety net: if the tagger returned a different number of tokens
            # than it was given, the batch output cannot be trusted to line up
            # with its sentences, so re-tag that batch one sentence at a time.
            if any(len(tagged) != len(tokens) for tagged, tokens in zip(tagged_batch, batch)):
                tagged_batch = [tagger.tag(tokens) for tokens in batch]
            tagged_sentences.extend(tagged_batch)
            progress.update(len(batch))
    return tagged_sentences


# English: tokenize into words with NLTK before tagging.
english_ner_results = _tag_in_batches(
    english_ner_tagger, english_sentences, word_tokenize, "Processing English NER"
)

# Chinese: no word segmentation is applied; every character is tagged on its own.
# NOTE(review): the Stanford Chinese NER model presumably expects word-segmented
# input, so character-level tagging may reduce accuracy - confirm and consider
# running a segmenter first.
chinese_ner_results = _tag_in_batches(
    chinese_ner_tagger, chinese_sentences, _chinese_characters, "Processing Chinese NER"
)

# Gather the sentences and their NER output into a DataFrame for easy inspection.
ner_columns = {
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results,
}
ner_df = pd.DataFrame(ner_columns)

# Save the NER tagging results to a CSV file (no index column, UTF-8 with BOM).
ner_csv_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_iwslt2017_ner_tagged.csv"
ner_df.to_csv(ner_csv_path, index=False, encoding='utf-8-sig')

# Extract the NER tags of every sentence.
#
# The columns of the in-memory ner_df already contain lists of (token, tag)
# pairs, and eval() raises TypeError when given a list. A string is only
# expected when the column has been re-loaded from the CSV file; such strings
# are parsed with ast.literal_eval, which accepts Python literals only and,
# unlike eval(), cannot execute arbitrary code.
def _parse_tagged(value):
    """Return ``value`` as a list of (token, tag) pairs, parsing it when it is a string."""
    if isinstance(value, str):
        return ast.literal_eval(value)
    return value


english_tags = ner_df['English NER Tagged'].apply(_parse_tagged)
chinese_tags = ner_df['Chinese NER Tagged'].apply(_parse_tagged)

# Define the function that rewrites entities in the requested format.
def replace_with_tags(tagged_words):
    """Render (word, tag) pairs as a single string with entities marked up.

    A word tagged 'O' is emitted unchanged; a word with any other tag is
    emitted as ``<\\TAG, word>``. Every item is followed by one space, so a
    non-empty result ends with a trailing space and an empty input gives "".
    """
    rendered = [
        word if tag == 'O' else f"<\\{tag}, {word}>"
        for word, tag in tagged_words
    ]
    return "".join(item + " " for item in rendered)

# Rewrite the tags of every English and Chinese sentence with replace_with_tags.
tagged_english_sentences = list(map(replace_with_tags, english_tags))
tagged_chinese_sentences = list(map(replace_with_tags, chinese_tags))

# Put the original sentences next to their tagged versions for easy viewing.
tagged_columns = {
    "Original English Sentence": ner_df['English Sentence'],
    "Tagged English Sentence": tagged_english_sentences,
    "Original Chinese Sentence": ner_df['Chinese Sentence'],
    "Tagged Chinese Sentence": tagged_chinese_sentences,
}
tagged_df = pd.DataFrame(tagged_columns)

# Show the first rows of the result.
preview = tagged_df.head()
print(preview)

# Save the sentences with replaced NER tags to a CSV file.
tagged_csv_path = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Temp_iwslt2017_tagged_sentences.csv"
tagged_df.to_csv(tagged_csv_path, index=False, encoding='utf-8-sig')


  from .autonotebook import tqdm as notebook_tqdm
Processing English NER:   0%|                                                   | 51/231266 [00:41<52:02:36,  1.23it/s]