In [5]:
from datasets import load_dataset
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 設置 Stanford NER jar 和模型的完整路徑
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# 使用 datasets 套件載入 IWSLT 2017 英中翻譯資料集，設置 trust_remote_code=True
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# 提取中英文句子
english_sentences = [example['translation']['en'] for example in dataset]
chinese_sentences = [example['translation']['zh'] for example in dataset]

print("Loaded sentences:")
print("English:", english_sentences[:3])
print("Chinese:", chinese_sentences[:3])

# 使用 tqdm 查看進度並進行英文命名實體識別
english_ner_results = []
for sentence in tqdm(english_sentences, desc="Processing English NER"):
    words = word_tokenize(sentence)
    tagged_words = english_ner_tagger.tag(words)
    english_ner_results.append(tagged_words)

# 使用 tqdm 查看進度並進行中文命名實體識別
chinese_ner_results = []
for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER"):
    words = list(sentence)  # 中文不需要分詞，直接逐字標記
    tagged_words = chinese_ner_tagger.tag(words)
    chinese_ner_results.append(tagged_words)

# 將結果轉換為 DataFrame 便於檢視
ner_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results
})

# 顯示結果
ner_df.head()

# 保存 NER 標記結果到 CSV 文件（可選）
ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\iwslt2017_ner_tagged.csv", index=False)


Generating train split: 100%|████████████████████████████████████████| 231266/231266 [00:08<00:00, 28025.68 examples/s]
Generating test split: 100%|█████████████████████████████████████████████| 8549/8549 [00:00<00:00, 17174.59 examples/s]
Generating validation split: 100%|█████████████████████████████████████████| 879/879 [00:00<00:00, 17578.34 examples/s]


Loaded sentences:
English: ["Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.", 'I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.', 'And I say that sincerely, partly because  I need that.  Put yourselves in my position.']
Chinese: ['非常谢谢，克里斯。的确非常荣幸 能有第二次站在这个台上的机会，我真是非常感激。', '这个会议真是让我感到惊叹不已，我还要谢谢你们留下的 关于我上次演讲的精彩评论', '我是非常真诚的，部分原因是因为----我的确非常需要！ 你设身处地为我想想！']


Processing English NER:   1%|▍                                                | 1828/231266 [38:40<80:53:32,  1.27s/it]

KeyboardInterrupt

