In [None]:
from datasets import load_dataset
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Full paths to the Stanford NER jar and the classifier models.
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

# Initialise the English and Chinese NER taggers.
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# Load the IWSLT 2017 English-Chinese training split with the datasets package.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# Extract the parallel English and Chinese sentences.
english_sentences = [example['translation']['en'] for example in dataset]
chinese_sentences = [example['translation']['zh'] for example in dataset]


def _split_chinese_characters(sentence):
    """Return the non-whitespace characters of *sentence* as tagger tokens.

    The Chinese text is tagged character by character, without word
    segmentation. Whitespace characters are skipped: NLTK passes tokens to
    the Stanford tagger as space-separated text, so a whitespace token would
    not come back as a token and would leave the output shorter than the
    input.
    """
    return [char for char in sentence if not char.isspace()]


def _tag_in_batches(tagger, sentences, tokenize, desc, batch_size=1000):
    """Tokenize and NER-tag *sentences*, sending whole batches to the tagger.

    Calling ``tagger.tag()`` once per sentence launches a separate Java
    process each time, which dominates the run time on a corpus of this size.
    ``tag_sents()`` tags a whole list of token lists with one launch.

    Args:
        tagger: Tagger object exposing ``tag`` and ``tag_sents``.
        sentences: List of raw sentence strings.
        tokenize: Callable turning one sentence string into a token list.
        desc: Label shown on the progress bar.
        batch_size: Number of sentences sent to the tagger per call.

    Returns:
        One list of ``(word, tag)`` tuples per input sentence, in order.
    """
    results = []
    with tqdm(total=len(sentences), desc=desc) as progress:
        for start in range(0, len(sentences), batch_size):
            batch = [tokenize(text) for text in sentences[start:start + batch_size]]
            tagged_batch = tagger.tag_sents(batch)

            # The batch output is split back into sentences by input token
            # counts. If the tagger returned a different number of tokens,
            # the split cannot be trusted, so redo this batch one sentence
            # at a time to confine any mismatch to a single sentence.
            expected = sum(len(tokens) for tokens in batch)
            received = sum(len(tagged) for tagged in tagged_batch)
            if received != expected:
                tagged_batch = [tagger.tag(tokens) for tokens in batch]

            results.extend(tagged_batch)
            progress.update(len(batch))
    return results


# English named-entity recognition on word tokens.
english_ner_results = _tag_in_batches(
    english_ner_tagger,
    english_sentences,
    word_tokenize,
    desc="Processing English NER",
)

# Chinese named-entity recognition on individual characters.
chinese_ner_results = _tag_in_batches(
    chinese_ner_tagger,
    chinese_sentences,
    _split_chinese_characters,
    desc="Processing Chinese NER",
)

# Collect the sentences and their tagged tokens into a DataFrame.
ner_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results
})

# Save the NER tagging results to a CSV file.
ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\iwslt2017_ner_tagged_sentences.csv", index=False, encoding='utf-8-sig')

# Gather every distinct NER tag that appears in either language.
unique_english_tags = {tag for sentence in english_ner_results for _, tag in sentence}
unique_chinese_tags = {tag for sentence in chinese_ner_results for _, tag in sentence}
all_unique_tags = unique_english_tags.union(unique_chinese_tags)
print("All unique NER tags:", all_unique_tags)

# Rewrite a tagged token sequence so named entities are wrapped in tag markers.
def replace_with_tags(sentence, tagged_words):
    """Build a sentence string with each named-entity span wrapped in markers.

    Consecutive tokens that share the same non-'O' tag are treated as one
    entity span, emitted as ``<TAG>first ... last <\\TAG>``. Every opened
    span is closed, including one that runs to the end of the sentence.

    Args:
        sentence: The original sentence text. Not used for the output; kept
            so existing callers that pass it keep working.
        tagged_words: Iterable of ``(word, tag)`` pairs, where the tag 'O'
            marks a token outside any entity.

    Returns:
        The tokens joined by single spaces with entity markers inserted,
        stripped of leading and trailing whitespace. Empty input gives "".
    """
    pieces = []
    open_tag = None  # tag of the entity span currently open, if any

    for word, tag in tagged_words:
        # A change of tag ends the span that is currently open.
        if open_tag is not None and tag != open_tag:
            # NOTE: the closing marker uses a backslash (<\TAG>), exactly as
            # the previous implementation emitted it.
            pieces.append(f"<\\{open_tag}>")
            open_tag = None

        if tag == 'O':
            pieces.append(word)
        elif open_tag is None:
            # First token of a new entity span carries the opening marker.
            pieces.append(f"<{tag}>{word}")
            open_tag = tag
        else:
            # Continuation of the span that is already open.
            pieces.append(word)

    # Close a span that runs to the end of the sentence.
    if open_tag is not None:
        pieces.append(f"<\\{open_tag}>")

    return " ".join(pieces).strip()

# Produce the entity-marked version of every sentence in both languages.
tagged_english_sentences = list(
    map(replace_with_tags, english_sentences, english_ner_results)
)
tagged_chinese_sentences = list(
    map(replace_with_tags, chinese_sentences, chinese_ner_results)
)

# Put the original and marked-up sentences side by side and save them as CSV.
tagged_df = pd.DataFrame(
    {
        "Original English Sentence": english_sentences,
        "Tagged English Sentence": tagged_english_sentences,
        "Original Chinese Sentence": chinese_sentences,
        "Tagged Chinese Sentence": tagged_chinese_sentences,
    }
)

tagged_df.to_csv(
    r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\iwslt2017_tagged_sentences.csv",
    index=False,
    encoding='utf-8-sig',
)


  from .autonotebook import tqdm as notebook_tqdm
Processing English NER:  30%|█████████████▍                               | 69315/231266 [23:13:00<88:09:09,  1.96s/it]