In [2]:
from datasets import load_dataset
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import random

# Full paths to the Stanford NER jar and the English / Chinese CRF models
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# Load the IWSLT 2017 English-Chinese translation dataset (train split);
# trust_remote_code=True is passed so the dataset's loading script may run.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train', trust_remote_code=True)

# Hold out `sample_size` random examples for testing and keep the rest as the
# reduced training set. Both splits are carved out of the SAME shuffled view:
# selecting the training rows from the unshuffled dataset (as was done before)
# let held-out test examples reappear in the training split.
sample_size = 100
shuffled_dataset = dataset.shuffle(seed=42)
test_samples = shuffled_dataset.select(range(sample_size))
train_samples = shuffled_dataset.select(range(sample_size, len(shuffled_dataset)))

# Pull the parallel English / Chinese sentences out of the held-out examples
english_sentences = [example['translation']['en'] for example in test_samples]
chinese_sentences = [example['translation']['zh'] for example in test_samples]

print("Sample sentences for testing:")
print("English:", english_sentences[:3])
print("Chinese:", chinese_sentences[:3])

# English NER: tokenize with NLTK, then tag one sentence at a time (tqdm shows progress).
# NOTE(review): tagging sentence-by-sentence is slow (seconds per sentence in the
# recorded run); batching via the tagger's `tag_sents` may help, but confirm that
# token alignment is preserved before switching.
english_ner_results = []
for sentence in tqdm(english_sentences, desc="Processing English NER"):
    words = word_tokenize(sentence)
    tagged_words = english_ner_tagger.tag(words)
    english_ner_results.append(tagged_words)

# Chinese NER: no word segmentation is applied; every character is passed to the
# tagger as its own token.
# NOTE(review): the Stanford Chinese model presumably expects word-segmented
# input - confirm whether character-level tokens give acceptable tags.
chinese_ner_results = []
for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER"):
    words = list(sentence)
    tagged_words = chinese_ner_tagger.tag(words)
    chinese_ner_results.append(tagged_words)

# Gather sentences and their tagged tokens in one DataFrame for inspection
ner_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results
})

# Save the NER tagging results to a CSV file (optional)
ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\iwslt2017_ner_tagged_test_sample.csv", index=False)

# Preview the result; kept as the cell's last expression so the notebook
# actually renders it (a bare expression in the middle of a cell is discarded).
ner_df.head()


Sample sentences for testing:
English: ["And deterrence theory is a very simple theory of human behavior: If you want somebody to do less of something, add a punishment and they'll do less of it.", 'If you were to walk into one of these rooms, you would see this.', 'We tried to answer this simple question: Can you find a unifying language that cuts across age and income and culture that will help people themselves find a new way of living, see spaces around them differently, think about the resources they use differently, interact differently?']
Chinese: ['威慑理论是人类行为最简单的理论。 如果你不想要人们做某些事情， 只需要加上惩罚，人们便不会去做了。', '如果你走进这其中一个房间，你就会看到这个。', '我们试图回答这个简单的问题 你能找到一种统一的语言 即使跨越时代 收入和文化却仍然能够帮助人们自己 找到一种新的生活方式 看看身边独特的空间 想想人们别出心裁的 不同交流方式 的不同资源']


Processing English NER: 100%|████████████████████████████████████████████████████████| 100/100 [02:18<00:00,  1.39s/it]
Processing Chinese NER: 100%|████████████████████████████████████████████████████████| 100/100 [04:07<00:00,  2.48s/it]


In [7]:
import ast

import pandas as pd

# Load the CSV file holding the NER-tagged sample
file_path = 'C:\\Users\\USER\\Downloads\\NLP-Courses\\NLP243\\Projects\\iwslt2017_ner_tagged_test_sample.csv'

ner_df = pd.read_csv(file_path)

# The tagged columns come back from the CSV as strings holding Python list
# literals of (token, tag) pairs. ast.literal_eval parses such literals without
# executing code, unlike eval, which would run whatever the file contains.
english_tags = ner_df['English NER Tagged'].apply(ast.literal_eval)
chinese_tags = ner_df['Chinese NER Tagged'].apply(ast.literal_eval)

# Distinct tags produced for the English sentences
unique_english_tags = {tag for sentence in english_tags for _, tag in sentence}

# Distinct tags produced for the Chinese sentences
unique_chinese_tags = {tag for sentence in chinese_tags for _, tag in sentence}

# Union of the tags seen in either language
all_unique_tags = unique_english_tags.union(unique_chinese_tags)

# Report the result
print("All unique NER tags:", all_unique_tags)


All unique NER tags: {'LOCATION', 'O', 'MISC', 'ORGANIZATION', 'GPE', 'PERSON'}


# 命名實體標記（NER tags）有以下幾種類型：

- **GPE** - 地理政治實體，例如國家、城市等。
- **LOCATION** - 一般地點名稱。
- **MISC** - 其他未分類的實體類別。
- **O** - 表示該詞或字不是任何命名實體。
- **ORGANIZATION** - 組織名稱，例如公司、機構等。
- **PERSON** - 人名。

In [11]:
# Rewrite a tagged token sequence as text with inline entity markup
def replace_with_tags(sentence, tagged_words):
    """Render NER-tagged tokens as a string with ``<TAG>...</TAG>`` entity spans.

    Consecutive tokens that share the same non-'O' tag are grouped into one
    span wrapped in a single opening and closing tag; tokens tagged 'O' are
    emitted unchanged. All pieces are joined with single spaces.

    Args:
        sentence: The original sentence. Not used to build the result; kept so
            existing callers that pass it keep working.
        tagged_words: Iterable of ``(word, tag)`` pairs, where the tag 'O'
            marks a token outside any entity.

    Returns:
        The space-joined string with entity spans marked up, or an empty
        string when ``tagged_words`` is empty.
    """
    pieces = []        # finished output segments (plain tokens or whole spans)
    span_words = []    # tokens of the entity span currently being collected
    span_tag = None    # tag of that span, or None when outside an entity

    def close_span():
        # Emit the collected span as one <TAG>...</TAG> segment and reset it.
        if span_words:
            pieces.append("<" + span_tag + ">" + " ".join(span_words) + "</" + span_tag + ">")
            span_words.clear()

    for word, tag in tagged_words:
        # A tag change (entity -> 'O', or entity -> different entity) ends the
        # span in progress. The decision depends only on tags, never on the
        # word's text, so every opened tag is guaranteed to be closed.
        if tag != span_tag:
            close_span()
        if tag == 'O':
            span_tag = None
            pieces.append(word)
        else:
            span_tag = tag
            span_words.append(word)

    # An entity that runs to the end of the sentence still needs its closing tag.
    close_span()
    return " ".join(pieces).strip()

# Produce the marked-up version of every sentence, one language at a time
tagged_english_sentences = list(map(replace_with_tags, english_sentences, english_ner_results))
tagged_chinese_sentences = list(map(replace_with_tags, chinese_sentences, chinese_ner_results))

# Put the original and tagged sentences side by side in a DataFrame for inspection
tagged_df = pd.DataFrame(
    data={
        "Original English Sentence": english_sentences,
        "Tagged English Sentence": tagged_english_sentences,
        "Original Chinese Sentence": chinese_sentences,
        "Tagged Chinese Sentence": tagged_chinese_sentences,
    }
)

# Show the first rows
print(tagged_df.head())

# Save the tagged sentences to a CSV file, specifying the UTF-8 (with BOM)
# encoding to avoid garbled characters
tagged_df.to_csv(
    r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\iwslt2017_tagged_sentences.csv",
    index=False,
    encoding='utf-8-sig',
)


                           Original English Sentence  \
0  And deterrence theory is a very simple theory ...   
1  If you were to walk into one of these rooms, y...   
2  We tried to answer this simple question: Can y...   
3  Here's how a philosopher explains the sawing-t...   
4  So you create a bridge between the media and t...   

                             Tagged English Sentence  \
0  And deterrence theory is a very simple theory ...   
1  If you were to walk into one of these rooms , ...   
2  We tried to answer this simple question : Can ...   
3  Here 's how a philosopher explains the sawing-...   
4  So you create a bridge between the media and t...   

                           Original Chinese Sentence  \
0  威慑理论是人类行为最简单的理论。 如果你不想要人们做某些事情， 只需要加上惩罚，人们便不会去做了。   
1                              如果你走进这其中一个房间，你就会看到这个。   
2  我们试图回答这个简单的问题 你能找到一种统一的语言 即使跨越时代 收入和文化却仍然能够帮助人...   
3                                哲学家如何解释将女士锯成两半的魔术呢。   
4                    所以你就架起了一座桥梁， 一头是媒体，一头是那些不