In [1]:
from datasets import load_dataset
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# 使用 Hugging Face 的多語言 NER pipeline，載入 XLM-R 模型
ner_pipeline = pipeline("ner", model="xlm-roberta-large-finetuned-conll03-english")

# 使用 datasets 套件載入 IWSLT 2017 英中翻譯資料集，僅取前 2000 筆數據
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:2000]', trust_remote_code=True)

# 提取中英文句子
english_sentences = [example['translation']['en'] for example in dataset]
chinese_sentences = [example['translation']['zh'] for example in dataset]

# 進行英文命名實體識別
english_ner_results = []
for sentence in tqdm(english_sentences, desc="Processing English NER"):
    tagged_words = ner_pipeline(sentence)
    english_ner_results.append(tagged_words)

# 進行中文命名實體識別
chinese_ner_results = []
for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER"):
    tagged_words = ner_pipeline(sentence)
    chinese_ner_results.append(tagged_words)

# 將結果轉換為 DataFrame 便於檢視
ner_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results
})

# 保存 NER 標記結果到 CSV 文件
ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_tagged.csv", index=False, encoding='utf-8-sig')


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing English NER: 100%|██████████████████████████████████████████████████████| 2000/2000 [09:41<00:00,  3.44it/s]
Processing Chinese NER: 100%|█████████████

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\USER\\Downloads\\NLP-Courses\\NLP243\\Projects\\Test_iwslt2017_ner_tagged.csv'

In [5]:
# 保存 NER 標記結果到 CSV 文件
ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_tagged.csv", index=False, encoding='utf-8-sig')


In [14]:
# Load the tagged CSV file for processing entity merging
import pandas as pd

# Path to the uploaded tagged file
file_path = './Test_tagged.csv'
df = pd.read_csv(file_path)

# Function to merge segmented entities based on their type
# Update the merge function for Chinese to only merge entities with the same tag and strictly consecutive start:end positions
def merge_entities_for_chinese(entities):
    merged_entities = []
    temp_entity = ""
    temp_tag = None
    temp_score = 1.0  # Start with a high confidence score for the entity being merged
    temp_end = None  # Track the end position of the last added word

    for entity in entities:
        word, tag, score, start, end = entity['word'].replace("▁", ""), entity['entity'], entity['score'], entity['start'], entity['end']

        # Check if we should start a new entity based on tag or strictly consecutive positions (start == temp_end)
        if temp_tag is None or temp_tag != tag or (temp_end is not None and temp_end != start):
            # If we are starting a new entity, tag changes, or start position is not strictly consecutive, store the previous entity
            if temp_entity:
                merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})
            # Start a new entity
            temp_entity = word
            temp_tag = tag
            temp_score = score
            temp_end = end
        else:
            # Continue the current entity if the tag is the same and positions are strictly consecutive
            temp_entity += word
            temp_score = min(temp_score, score)  # Track the minimum score as the overall confidence
            temp_end = end  # Update end position for consecutive check

    # Add the final entity if any
    if temp_entity:
        merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})

    return merged_entities

# Apply the updated merging function for English and the new function for Chinese
merged_english_entities = []
merged_chinese_entities = []

for index, row in df.iterrows():
    english_entities = eval(row['English NER Tagged']) if row['English NER Tagged'] != "[]" else []
    chinese_entities = eval(row['Chinese NER Tagged']) if row['Chinese NER Tagged'] != "[]" else []

    # Apply merge function with space for English entities, and strict consecutive merge for Chinese entities
    merged_english_entities.append(merge_entities_with_space_for_english(english_entities))
    merged_chinese_entities.append(merge_entities_for_chinese(chinese_entities))

# Add merged entities to the DataFrame
df['Merged English NER Tagged'] = merged_english_entities
df['Merged Chinese NER Tagged'] = merged_chinese_entities

# Display the modified DataFrame with merged entities
df[['English Sentence', 'Merged English NER Tagged', 'Chinese Sentence', 'Merged Chinese NER Tagged']].head()

        
        

# 保存替換後的 NER 標記結果到 CSV 文件
df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Entity.csv", index=False, encoding='utf-8-sig')


In [13]:
# Load the latest tagged CSV file for processing
file_path = './Test_tagged.csv'
df = pd.read_csv(file_path)

# Function to merge segmented entitie# Update the merge function to only combine entities with the same tag and consecutive start:end positions
# Update the merge function to handle cases where consecutive entities in English require a space between them
def merge_entities_with_space_for_english(entities):
    merged_entities = []
    temp_entity = ""
    temp_tag = None
    temp_score = 1.0  # Start with a high confidence score for the entity being merged
    temp_end = None  # Track the end position of the last added word

    for entity in entities:
        word, tag, score, start, end = entity['word'].replace("▁", ""), entity['entity'], entity['score'], entity['start'], entity['end']

        # Check if we should start a new entity based on tag or position (end + 1 == start for consecutive check)
        if temp_tag is None or temp_tag != tag or (temp_end is not None and temp_end + 1 < start):
            # If we are starting a new entity, tag changes, or start position is not consecutive, store the previous entity
            if temp_entity:
                merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})
            # Start a new entity
            temp_entity = word
            temp_tag = tag
            temp_score = score
            temp_end = end
        else:
            # Continue the current entity if the tag is the same
            # Add a space if there's a gap between the previous end and current start
            if temp_end + 1 == start:
                temp_entity += " " + word
            else:
                temp_entity += word
            temp_score = min(temp_score, score)  # Track the minimum score as the overall confidence
            temp_end = end  # Update end position for consecutive check

    # Add the final entity if any
    if temp_entity:
        merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})

    return merged_entities

# Apply the updated merging function only on the English NER tagged results
merged_english_entities = []
merged_chinese_entities = []

for index, row in df.iterrows():
    english_entities = eval(row['English NER Tagged']) if row['English NER Tagged'] != "[]" else []
    chinese_entities = eval(row['Chinese NER Tagged']) if row['Chinese NER Tagged'] != "[]" else []

    # Apply merge function with space for English entities, and regular merge for Chinese entities
    merged_english_entities.append(merge_entities_with_space_for_english(english_entities))
    merged_chinese_entities.append(merge_entities_by_tag_and_consecutive_extended(chinese_entities))

# Add merged entities to the DataFrame
df['Merged English NER Tagged'] = merged_english_entities
df['Merged Chinese NER Tagged'] = merged_chinese_entities

# Display the modified DataFrame with merged entities
df[['English Sentence', 'Merged English NER Tagged', 'Chinese Sentence', 'Merged Chinese NER Tagged']].head()


# 保存替換後的 NER 標記結果到 CSV 文件
df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Entity.csv", index=False, encoding='utf-8-sig')


In [17]:
from fuzzywuzzy import fuzz
import pandas as pd

# 假設您已經加載了包含 'Merged English NER Tagged' 和 'Merged Chinese NER Tagged' 的 DataFrame df
# 使用以下代碼來建立配對字典

# 構建實體翻譯詞典
entity_dict = {}

# 遍歷每一行數據
for i, row in df.iterrows():
    english_entities = row["Merged English NER Tagged"]
    chinese_entities = row["Merged Chinese NER Tagged"]
    
    # 匹配英文和中文實體
    for en_entity in english_entities:
        for zh_entity in chinese_entities:
            # 確保標籤相同才配對
            if en_entity['tag'] == zh_entity['tag']:
                # 計算名稱的相似度，若超過閾值則加入詞典
                similarity = fuzz.ratio(en_entity['word'], zh_entity['word'])
                if similarity > 80:  # 設定相似度閾值
                    entity_dict[(en_entity['word'], en_entity['tag'])] = (zh_entity['word'], zh_entity['tag'])

# 顯示結果
print("實體翻譯詞典:")
for (en_word, en_tag), (zh_word, zh_tag) in entity_dict.items():
    print(f"英文實體: {en_word} ({en_tag})  ->  中文實體: {zh_word} ({zh_tag})")


實體翻譯詞典:
英文實體: Tipper (I-PER)  ->  中文實體: Tipper (I-PER)
英文實體: Shoney's (I-ORG)  ->  中文實體: Shoney' (I-ORG)
英文實體: Ashraf (I-PER)  ->  中文實體: Ashraf (I-PER)
英文實體: CA (I-ORG)  ->  中文實體: CA (I-ORG)
英文實體: AG (I-ORG)  ->  中文實體: AG (I-ORG)
英文實體: DC8 (I-MISC)  ->  中文實體: DC8 (I-MISC)
英文實體: SR71 (I-MISC)  ->  中文實體: SR71 (I-MISC)
英文實體: IBM (I-ORG)  ->  中文實體: IBM (I-ORG)
英文實體: Mac (I-MISC)  ->  中文實體: Mac (I-MISC)
英文實體: TED (I-ORG)  ->  中文實體: TED (I-ORG)
英文實體: DOS (I-MISC)  ->  中文實體: DOS (I-MISC)
英文實體: Photoshop (I-MISC)  ->  中文實體: Photoshop (I-MISC)
英文實體: Windows (I-MISC)  ->  中文實體: Windows (I-MISC)
英文實體: Windows PC (I-MISC)  ->  中文實體: Windows (I-MISC)
英文實體: Palm (I-ORG)  ->  中文實體: Palm (I-ORG)
英文實體: Word (I-MISC)  ->  中文實體: Word (I-MISC)
英文實體: Office (I-MISC)  ->  中文實體: Office (I-MISC)
英文實體: Woz (I-PER)  ->  中文實體: Woz (I-PER)
英文實體: iPod (I-MISC)  ->  中文實體: iPod (I-MISC)
英文實體: Sonos (I-ORG)  ->  中文實體: Sonos (I-ORG)
英文實體: CA (I-LOC)  ->  中文實體: CA (I-LOC)
英文實體: JS (I-PER)  ->  中文實體: JS (I-PER)
英文實體: iB

In [18]:
# 構建實體翻譯詞典
entity_dict = {}

# 遍歷每一行數據
for i, row in df.iterrows():
    english_entities = row["Merged English NER Tagged"]
    chinese_entities = row["Merged Chinese NER Tagged"]
    
    # 匹配英文和中文實體
    for en_idx, en_entity in enumerate(english_entities):
        en_word, en_tag = en_entity['word'], en_entity['tag']
        for zh_idx, zh_entity in enumerate(chinese_entities):
            zh_word, zh_tag = zh_entity['word'], zh_entity['tag']
            
            # 根據標籤相似性和位置相似性進行配對
            if en_tag == zh_tag or abs(en_idx - zh_idx) <= 1:  # 放寬標籤限制
                similarity = fuzz.ratio(en_word, zh_word)
                if similarity > 60:  # 設定相似度閾值，可以根據需要調整
                    entity_dict[(en_word, en_tag)] = (zh_word, zh_tag)

# 顯示結果
print("實體翻譯詞典:")
for (en_word, en_tag), (zh_word, zh_tag) in entity_dict.items():
    print(f"英文實體: {en_word} ({en_tag})  ->  中文實體: {zh_word} ({zh_tag})")

實體翻譯詞典:
英文實體: Tipper (I-PER)  ->  中文實體: Tipper (I-PER)
英文實體: Ford Taurus (I-MISC)  ->  中文實體: 福特Taurus (I-MISC)
英文實體: Shoney's (I-ORG)  ->  中文實體: ney's (I-ORG)
英文實體: G-V (I-MISC)  ->  中文實體: G-5 (I-MISC)
英文實體: TED (I-ORG)  ->  中文實體: TED (I-ORG)
英文實體: Bill Joy (I-PER)  ->  中文實體: Bill (I-PER)
英文實體: Ashraf (I-PER)  ->  中文實體: Ashraf (I-PER)
英文實體: Ashraf Ghani (I-PER)  ->  中文實體: Ashraf (I-PER)
英文實體: CA (I-ORG)  ->  中文實體: CA (I-ORG)
英文實體: AG (I-ORG)  ->  中文實體: AG (I-ORG)
英文實體: DC8 (I-MISC)  ->  中文實體: DC8 (I-MISC)
英文實體: DARPA (I-MISC)  ->  中文實體: DARPA网 (I-ORG)
英文實體: SR71 (I-MISC)  ->  中文實體: SR71 (I-MISC)
英文實體: IBM (I-ORG)  ->  中文實體: IBM (I-ORG)
英文實體: Mac (I-MISC)  ->  中文實體: Mac (I-MISC)
英文實體: DOS (I-MISC)  ->  中文實體: DOS (I-MISC)
英文實體: Photoshop (I-MISC)  ->  中文實體: Photoshop (I-MISC)
英文實體: Microsoft Word (I-MISC)  ->  中文實體: Microsoft (I-MISC)
英文實體: Microsoft Write (I-MISC)  ->  中文實體: Microsoft (I-MISC)
英文實體: Windows 2000. (I-MISC)  ->  中文實體: Windows (I-MISC)
英文實體: Windows (I-MISC)  ->  中文實體: Win

In [21]:
from datasets import load_dataset
import spacy
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm  # 新增進度條

# 1. 載入 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:2000]', trust_remote_code=True)

# 2. 載入 NER 模型（以英文的 Spacy 模型為例）
nlp = spacy.load("en_core_web_sm")

# 3. 設置翻譯模型 (使用 MarianMT 模型從英文翻譯到中文)
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# 用於儲存結果的列表
results = []

# 4. 遍歷每個句子進行 NER 識別並翻譯實體，使用 tqdm 顯示進度
for example in tqdm(dataset, desc="Processing sentences"):
    english_text = example['translation']['en']
    chinese_text = example['translation']['zh']
    
    # NER 識別
    doc = nlp(english_text)
    translated_entities = []
    
    for ent in doc.ents:
        # 翻譯識別到的實體
        translated = model.generate(**tokenizer(ent.text, return_tensors="pt", padding=True))
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        
        # 儲存原始實體、翻譯實體和實體類型
        translated_entities.append({
            "original_entity": ent.text,
            "translated_entity": translated_text,
            "entity_type": ent.label_
        })
    
    # 儲存結果（原始英文句子，中文翻譯，識別實體及其翻譯）
    results.append({
        "english_text": english_text,
        "chinese_text": chinese_text,
        "entities": translated_entities
    })

# 5. 顯示結果示例
for result in results[:5]:  # 僅顯示前五個結果
    print("英文句子:", result["english_text"])
    print("中文句子:", result["chinese_text"])
    print("識別到的實體與翻譯:")
    for entity in result["entities"]:
        print(f"  原始實體: {entity['original_entity']}, 翻譯: {entity['translated_entity']}, 類別: {entity['entity_type']}")
    print("\n")



KeyboardInterrupt



In [22]:
# 5. 將結果存入 CSV
df = pd.DataFrame(results)
df.to_csv("ner_translations.csv", index=False, encoding="utf-8")

print("NER 及翻譯結果已保存到 'ner_translations.csv'")

NER 及翻譯結果已保存到 'ner_translations.csv'


In [30]:
from datasets import load_dataset
import spacy
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm

# 載入 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:200]', trust_remote_code=True)

# 載入 NER 模型
nlp = spacy.load("en_core_web_sm")

# 設置翻譯模型
model_name = "Helsinki-NLP/opus-mt-en-zh"

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# 儲存結果的列表
results = []

# 遍歷每個句子進行 NER 識別並翻譯實體，加上 tqdm 進度條
for example in tqdm(dataset, desc="Processing sentences"):
    english_text = example['translation']['en']
    chinese_text = example['translation']['zh']
    
    # NER 識別
    doc = nlp(english_text)
    translated_entities = []
    
    for ent in doc.ents:
        # 翻譯識別到的實體，並限制生成的字數避免重複
        translated = model.generate(**tokenizer(ent.text, return_tensors="pt", padding=True), max_length=5)
        translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
        
        # 去除重複字元
        translated_text = " ".join(dict.fromkeys(translated_text.split()))
        
        # 儲存原始實體、翻譯實體和實體類型
        translated_entities.append({
            "original_entity": ent.text,
            "translated_entity": translated_text,
            "entity_type": ent.label_
        })
    
    # 儲存結果
    results.append({
        "english_text": english_text,
        "chinese_text": chinese_text,
        "entities": translated_entities
    })

# 顯示結果示例
for result in results[:5]:  # 僅顯示前五個結果
    print("英文句子:", result["english_text"])
    print("中文句子:", result["chinese_text"])
    print("識別到的實體與翻譯:")
    for entity in result["entities"]:
        print(f"  原始實體: {entity['original_entity']}, 翻譯: {entity['translated_entity']}, 類別: {entity['entity_type']}")
    print("\n")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'NllbTokenizer'. 
The class this function is called from is 'MarianTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [26]:
# 5. 將結果存入 CSV
df = pd.DataFrame(results)
df.to_csv("ner_translations.csv", index=False, encoding="utf-8")

print("NER 及翻譯結果已保存到 'ner_translations.csv'")

NER 及翻譯結果已保存到 'ner_translations.csv'


In [31]:
from datasets import load_dataset
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# 載入 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:200]', trust_remote_code=True)

# 載入 NER 模型
nlp = spacy.load("en_core_web_sm")

# 使用 NLLB 模型進行翻譯
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 儲存結果的列表
results = []

# 遍歷每個句子進行 NER 識別並翻譯實體，加上 tqdm 進度條
for example in tqdm(dataset, desc="Processing sentences"):
    english_text = example['translation']['en']
    chinese_text = example['translation']['zh']
    
    # NER 識別
    doc = nlp(english_text)
    translated_entities = []
    
    for ent in doc.ents:
        # 翻譯識別到的實體，並限制生成的字數避免重複
        inputs = tokenizer(ent.text, return_tensors="pt")
        translated_tokens = model.generate(inputs["input_ids"], max_length=5, forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"])
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        
        # 去除重複字元
        translated_text = " ".join(dict.fromkeys(translated_text.split()))
        
        # 儲存原始實體、翻譯實體和實體類型
        translated_entities.append({
            "original_entity": ent.text,
            "translated_entity": translated_text,
            "entity_type": ent.label_
        })
    
    # 儲存結果
    results.append({
        "english_text": english_text,
        "chinese_text": chinese_text,
        "entities": translated_entities
    })

# 顯示結果示例
for result in results[:5]:  # 僅顯示前五個結果
    print("英文句子:", result["english_text"])
    print("中文句子:", result["chinese_text"])
    print("識別到的實體與翻譯:")
    for entity in result["entities"]:
        print(f"  原始實體: {entity['original_entity']}, 翻譯: {entity['translated_entity']}, 類別: {entity['entity_type']}")
    print("\n")


  return torch.load(checkpoint_file, map_location=map_location)
Processing sentences: 100%|██████████████████████████████████████████████████████████| 200/200 [01:42<00:00,  1.94it/s]

英文句子: Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
中文句子: 非常谢谢，克里斯。的确非常荣幸 能有第二次站在这个台上的机会，我真是非常感激。
識別到的實體與翻譯:
  原始實體: Chris, 翻譯: 克里, 類別: PERSON


英文句子: I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.
中文句子: 这个会议真是让我感到惊叹不已，我还要谢谢你们留下的 关于我上次演讲的精彩评论
識別到的實體與翻譯:


英文句子: And I say that sincerely, partly because  I need that.  Put yourselves in my position.
中文句子: 我是非常真诚的，部分原因是因为----我的确非常需要！ 你设身处地为我想想！
識別到的實體與翻譯:


英文句子: I flew on Air Force Two for eight years.
中文句子: 我坐了8年的空军二号。
識別到的實體與翻譯:
  原始實體: Air Force Two, 翻譯: 航空队, 類別: PRODUCT
  原始實體: eight years, 翻譯: 八年, 類別: DATE


英文句子: Now I have to take off my shoes or boots to get on an airplane!
中文句子: 不过现在上飞机前我则要脱掉我的鞋子
識別到的實體與翻譯:







In [33]:
from datasets import load_dataset
import spacy
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
from tqdm import tqdm

# 載入 IWSLT 2017 英中翻譯資料集
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:200]', trust_remote_code=True)

# 載入 NER 模型
nlp = spacy.load("en_core_web_sm")

# 設置 mBART 翻譯模型
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# 設定源語言和目標語言
tokenizer.src_lang = "en_XX"

# 儲存結果的列表
results = []

# 遍歷每個句子進行 NER 識別並翻譯實體，加上 tqdm 進度條
for example in tqdm(dataset, desc="Processing sentences"):
    english_text = example['translation']['en']
    chinese_text = example['translation']['zh']
    
    # NER 識別
    doc = nlp(english_text)
    translated_entities = []
    
    for ent in doc.ents:
        # 翻譯識別到的實體，並限制生成的字數避免重複
        inputs = tokenizer(ent.text, return_tensors="pt")
        translated_tokens = model.generate(inputs["input_ids"], max_length=10, forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"])
        translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
        
        # 去除重複字元
        translated_text = " ".join(dict.fromkeys(translated_text.split()))
        
        # 儲存原始實體、翻譯實體和實體類型
        translated_entities.append({
            "original_entity": ent.text,
            "translated_entity": translated_text,
            "entity_type": ent.label_
        })
    
    # 儲存結果
    results.append({
        "english_text": english_text,
        "chinese_text": chinese_text,
        "entities": translated_entities
    })

# 顯示結果示例
for result in results[:5]:  # 僅顯示前五個結果
    print("英文句子:", result["english_text"])
    print("中文句子:", result["chinese_text"])
    print("識別到的實體與翻譯:")
    for entity in result["entities"]:
        print(f"  原始實體: {entity['original_entity']}, 翻譯: {entity['translated_entity']}, 類別: {entity['entity_type']}")
    print("\n")


Processing sentences: 100%|██████████████████████████████████████████████████████████| 200/200 [03:16<00:00,  1.02it/s]

英文句子: Thank you so much, Chris. And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
中文句子: 非常谢谢，克里斯。的确非常荣幸 能有第二次站在这个台上的机会，我真是非常感激。
識別到的實體與翻譯:
  原始實體: Chris, 翻譯: 克里斯, 類別: PERSON


英文句子: I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.
中文句子: 这个会议真是让我感到惊叹不已，我还要谢谢你们留下的 关于我上次演讲的精彩评论
識別到的實體與翻譯:


英文句子: And I say that sincerely, partly because  I need that.  Put yourselves in my position.
中文句子: 我是非常真诚的，部分原因是因为----我的确非常需要！ 你设身处地为我想想！
識別到的實體與翻譯:


英文句子: I flew on Air Force Two for eight years.
中文句子: 我坐了8年的空军二号。
識別到的實體與翻譯:
  原始實體: Air Force Two, 翻譯: 空军二号, 類別: PRODUCT
  原始實體: eight years, 翻譯: 八年, 類別: DATE


英文句子: Now I have to take off my shoes or boots to get on an airplane!
中文句子: 不过现在上飞机前我则要脱掉我的鞋子
識別到的實體與翻譯:







In [34]:
# 5. 將結果存入 CSV
df = pd.DataFrame(results)
df.to_csv("ner_translations.csv", index=False, encoding="utf-8")

print("NER 及翻譯結果已保存到 'ner_translations.csv'")

NER 及翻譯結果已保存到 'ner_translations.csv'
