In [1]:
from datasets import load_dataset
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Build a Hugging Face "ner" pipeline from an XLM-R checkpoint.
# NOTE(review): the checkpoint name says it was fine-tuned on English CoNLL-03, yet the
# same pipeline is also applied to the Chinese sentences below — confirm this is intended.
ner_pipeline = pipeline("ner", model="xlm-roberta-large-finetuned-conll03-english")

# Load the IWSLT 2017 English-Chinese translation dataset; only the first 2000 training rows are used.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split='train[:2000]', trust_remote_code=True)

# Split every translation pair into its English and its Chinese side.
translation_pairs = [record['translation'] for record in dataset]
english_sentences = [pair['en'] for pair in translation_pairs]
chinese_sentences = [pair['zh'] for pair in translation_pairs]

# Run named-entity recognition on the English sentences, one sentence per call.
english_ner_results = [
    ner_pipeline(text)
    for text in tqdm(english_sentences, desc="Processing English NER")
]

# Run named-entity recognition on the Chinese sentences with the same pipeline.
chinese_ner_results = [
    ner_pipeline(text)
    for text in tqdm(chinese_sentences, desc="Processing Chinese NER")
]

# Put sentences and raw tagger output side by side in a DataFrame for inspection.
ner_columns = {
    "English Sentence": english_sentences,
    "English NER Tagged": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results,
}
ner_df = pd.DataFrame(ner_columns)

# Save the NER tagging results to a CSV file.
# NOTE(review): absolute, machine-specific path; other cells read './Test_tagged.csv' —
# confirm both refer to the same file.
ner_df.to_csv(
    r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_tagged.csv",
    index=False,
    encoding='utf-8-sig',
)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Processing English NER: 100%|██████████████████████████████████████████████████████| 2000/2000 [09:41<00:00,  3.44it/s]
Processing Chinese NER: 100%|█████████████

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\USER\\Downloads\\NLP-Courses\\NLP243\\Projects\\Test_iwslt2017_ner_tagged.csv'

In [5]:
# Save the NER tagging results held in ner_df to a CSV file.
ner_df.to_csv(
    r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Test_tagged.csv",
    encoding='utf-8-sig',
    index=False,
)


In [14]:
# Load the tagged CSV file for processing entity merging
import pandas as pd

# CSV with the tagged sentences, resolved relative to the current working directory
file_path = './Test_tagged.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Merge consecutive token-level predictions into whole entities for the Chinese text.
# Tokens are joined only when they carry the same tag and each token starts exactly where the previous one ended.
def merge_entities_for_chinese(entities):
    """Merge token-level NER predictions into contiguous entities.

    A token extends the entity being built only when it has the same tag and
    its 'start' equals the 'end' of the previous token; otherwise the entity
    built so far is emitted and a new one begins. The "▁" marker is removed
    from every token's text before joining.

    Args:
        entities: iterable of dicts with the keys 'word', 'entity', 'score',
            'start' and 'end'.

    Returns:
        A list of dicts with the keys "word", "tag" and "score", one per
        merged entity; "score" is the lowest score among the merged tokens.
        Entities whose text is empty are not emitted.
    """
    merged = []
    text, label, confidence, last_end = "", None, 1.0, None

    for token in entities:
        piece = token['word'].replace("▁", "")
        tag, score = token['entity'], token['score']
        start, end = token['start'], token['end']

        # The token continues the current run only if a run exists, the tag is
        # unchanged, and there is no positional break between the two tokens.
        continues_run = (
            label is not None
            and label == tag
            and (last_end is None or last_end == start)
        )

        if continues_run:
            text += piece
            confidence = min(confidence, score)  # weakest token decides the confidence
        else:
            if text:
                merged.append({"word": text, "tag": label, "score": confidence})
            text, label, confidence = piece, tag, score

        # Both paths remember where this token ended for the next adjacency check.
        last_end = end

    # Emit whatever is still being built once the input is exhausted.
    if text:
        merged.append({"word": text, "tag": label, "score": confidence})

    return merged

import ast

# Merge the token-level tags of every row: English tokens are joined with
# spaces where needed, Chinese tokens only when they are strictly adjacent.
# NOTE(review): merge_entities_with_space_for_english is defined in a cell that
# appears further down in this notebook (In [13]); that cell must run first, or
# the definition must be moved above this cell, for a top-to-bottom run to work.

# The tag columns were written to CSV as the text form of Python lists. They are
# parsed back with ast.literal_eval, which accepts literals only and, unlike
# eval, cannot execute code embedded in the file ("[]" parses to an empty list).
merged_english_entities = [
    merge_entities_with_space_for_english(ast.literal_eval(cell))
    for cell in df['English NER Tagged']
]
merged_chinese_entities = [
    merge_entities_for_chinese(ast.literal_eval(cell))
    for cell in df['Chinese NER Tagged']
]

# Add merged entities to the DataFrame
df['Merged English NER Tagged'] = merged_english_entities
df['Merged Chinese NER Tagged'] = merged_chinese_entities

# Save the merged NER results to a CSV file.
df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Entity.csv", index=False, encoding='utf-8-sig')

# Preview the merged columns. This is kept as the last expression of the cell
# because a notebook only renders the value of the final expression.
df[['English Sentence', 'Merged English NER Tagged', 'Chinese Sentence', 'Merged Chinese NER Tagged']].head()


In [13]:
# Read the tagged CSV (path is relative to the current working directory)
file_path = './Test_tagged.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Merge consecutive token-level predictions that share a tag into whole entities for the English text.
# A space is inserted between two tokens when the second one starts one character after the first one ended.
def merge_entities_with_space_for_english(entities):
    """Merge token-level NER predictions into multi-token entities for English text.

    Consecutive tokens are combined while they carry the same tag and at most
    one character separates them. A single space is inserted when exactly one
    character lies between two tokens; otherwise the texts are concatenated
    directly. The "▁" marker is removed from every token's text.

    Args:
        entities: iterable of dicts with the keys 'word', 'entity', 'score',
            'start' and 'end'. 'start'/'end' may be None; in that case tokens
            with the same tag are always merged and a space is inserted before
            tokens whose raw text begins with "▁".

    Returns:
        A list of dicts with the keys "word", "tag" and "score", one per
        merged entity; "score" is the lowest score among the merged tokens.
        Entities whose text is empty are not emitted.
    """
    merged_entities = []
    temp_entity = ""
    temp_tag = None
    temp_score = 1.0  # Placeholder; replaced by the first token's score
    temp_end = None  # End position of the last token added to the current entity

    for entity in entities:
        raw_word = entity['word']
        word = raw_word.replace("▁", "")
        tag, score, start, end = entity['entity'], entity['score'], entity['start'], entity['end']

        # Positional checks are only meaningful when both offsets are present.
        # Previously a missing offset raised TypeError in the arithmetic below.
        offsets_known = temp_end is not None and start is not None

        # Start a new entity on the first token, on a tag change, or when more
        # than one character separates this token from the previous one.
        if temp_tag is None or temp_tag != tag or (offsets_known and temp_end + 1 < start):
            if temp_entity:
                merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})
            temp_entity = word
            temp_tag = tag
            temp_score = score
        else:
            if offsets_known:
                # Exactly one character between the tokens: restore it as a space.
                needs_space = temp_end + 1 == start
            else:
                # Without offsets, fall back to the "▁" prefix of the raw token.
                needs_space = raw_word.startswith("▁")
            temp_entity += (" " + word) if needs_space else word
            temp_score = min(temp_score, score)  # Lowest token score is the entity confidence

        temp_end = end  # Remember where this token ended for the next comparison

    # Add the final entity if any
    if temp_entity:
        merged_entities.append({"word": temp_entity, "tag": temp_tag, "score": temp_score})

    return merged_entities

import ast

# Merge the token-level tags of every row, English and Chinese separately.
# NOTE(review): merge_entities_by_tag_and_consecutive_extended is not defined
# in the visible part of this notebook — confirm it still exists, or switch to
# the Chinese merge function that the notebook currently defines.

# The tag columns were written to CSV as the text form of Python lists. They are
# parsed back with ast.literal_eval, which accepts literals only and, unlike
# eval, cannot execute code embedded in the file ("[]" parses to an empty list).
merged_english_entities = [
    merge_entities_with_space_for_english(ast.literal_eval(cell))
    for cell in df['English NER Tagged']
]
merged_chinese_entities = [
    merge_entities_by_tag_and_consecutive_extended(ast.literal_eval(cell))
    for cell in df['Chinese NER Tagged']
]

# Add merged entities to the DataFrame
df['Merged English NER Tagged'] = merged_english_entities
df['Merged Chinese NER Tagged'] = merged_chinese_entities

# Save the merged NER results to a CSV file.
df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Entity.csv", index=False, encoding='utf-8-sig')

# Preview the merged columns. This is kept as the last expression of the cell
# because a notebook only renders the value of the final expression.
df[['English Sentence', 'Merged English NER Tagged', 'Chinese Sentence', 'Merged Chinese NER Tagged']].head()
