In [1]:
# Collect every distinct NER tag found in the Chinese tagged CSV.
import ast

import pandas as pd

# Load the CSV that holds the tagger output.
file_path = './Chience_iwslt20e17_ner_tagged.csv'
df = pd.read_csv(file_path)

# Each cell of "Chinese NER Tagged" is expected to hold the string form of a
# list of 4-tuples: [('Entity', 'Tag', start, end), ...]
unique_tags = set()

for tagged_data in df['Chinese NER Tagged']:
    # pandas loads an empty CSV cell as float NaN, which is truthy and would
    # make ast.literal_eval raise ValueError; only string cells can be parsed.
    if not isinstance(tagged_data, str):
        continue
    tagged_data = tagged_data.strip()
    # Nothing to extract from a blank cell or an empty list.
    if not tagged_data or tagged_data == "[]":
        continue
    # literal_eval accepts Python literals only, so no code from the file is executed.
    entities = ast.literal_eval(tagged_data)
    # Only the tag (second field) is needed; the other three fields are ignored.
    for _, tag, _, _ in entities:
        unique_tags.add(tag)

# Last expression of the cell: display the distinct tags.
unique_tags


{'nr', 'ns'}

In [3]:
# Collect every distinct NER tag found in the English tagged CSV.
# Relies on `pd` and `ast` having been imported by an earlier cell.
file_path_new = './Test_iwslt2017_ner_tagged.csv'
df_new = pd.read_csv(file_path_new)

# Each cell of "English NER Tagged" is parsed below as the string form of a
# list of 2-tuples whose second field is the tag.
unique_english_tags = set()

for tagged_data in df_new['English NER Tagged']:
    # pandas loads an empty CSV cell as float NaN, which is truthy and would
    # make ast.literal_eval raise ValueError; only string cells can be parsed.
    if not isinstance(tagged_data, str):
        continue
    tagged_data = tagged_data.strip()
    # Nothing to extract from a blank cell or an empty list.
    if not tagged_data or tagged_data == "[]":
        continue
    # literal_eval accepts Python literals only, so no code from the file is executed.
    entities = ast.literal_eval(tagged_data)
    # Only the tag (second field) is needed; the first field is ignored.
    for _, tag in entities:
        unique_english_tags.add(tag)

# Last expression of the cell: display the distinct tags.
unique_english_tags


{'LOCATION', 'MISC', 'O', 'ORGANIZATION', 'PERSON'}

In [4]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import hanlp

# Settings a reader may want to tune, gathered in one place.
NER_TASK = 'ner/pku'            # key of the NER result read from each HanLP output
DATASET_SPLIT = 'train[:2000]'  # slice of the training split that gets tagged
# NOTE(review): absolute, machine-specific location; change this before running
# the cell on another machine.
OUTPUT_CSV = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\Chience_ner_tagged_2000.csv"

# Load the HanLP multi-task model.
hanlp_model = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

# Load the IWSLT 2017 English-Chinese translation dataset with the `datasets`
# package; trust_remote_code=True is passed so the dataset's loading script may run.
dataset = load_dataset('iwslt2017', 'iwslt2017-en-zh', split=DATASET_SPLIT, trust_remote_code=True)

# Keep only the Chinese side of every translation pair.
chinese_sentences = [example['translation']['zh'] for example in dataset]

# Run Chinese named-entity recognition with HanLP, one sentence at a time.
chinese_ner_results = []
for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER with HanLP"):
    # Only the NER output is used below, so restrict the multi-task model to
    # that task instead of running every task it bundles for each sentence.
    tagged_words = hanlp_model(sentence, tasks=NER_TASK)[NER_TASK]
    chinese_ner_results.append(tagged_words)

# Put sentences and their NER results side by side for inspection.
ner_df = pd.DataFrame({
    "Chinese Sentence": chinese_sentences,
    "Chinese NER Tagged": chinese_ner_results
})

# Save the NER results to a CSV file (the utf-8-sig encoding writes a BOM).
ner_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')



  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Processing Chinese NER with HanLP: 100%|███████████████████████████████████████████| 2000/2000 [11:17<00:00,  2.95it/s]
