In [5]:
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import jieba
import spacy

# 設置 Stanford NER jar 和模型的完整路徑
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"

# 初始化 Stanford NER Tagger 進行英文的 NER
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')

# 初始化 SpaCy 進行中文的 NER
nlp_zh = spacy.load("zh_core_web_md")

# 定義城市代碼
city_codes = {
    "New York": "NYC",
    "San Francisco": "SFO",
    "Los Angeles": "LA"
}

# 讀取 CSV 文件，指定編碼
train_df = pd.read_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\TEST_Train.csv", encoding='ISO-8859-1')
english_sentences = train_df['translation.en'].dropna().tolist()
chinese_sentences = train_df['translation.zh'].dropna().tolist()

# 提取英文和中文命名實體
def extract_entities(tagged_words):
    return [(word, tag) for word, tag in tagged_words if tag != "O"]

# 進行 NER 標記並提取實體
english_ner_results = []
chinese_ner_results = []

for en_sentence, zh_sentence in tqdm(zip(english_sentences, chinese_sentences), desc="Processing NER", total=len(english_sentences)):
    # 英文命名實體識別
    en_words = word_tokenize(en_sentence)
    en_tagged = english_ner_tagger.tag(en_words)
    en_entities = extract_entities(en_tagged)
    
    # 對於每個城市名稱，加入城市代碼
    enriched_en_entities = []
    for word, tag in en_entities:
        if tag == "LOCATION" and word in city_codes:  # 如果是城市名稱
            enriched_en_entities.append((f"{word} ({city_codes[word]})", tag))
        else:
            enriched_en_entities.append((word, tag))
    english_ner_results.append(enriched_en_entities)

    # 中文命名實體識別
    zh_doc = nlp_zh(zh_sentence)
    zh_entities = [(ent.text, ent.label_) for ent in zh_doc.ents]
    chinese_ner_results.append(zh_entities)

# 將結果放入 DataFrame
mapped_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English Entities": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese Entities": chinese_ner_results
})

# 顯示結果
mapped_df.head()

# 保存映射結果
mapped_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\TEST_Train_mapped_results.csv", index=False)


Processing NER: 100%|██████████████████████████████████████████████████████████████████| 49/49 [00:59<00:00,  1.21s/it]


In [8]:
import pandas as pd
from tqdm import tqdm
import jieba
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# 初始化 Hugging Face NER 模型
english_model_name = "dslim/bert-base-NER"  # 英文 NER 模型
chinese_model_name = "bert-base-chinese"    # 中文模型（如果有專門的 NER 模型也可以用）

# 加載模型和 tokenizer
tokenizer_en = AutoTokenizer.from_pretrained(english_model_name)
model_en = AutoModelForTokenClassification.from_pretrained(english_model_name)
ner_pipeline_en = pipeline("ner", model=model_en, tokenizer=tokenizer_en)

tokenizer_zh = AutoTokenizer.from_pretrained(chinese_model_name)
model_zh = AutoModelForTokenClassification.from_pretrained(chinese_model_name)
ner_pipeline_zh = pipeline("ner", model=model_zh, tokenizer=tokenizer_zh)

# 定義城市代碼
city_codes = {
    "New York": "NYC",
    "San Francisco": "SFO",
    "Los Angeles": "LA"
}

# 讀取 CSV 文件
train_df = pd.read_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\TEST_Train.csv", encoding='ISO-8859-1')
english_sentences = train_df['translation.en'].dropna().tolist()
chinese_sentences = train_df['translation.zh'].dropna().tolist()

# 提取英文和中文命名實體
def extract_entities_ner(ner_results, city_codes=None):
    entities = []
    for entity in ner_results:
        word = entity['word']
        entity_type = entity['entity'].split("_")[-1]  # 取得實體類型
        if city_codes and word in city_codes and entity_type == "LOC":  # 如果是城市
            entities.append((f"{word} ({city_codes[word]})", entity_type))
        else:
            entities.append((word, entity_type))
    return entities

# 進行 NER 標記並提取實體
english_ner_results = []
chinese_ner_results = []

for en_sentence, zh_sentence in tqdm(zip(english_sentences, chinese_sentences), desc="Processing NER", total=len(english_sentences)):
    # 英文命名實體識別
    en_ner = ner_pipeline_en(en_sentence)
    en_entities = extract_entities_ner(en_ner, city_codes=city_codes)
    english_ner_results.append(en_entities)

    # 中文命名實體識別
    print("zh_sentence = " , zh_sentence)
    zh_ner = ner_pipeline_zh(zh_sentence)
    zh_entities = extract_entities_ner(zh_ner)
    chinese_ner_results.append(zh_entities)
    print("zh_entities = " , zh_entities)
# 將結果放入 DataFrame
print("chinese_ner_results = " , chinese_ner_results)
mapped_df = pd.DataFrame({
    "English Sentence": english_sentences,
    "English Entities": english_ner_results,
    "Chinese Sentence": chinese_sentences,
    "Chinese Entities": chinese_ner_results
})

# 顯示結果
mapped_df.head()

# 保存映射結果
mapped_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\TEST_results.csv", index=False)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing NER:   2%|█▎                         

zh_sentence =  «D±`??¡A§J¨½´µ¡CªºÚÌ«D±`?©¯ ¯à¦³²Ä¤G¦¸¯¸¦b??¥x¤WªºÉó?¡A§Ú¯u¬O«D±`·P¿E¡C
zh_entities =  [('«', '1'), ('D±', '1'), ('`', '1'), ('?', '1'), ('?', '0'), ('¡', '1'), ('A', '1'), ('§', '1'), ('J¨½´µ', '1'), ('¡', '1'), ('CªºÚÌ', '1'), ('«', '1'), ('D±', '0'), ('`', '1'), ('?', '1'), ('©¯', '1'), ('¯à¦³²Ä¤G¦¸¯¸¦b', '0'), ('?', '1'), ('?', '0'), ('¥x¤WªºÉó', '0'), ('?', '0'), ('¡', '1'), ('A', '0'), ('§', '1'), ('Ú¯u¬O', '0'), ('«', '1'), ('D±', '1'), ('`', '1'), ('·', '1'), ('P', '1'), ('¿', '1'), ('E', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ????¯u¬O?§Ú·P¨ìÕa?¤£¤w¡A§Ú?­n??§A?¯d¤Uªº ?¤_§Ú¤W¦¸ºt?ªººë±m??


Processing NER:   6%|████                                                               | 3/49 [00:00<00:07,  6.22it/s]

zh_entities =  [('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('¯u¬O', '1'), ('?', '1'), ('§', '1'), ('Ú', '1'), ('·', '1'), ('P¨ìÕa', '1'), ('?', '1'), ('¤', '1'), ('##£', '1'), ('##¤', '1'), ('##w', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('n', '1'), ('?', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('¯d¤Uªº', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('§', '1'), ('Ú¤W¦¸ºt', '1'), ('?', '1'), ('ªººë±m', '1'), ('?', '1'), ('?', '1')]
zh_sentence =  §Ú¬O«D±`¯u?ªº¡A³¡¤À­ì¦]¬O¦]?----§ÚªºÚÌ«D±`»Ý­n¡I §A?¨­?¦a?§Ú·Q·Q¡I
zh_entities =  [('§', '1'), ('Ú¬O', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('¯u', '0'), ('?', '1'), ('ªº', '1'), ('¡', '1'), ('A³', '1'), ('¡', '1'), ('¤À\xadì¦', '1'), (']', '0'), ('¬O¦', '1'), (']', '1'), ('?', '1'), ('-', '1'), ('-', '1'), ('-', '1'), ('-', '1'), ('§', '1'), ('ÚªºÚÌ', '1'), ('«', '0'), ('D±', '1'), ('`', '0'), ('»', '0'), ('Ý\xadn', '1'), ('¡', '1'), ('I', '0'), ('§', '1'), ('A', '1'), ('?', '0'), ('¨', '1'), ('?'

Processing NER:  10%|██████▊                                                            | 5/49 [00:00<00:06,  6.66it/s]

zh_sentence =  §Ú§¤¤F8¦~ªºªÅ?¤G?¡C
zh_entities =  [('§', '1'), ('Ú', '1'), ('§', '1'), ('¤¤F8¦', '1'), ('~', '1'), ('ªºªÅ', '1'), ('?', '1'), ('¤G', '1'), ('?', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¤£??¦b¤W?Éó«e§Ú?­n?±¼§Úªº¾c¤l
zh_entities =  [('¤', '1'), ('##£', '1'), ('?', '1'), ('?', '1'), ('¦b¤W', '1'), ('?', '1'), ('Éó', '1'), ('«', '1'), ('e', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('n', '1'), ('?', '1'), ('±', '1'), ('##¼', '1'), ('§', '1'), ('Úªº¾c¤l', '1')]


Processing NER:  12%|████████▏                                                          | 6/49 [00:00<00:06,  7.04it/s]

zh_sentence =  §Ú?§A??¤p¬G¨Æ?§Î®e§Ú?¦b¬O¤°¤\?ªº
zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('?', '1'), ('¤p¬G¨Æ', '1'), ('?', '0'), ('§', '1'), ('Î®e', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('¦b¬O¤°¤', '1'), ('\\', '0'), ('?', '1'), ('ªº', '1')]
zh_sentence =  ?¬O?¯u?ªº¬G¨Æ--¨C¤@?³£¬O¦p¦¹¡C


Processing NER:  16%|██████████▉                                                        | 8/49 [00:01<00:06,  6.36it/s]

zh_entities =  [('?', '1'), ('¬O', '1'), ('?', '1'), ('¯u', '1'), ('?', '1'), ('ªº¬G¨Æ', '1'), ('-', '1'), ('-', '1'), ('¨C¤', '1'), ('@', '1'), ('?', '1'), ('³£¬O¦p¦¹', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ´N¦b§Ú©MTipperÖÃ?¥Õ?µy¦Z¤£¤[ -- -- §Ú?????¤°??ªº®a?¨ì §Ú?¦b??50¨½¥~ªº¤@?¤p??--
zh_entities =  [('´N¦b', '1'), ('§', '1'), ('Ú©MTipperÖÃ', '1'), ('?', '1'), ('¥Õ', '1'), ('?', '1'), ('µy¦Z¤£¤', '1'), ('[', '0'), ('-', '1'), ('-', '1'), ('-', '1'), ('-', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('¤', '1'), ('##°', '1'), ('?', '1'), ('?', '1'), ('ªº®a', '1'), ('?', '1'), ('¨ì', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('¦b', '1'), ('?', '1'), ('?', '1'), ('50¨½¥', '1'), ('~', '1'), ('ªº¤', '1'), ('@', '1'), ('?', '1'), ('¤', '1'), ('##p', '1'), ('?', '1'), ('?', '1'), ('-', '1'), ('-', '1')]
zh_sentence =  ¦Û¤v??


Processing NER:  20%|█████████████▍                                                    | 10/49 [00:01<00:05,  6.85it/s]

zh_entities =  [('¦Û¤v', '1'), ('?', '1'), ('?', '1')]
zh_sentence =  §Úª¾¹D??§A???¬O¤p¨Æ¤@?¡A¦ý¬O ---- ¥¿?§Ú¬Ý?¦Z??ªº?­Ô¡A¬ðµM·P¨ì«D±`Õa©_¡C
zh_entities =  [('§', '1'), ('Úª¾¹D', '1'), ('?', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('¬O¤p¨Æ¤', '1'), ('@', '1'), ('?', '1'), ('¡', '1'), ('A¦ý¬O', '1'), ('-', '1'), ('-', '1'), ('-', '1'), ('-', '1'), ('¥', '1'), ('¿', '1'), ('?', '1'), ('§', '1'), ('Ú¬Ý', '1'), ('?', '1'), ('¦Z', '1'), ('?', '1'), ('?', '1'), ('ªº', '1'), ('?', '1'), ('Ô', '1'), ('¡', '1'), ('A¬ðµM', '1'), ('·', '1'), ('P¨ì', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('Õa©', '1'), ('_', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¦Z­±³ºµM?¦³??¡C
zh_entities =  [('¦Z\xad±³ºµM', '1'), ('?', '1'), ('¦³', '1'), ('?', '1'), ('?', '0'), ('¡', '1'), ('C', '0')]
zh_sentence =  §A?§v?¤ÛªÏµh¤\¡H


Processing NER:  27%|█████████████████▌                                                | 13/49 [00:01<00:05,  6.93it/s]

zh_entities =  [('§', '1'), ('A', '1'), ('?', '1'), ('§', '1'), ('v', '1'), ('?', '1'), ('¤ÛªÏµh¤', '1'), ('\\', '1'), ('¡', '1'), ('H', '1')]
zh_sentence =  ??ºÖ¯STaurus¬O¯²?ªº¡A???¦n¬O±ß?ªº?­Ô §Ú?´N?©l§ä¦Y?ªº¦a¤è
zh_entities =  [('?', '1'), ('?', '1'), ('ºÖ¯STaurus¬O¯²', '1'), ('?', '1'), ('ªº', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('?', '0'), ('?', '1'), ('¦n¬O±ß', '1'), ('?', '1'), ('ªº', '1'), ('?', '1'), ('Ô', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('´N', '1'), ('?', '1'), ('©', '1'), ('##l', '1'), ('§', '1'), ('ä¦Y', '1'), ('?', '1'), ('ªº¦a¤è', '1')]
zh_sentence =  §Ú?¦b40?¬w?¤½¸ô¤W¡A­n¦b238¥X¤f¤U¨ì¥Ð?¦è¦{ªº¾¤¤Ú¹à¥«


Processing NER:  31%|████████████████████▏                                             | 15/49 [00:02<00:04,  7.11it/s]

zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('¦b40', '1'), ('?', '1'), ('¬w', '1'), ('?', '1'), ('¤½¸ô¤W', '1'), ('¡', '1'), ('A\xadn¦b238¥X¤f¤U¨ì¥Ð', '1'), ('?', '1'), ('¦è¦', '1'), ('{', '1'), ('ªº¾¤¤Ú¹à¥', '1'), ('«', '1')]
zh_sentence =  §Ú??¥X¤f¤U?¡A?©l?§ä¤@®a--§Ú?§ä¨ì¤FShoneyÀ\?¡C
zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('?', '1'), ('¥X¤f¤U', '1'), ('?', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('©', '1'), ('##l', '1'), ('?', '1'), ('§', '1'), ('ä¤', '1'), ('@', '1'), ('®', '0'), ('##a', '1'), ('-', '1'), ('-', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('§', '1'), ('ä¨ì¤FShoneyÀ', '1'), ('\\', '0'), ('?', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  §A?¦³¨Ç¤H¤]?¤£ª¾¹D¡A?¬O®a§C¦¨¥»ªº®a®x??À\?


Processing NER:  33%|█████████████████████▌                                            | 16/49 [00:02<00:04,  7.41it/s]

zh_entities =  [('§', '1'), ('A', '1'), ('?', '1'), ('¦³¨Ç¤H¤', '1'), (']', '0'), ('?', '1'), ('¤£ª¾¹D', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('¬O®a', '1'), ('§', '1'), ('C¦¨¥', '1'), ('»', '1'), ('ªº®a®x', '1'), ('?', '0'), ('?', '1'), ('À', '1'), ('\\', '0'), ('?', '1')]
zh_sentence =  §Ú?¨«?¥h¦Z§¤¦b¤p«F¤l¨½¡A¤kªA??¤]¨«¤F??¡C §Ú¸òTipper¤Þ°_¤F¤@???¡C ¦o??¦Z¨«¨ì§Ú?®Ç?«F¤l¨½ªº¤@?±¡??¡A µM¦Z¦oªº?­µ?±o«D±`¤p¥H¦Ü¤_§Ú­n?°_¦Õ¦·¤~¯à§v¨ì¦o¦b?¤°¤\


Processing NER:  37%|████████████████████████▏                                         | 18/49 [00:02<00:04,  6.52it/s]

zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('¨', '1'), ('«', '1'), ('?', '1'), ('¥h¦Z', '1'), ('§', '1'), ('¤¦b¤p', '1'), ('«', '1'), ('F¤l¨½', '1'), ('¡', '1'), ('A¤kªA', '1'), ('?', '1'), ('?', '1'), ('¤', '1'), (']', '1'), ('¨', '1'), ('«', '1'), ('¤F', '1'), ('?', '1'), ('?', '0'), ('¡', '1'), ('C', '1'), ('§', '1'), ('Ú¸òTipper¤Þ°', '1'), ('_', '1'), ('¤F¤', '1'), ('@', '1'), ('?', '0'), ('?', '1'), ('?', '0'), ('¡', '1'), ('C', '1'), ('¦o', '1'), ('?', '1'), ('?', '1'), ('¦Z¨', '1'), ('«', '1'), ('¨ì', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('®Ç', '1'), ('?', '0'), ('«', '1'), ('F¤l¨½ªº¤', '1'), ('@', '1'), ('?', '1'), ('±', '1'), ('¡', '0'), ('?', '1'), ('?', '0'), ('¡', '1'), ('A', '1'), ('µM¦Z¦oªº', '1'), ('?', '1'), ('µ', '1'), ('?', '1'), ('±', '1'), ('##o', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('¤p¥H¦Ü¤', '1'), ('_', '1'), ('§', '1'), ('Ú\xadn', '1'), ('?', '1'), ('°', '1'), ('_', '1'), ('¦Õ¦', '1'), ('·', '1'), ('¤', '1'), ('~', '1'), ('¯à', '1'), ('§', '

Processing NER:  41%|██████████████████████████▉                                       | 20/49 [00:03<00:04,  6.29it/s]

zh_entities =  [('¨º', '1'), ('?', '1'), ('¨kªº±µ', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('A', '1'), ('¡', '1'), ('§', '1'), ('¥L¤w', '1'), ('?', '1'), ('«', '0'), ('D±', '1'), ('`', '1'), ('§', '1'), ('V¤O¤F', '1'), ('¡', '1'), ('A¤£¬O', '1'), ('?', '1'), ('¡', '1'), ('H', '1'), ('¡', '0'), ('§', '1')]
zh_sentence =  ¤w?ºâ¬O¦³¨Ç??¤F¡C
zh_entities =  [('¤', '1'), ('##w', '1'), ('?', '1'), ('ºâ¬O¦³¨Ç', '1'), ('?', '1'), ('?', '1'), ('¤F', '1'), ('¡', '1'), ('C', '1')]


Processing NER:  43%|████████████████████████████▎                                     | 21/49 [00:03<00:04,  6.04it/s]

zh_sentence =  ²Ä¤G¤Ñ¡A??¤@?§¹¥þ¯u?ªº¬G¨Æ¡C §Úµn¤WG-5´N?¨ì«D¬wªº¥§?§Q?°µºt?¥h¤F¡A ¦b­º³£©Ô¦U´µ¡A¥D??¬O?¤_¯à·½¡C
zh_entities =  [('²Ä¤G¤Ñ', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('?', '1'), ('¤', '1'), ('@', '1'), ('?', '1'), ('§', '1'), ('¹¥þ¯u', '1'), ('?', '1'), ('ªº¬G¨Æ', '1'), ('¡', '1'), ('C', '1'), ('§', '1'), ('Úµn¤WG', '1'), ('-', '1'), ('5´N', '1'), ('?', '1'), ('¨ì', '1'), ('«', '1'), ('D¬wªº¥', '1'), ('§', '1'), ('?', '1'), ('§', '1'), ('Q', '1'), ('?', '1'), ('°', '1'), ('##µ', '1'), ('##º', '1'), ('##t', '1'), ('?', '1'), ('¥h¤F', '1'), ('¡', '1'), ('A', '1'), ('¦b\xadº³£©Ô¦U´µ', '1'), ('¡', '1'), ('A¥D', '1'), ('?', '0'), ('?', '1'), ('¬O', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('¯à', '1'), ('·', '1'), ('½', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¦Óºt??©l§Ú?§â«e¤@¤Ñ?¥Í¦b?¤°??¦{ ªº¬G¨Æ?¤F¤@¹M¡C


Processing NER:  47%|██████████████████████████████▉                                   | 23/49 [00:03<00:04,  5.62it/s]

zh_entities =  [('¦Óºt', '1'), ('?', '1'), ('?', '1'), ('©', '1'), ('##l', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('§', '1'), ('â', '1'), ('«', '1'), ('e', '1'), ('##¤', '1'), ('@', '1'), ('¤Ñ', '1'), ('?', '1'), ('¥Í¦b', '1'), ('?', '1'), ('¤', '1'), ('##°', '1'), ('?', '1'), ('?', '1'), ('¦', '1'), ('{', '1'), ('ªº¬G¨Æ', '1'), ('?', '1'), ('¤F¤', '1'), ('@', '1'), ('¹M', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ´N¦p¦P??§Ú?§A??ªº¬G¨Æ¤@¼Ò¤@?¡C §Ú©d¤l©M§Ú¦Û¤v??¡AShoney'sÀ\?¡A§C¦¨¥»®a®x??À\? ?©±¨º?¨k¤Hªº?--µM¦Z??¯º¤F
zh_entities =  [('´N¦p¦P', '1'), ('?', '1'), ('?', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('?', '1'), ('ªº¬G¨Æ¤', '1'), ('@', '1'), ('¼Ò¤', '1'), ('@', '1'), ('?', '1'), ('¡', '1'), ('C', '1'), ('§', '1'), ('Ú©d¤l©M', '1'), ('§', '1'), ('Ú¦Û¤v', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('AShoney', '1'), ("'", '1'), ('sÀ', '1'), ('\\', '0'), ('?', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('C¦¨¥', '1'), ('»', '1'), ('®', '0'), ('

Processing NER:  51%|█████████████████████████████████▋                                | 25/49 [00:03<00:03,  6.52it/s]

zh_sentence =  ºt?§¹¦Z´N?¨ì?Éó??¦^¦Ñ®a¤F
zh_entities =  [('º', '1'), ('##t', '1'), ('?', '1'), ('§', '1'), ('¹¦Z´N', '1'), ('?', '1'), ('¨ì', '1'), ('?', '1'), ('Éó', '1'), ('?', '1'), ('?', '1'), ('¦', '1'), ('^', '1'), ('¦Ñ®a¤F', '1')]
zh_sentence =  ±µ?§Ú´N¦b?Éó¤WºÎ?¤F¡Aª½¨ì¤T§ó¥b©]¤~¿ô?¡A ?Éó¦b?³t?¸s?­°¸¨¥[ªo¡C
zh_entities =  [('±', '1'), ('##µ', '1'), ('?', '1'), ('§', '1'), ('Ú´N¦b', '1'), ('?', '1'), ('Éó¤WºÎ', '1'), ('?', '1'), ('¤F', '1'), ('¡', '1'), ('Aª½¨ì¤T', '1'), ('§', '1'), ('ó¥b©', '1'), (']', '1'), ('¤', '1'), ('~', '1'), ('¿', '1'), ('ô', '1'), ('?', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('Éó¦b', '1'), ('?', '1'), ('³', '1'), ('##t', '1'), ('?', '1'), ('¸s', '1'), ('?', '1'), ('°¸¨¥', '1'), ('[', '0'), ('ªo', '1'), ('¡', '1'), ('C', '1')]


Processing NER:  53%|███████████████████████████████████                               | 26/49 [00:04<00:04,  5.68it/s]

zh_sentence =  §Ú¿ô¤F??¡A¨«¥X?Éó¥h©I§l?·s?ªÅÉa¡A µM¦Z§Ú¬Ý¨ì¦³?¤H¶]??Éó¶]¹D
zh_entities =  [('§', '1'), ('Ú', '1'), ('¿', '1'), ('ô¤F', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('A¨', '1'), ('«', '0'), ('¥X', '1'), ('?', '1'), ('Éó¥h©I', '1'), ('§', '1'), ('l', '1'), ('?', '1'), ('·', '1'), ('s', '1'), ('?', '1'), ('ªÅÉa', '1'), ('¡', '0'), ('A', '0'), ('µM¦Z', '0'), ('§', '1'), ('Ú¬Ý¨ì¦³', '0'), ('?', '0'), ('¤H', '1'), ('¶', '0'), (']', '1'), ('?', '1'), ('?', '0'), ('Éó', '1'), ('¶', '1'), (']', '1'), ('¹D', '1')]


Processing NER:  55%|████████████████████████████████████▎                             | 27/49 [00:04<00:03,  5.96it/s]

zh_sentence =  ¥L¤@??»R?¤@??¡A¤@?¤j³Û¡A ¡¨¥´????²±?¡I¥´????²±?¡I¡§
zh_entities =  [('¥L¤', '1'), ('@', '1'), ('?', '1'), ('?', '1'), ('»', '1'), ('R', '1'), ('?', '1'), ('¤', '0'), ('@', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('A¤', '1'), ('@', '1'), ('?', '1'), ('¤j³Û', '1'), ('¡', '1'), ('A', '1'), ('¡', '1'), ('¨¥´', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('²', '1'), ('##±', '1'), ('?', '1'), ('¡', '1'), ('I¥´', '0'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('²', '1'), ('##±', '1'), ('?', '1'), ('¡', '1'), ('I', '1'), ('¡', '1'), ('§', '1')]
zh_sentence =  µM¦Z§Ú¦Û¤v¦b¨º«ä¦Ò¡A¤T§ó¥b©]¦b¤j¦è¬vªº¤¤¤ß ¨º¤\?²±?¯à¦³¤°¤\¥X?©O¡H


Processing NER:  59%|███████████████████████████████████████                           | 29/49 [00:04<00:03,  5.86it/s]

zh_entities =  [('µM¦Z', '1'), ('§', '1'), ('Ú¦Û¤v¦b¨º', '1'), ('«', '1'), ('ä¦Ò', '1'), ('¡', '1'), ('A¤T', '1'), ('§', '1'), ('ó¥b©', '1'), (']', '1'), ('¦b¤j¦è¬vªº¤¤¤ß', '1'), ('¨º¤', '1'), ('\\', '1'), ('?', '1'), ('²', '1'), ('##±', '1'), ('?', '1'), ('¯à¦³¤°¤', '1'), ('\\', '1'), ('¥X', '1'), ('?', '1'), ('©O', '1'), ('¡', '1'), ('H', '1')]
zh_sentence =  ±µ¤U?§Ú?°_?­ì??¬O¦³«Ü¦h¨Æ±¡ªº¡C
zh_entities =  [('±µ¤U', '1'), ('?', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('°', '0'), ('_', '1'), ('?', '1'), ('ì', '1'), ('?', '1'), ('?', '1'), ('¬O¦³', '1'), ('«', '1'), ('Ü¦h¨Æ±', '1'), ('¡', '1'), ('ªº', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¦ý¬O?ªG?©ú­ì?¬O§Úªº?¤uªºÌå«×?Ä¼ ¦]?¥§?§Q?¦³¤@®a³q?ªÀ¤w?§â§Úªººt??¦¨¤@?¬G¨Æ ¦}¥B¤w?¦L¥X?¦b¥þ¬ü?¦æ¤F


Processing NER:  61%|████████████████████████████████████████▍                         | 30/49 [00:04<00:03,  5.64it/s]

zh_entities =  [('¦ý¬O', '1'), ('?', '1'), ('ªG', '1'), ('?', '1'), ('©ú\xadì', '1'), ('?', '1'), ('¬O', '1'), ('§', '1'), ('Úªº', '1'), ('?', '1'), ('¤uªºÌå', '1'), ('«', '0'), ('×', '1'), ('?', '1'), ('Ä¼', '1'), ('¦', '1'), (']', '1'), ('?', '1'), ('¥', '0'), ('§', '1'), ('?', '0'), ('§', '1'), ('Q', '1'), ('?', '1'), ('¦³¤', '0'), ('@', '1'), ('®', '0'), ('##a', '1'), ('##³', '1'), ('##q', '0'), ('?', '1'), ('ªÀ¤w', '1'), ('?', '0'), ('§', '1'), ('â', '0'), ('§', '1'), ('Úªººt', '0'), ('?', '0'), ('?', '0'), ('¦¨¤', '0'), ('@', '1'), ('?', '0'), ('¬G¨Æ', '1'), ('¦', '1'), ('}', '0'), ('¥B¤w', '1'), ('?', '1'), ('¦L¥X', '1'), ('?', '1'), ('¦b¥þ¬ü', '1'), ('?', '1'), ('¦æ¤F', '1')]
zh_sentence =  -- §Ú¬d?¤F¬O¦b»X¯S§Q?¦L¨êªº¡A¦Ó?ªº¬G¨Æ?¬O????ªº¡A ¡¨«e°Æ??¦ã?¡P¤à?«e¤Ñ¦b¥§?§Q??©ú¡A ¡¥§ÚÉO§Ú©d¤lTipper?¤F¤@®a§C¦¨¥»®a®x??À\?¡A¦W?Shoney's, ¦Ó¥B¬O¦³§Ú?¦Û¤v??ªº¡C¡§ ¦b§Ú¦^¨ì¬ü?¥»¤g«e¡A À¹??¯S°Ò©MªN¥ì??¤w??©l¦b·d?¨Ç¥Í·N¤F -- ¥L?¤¤ªº¤@?¤H??§ÚÀ¹¤F?«Ü¤jªº¤j?´U©O¡A


Processing NER:  65%|███████████████████████████████████████████                       | 32/49 [00:05<00:03,  4.63it/s]

zh_entities =  [('-', '1'), ('-', '1'), ('§', '1'), ('Ú¬d', '1'), ('?', '1'), ('¤F¬O¦b', '1'), ('»', '1'), ('X¯S', '1'), ('§', '1'), ('Q', '1'), ('?', '1'), ('¦L¨êªº', '1'), ('¡', '1'), ('A¦Ó', '1'), ('?', '1'), ('ªº¬G¨Æ', '1'), ('?', '1'), ('¬O', '1'), ('?', '1'), ('?', '0'), ('?', '0'), ('?', '0'), ('ªº', '1'), ('¡', '1'), ('A', '1'), ('¡', '1'), ('¨', '1'), ('«', '1'), ('e°Æ', '1'), ('?', '1'), ('?', '1'), ('¦ã', '1'), ('?', '1'), ('¡', '1'), ('P¤à', '1'), ('?', '1'), ('«', '1'), ('e¤Ñ¦b¥', '1'), ('§', '1'), ('?', '1'), ('§', '1'), ('Q', '1'), ('?', '1'), ('?', '1'), ('©ú', '1'), ('¡', '1'), ('A', '1'), ('¡', '1'), ('¥', '1'), ('§', '1'), ('ÚÉO', '1'), ('§', '1'), ('Ú©d¤lTipper', '1'), ('?', '1'), ('¤F¤', '1'), ('@', '1'), ('®', '1'), ('##a', '1'), ('§', '1'), ('C¦¨¥', '0'), ('»', '1'), ('®', '1'), ('##a', '1'), ('##®', '0'), ('##x', '0'), ('?', '1'), ('?', '1'), ('À', '1'), ('\\', '0'), ('?', '1'), ('¡', '1'), ('A¦W', '1'), ('?', '1'), ('Shoney', '1'), ("'", '1'), ('s', '1'), (',',

Processing NER:  67%|████████████████████████████████████████████▍                     | 33/49 [00:05<00:03,  4.90it/s]

zh_sentence =  ¤T¤Ñ¦Z¡A§Ú?§ÚªºªB¤Í­Ý¥ë¦ñ­Ý¦P¨Æ¤ñ?§JªL?­þ¨½¦¬¨ì¤@«Ê ¤â?¡Aºë¬ü­Ý«Ü?ªº¤@«Ê«H¡A?®e¬O¡A¡¨¦ã?¡A¯¬·sÀ\???¡I¡¨
zh_entities =  [('¤T¤Ñ¦Z', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('§', '1'), ('ÚªºªB¤Í\xadÝ¥ë¦ñ\xadÝ¦P¨Æ¤ñ', '1'), ('?', '1'), ('§', '1'), ('JªL', '1'), ('?', '1'), ('þ¨½¦¬¨ì¤', '1'), ('@', '1'), ('«', '1'), ('Ê', '1'), ('¤â', '1'), ('?', '1'), ('¡', '1'), ('Aºë¬ü\xadÝ', '0'), ('«', '1'), ('Ü', '0'), ('?', '1'), ('ªº¤', '1'), ('@', '1'), ('«', '1'), ('Ê', '1'), ('«', '1'), ('H', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('®e¬O', '0'), ('¡', '0'), ('A', '0'), ('¡', '0'), ('¨¦ã', '0'), ('?', '0'), ('¡', '1'), ('A¯¬', '0'), ('·', '0'), ('sÀ', '1'), ('\\', '0'), ('?', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('I', '1'), ('¡', '1'), ('¨', '1')]
zh_sentence =  §Ú?³ß?¦b?¤è¦¨¥\ªº?­Ô¤¬¬Û¯¬ºÖ


Processing NER:  71%|███████████████████████████████████████████████▏                  | 35/49 [00:05<00:02,  6.38it/s]

zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('³', '1'), ('##ß', '1'), ('?', '1'), ('¦b', '1'), ('?', '1'), ('¤è¦¨¥', '1'), ('\\', '1'), ('ªº', '1'), ('?', '1'), ('Ô¤¬¬Û¯¬ºÖ', '1')]
zh_sentence =  §Ú­n?¤@?«H®§¥Í??¡C
zh_entities =  [('§', '1'), ('Ú\xadn', '1'), ('?', '1'), ('¤', '1'), ('@', '1'), ('?', '1'), ('«', '1'), ('H®', '1'), ('§', '1'), ('¥Í', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('C', '1')]


Processing NER:  73%|████████████████████████████████████████████████▍                 | 36/49 [00:05<00:02,  5.60it/s]

zh_sentence =  ¦ý¬O§Ú¤S¦b·Q¤]?§Ú¥i¯à?¦E§âTEDºt??§@§Ú?¥Íªº??¡A ¨º?ªº?¤]?§Ú¥i¥H¦b¨ä¥Lªº?­Ô?-«ü«H®§¥Í??¡]¹ª´x¡^ §J¨½´µ ¦w¼w´Ë¡G ¤@¨¥?©w¡I
zh_entities =  [('¦ý¬O', '1'), ('§', '1'), ('Ú¤S¦b', '1'), ('·', '1'), ('Q¤', '1'), (']', '1'), ('?', '1'), ('§', '1'), ('Ú¥i¯à', '1'), ('?', '1'), ('¦E', '1'), ('§', '1'), ('âTEDºt', '1'), ('?', '1'), ('?', '1'), ('§', '1'), ('@', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('¥Íªº', '1'), ('?', '1'), ('?', '0'), ('¡', '1'), ('A', '1'), ('¨º', '1'), ('?', '1'), ('ªº', '1'), ('?', '1'), ('¤', '1'), (']', '0'), ('?', '1'), ('§', '1'), ('Ú¥i¥H¦b¨ä¥Lªº', '1'), ('?', '1'), ('Ô', '0'), ('?', '0'), ('-', '1'), ('«', '1'), ('ü', '1'), ('«', '1'), ('H®', '1'), ('§', '1'), ('¥Í', '1'), ('?', '1'), ('?', '1'), ('¡', '0'), (']', '1'), ('¹ª´x', '1'), ('¡', '1'), ('^', '0'), ('§', '1'), ('J¨½´µ', '1'), ('¦w¼w´Ë', '1'), ('¡', '1'), ('G', '1'), ('¤', '1'), ('@', '1'), ('¨¥', '1'), ('?', '1'), ('©', '1'), ('##w', '1'), ('¡', '1'), ('I', '1')]


Processing NER:  76%|█████████████████████████████████████████████████▊                | 37/49 [00:06<00:02,  5.31it/s]

zh_sentence =  §Ú?¦b¥D­n·Q?¤@?§A?³Ì?¤ßªº?? ?¤_Éa­Ô¦MÉó§A¥i¥H°µ¤°¤\©O¡H§Ú·Q????©l?-- §Ú­n?§A?®i¥Ü¤@¨Ç·sªº·Ó¤ù¡A?µM?¦³¥|¤­?¤]»Ý­n­n­«·s®i¥Ü
zh_entities =  [('§', '1'), ('Ú', '1'), ('?', '1'), ('¦b¥D\xadn', '1'), ('·', '1'), ('Q', '1'), ('?', '1'), ('¤', '1'), ('@', '1'), ('?', '0'), ('§', '1'), ('A', '1'), ('?', '1'), ('³Ì', '1'), ('?', '1'), ('¤ßªº', '1'), ('?', '1'), ('?', '0'), ('?', '1'), ('¤', '1'), ('_', '1'), ('Éa\xadÔ¦MÉó', '1'), ('§', '1'), ('A¥i¥H°µ¤°¤', '1'), ('\\', '0'), ('©O', '1'), ('¡', '1'), ('H', '1'), ('§', '1'), ('Ú', '1'), ('·', '1'), ('Q', '1'), ('?', '1'), ('?', '0'), ('?', '0'), ('?', '0'), ('©', '1'), ('##l', '1'), ('?', '1'), ('-', '1'), ('-', '1'), ('§', '1'), ('Ú\xadn', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('®i¥Ü¤', '1'), ('@', '1'), ('¨Ç', '0'), ('·', '1'), ('sªº', '0'), ('·', '1'), ('Ó¤ù', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('µM', '1'), ('?', '1'), ('¦³¥', '1'), ('|', '1'), ('¤', '1'), ('?', '1'), ('¤', '1'), (']', '1'), ('»', '1'), ('

Processing NER:  78%|███████████████████████████████████████████████████▏              | 38/49 [00:06<00:02,  5.11it/s]

zh_sentence =  ?¦b¬Ý¤Û?¤ù¡A§Ú¨C¦¸³£?§ó·s?¨Ç¤Û?¤ùªº¡C
zh_entities =  [('?', '1'), ('¦b¬Ý¤Û', '1'), ('?', '1'), ('¤ù', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Ú¨C¦¸³£', '1'), ('?', '1'), ('§', '1'), ('ó', '1'), ('·', '1'), ('s', '1'), ('?', '1'), ('¨Ç¤Û', '1'), ('?', '1'), ('¤ùªº', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  §Ú¨C¦¸§ó·sªº?­Ô³£?¥[·sªº?¤ù?¥h¡A¨ä¤¤¤]?¨ì¤F?¦h·sªº¡C


Processing NER:  82%|█████████████████████████████████████████████████████▉            | 40/49 [00:06<00:01,  6.15it/s]

zh_entities =  [('§', '1'), ('Ú¨C¦¸', '1'), ('§', '1'), ('ó', '1'), ('·', '1'), ('sªº', '1'), ('?', '1'), ('Ô³£', '1'), ('?', '1'), ('¥', '1'), ('[', '0'), ('·', '1'), ('sªº', '1'), ('?', '1'), ('¤ù', '1'), ('?', '1'), ('¥', '1'), ('##h', '0'), ('¡', '1'), ('A¨ä¤¤¤', '1'), (']', '1'), ('?', '1'), ('¨ì¤F', '1'), ('?', '1'), ('¦h', '1'), ('·', '1'), ('sªº', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ´N¹³¨º¨Ç¦b¼é¦Á?¹jªº®ü?¤¤§ä?ª«¤@¯ë¡A
zh_entities =  [('´N¹³¨º¨Ç¦b¼é¦Á', '1'), ('?', '1'), ('¹jªº®ü', '1'), ('?', '1'), ('¤', '1'), ('##¤', '1'), ('§', '1'), ('ä', '1'), ('?', '1'), ('ª', '1'), ('«', '1'), ('¤', '1'), ('@', '1'), ('¯ë', '1'), ('¡', '1'), ('A', '1')]
zh_sentence =  §A?§ä¨ì§ó¦hªº??¡C


Processing NER:  86%|████████████████████████████████████████████████████████▌         | 42/49 [00:06<00:00,  7.07it/s]

zh_entities =  [('§', '1'), ('A', '1'), ('?', '1'), ('§', '1'), ('ä¨ì', '1'), ('§', '1'), ('ó¦hªº', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ??¦b¬Q¤Ñ¡A§Ú?´N±o¨ì¤F·sªº¤@¤ë¥÷ªº?«×??¡C
zh_entities =  [('?', '1'), ('?', '1'), ('¦b¬Q¤Ñ', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('´N±o¨ì¤F', '1'), ('·', '1'), ('sªº¤', '1'), ('@', '1'), ('¤ë¥÷ªº', '1'), ('?', '1'), ('«', '1'), ('×', '1'), ('?', '1'), ('?', '1'), ('¡', '1'), ('C', '1')]


Processing NER:  88%|█████████████████████████████████████████████████████████▉        | 43/49 [00:07<00:00,  6.72it/s]

zh_sentence =  ???¬O¬ü?ªº¡A¤@¤ë¥÷ªº?¥v¥­§½??¬O31«×¡C ¤W?¤@¤ë¥÷¬O39.5«×¡C
zh_entities =  [('?', '1'), ('?', '1'), ('?', '1'), ('¬O¬ü', '1'), ('?', '1'), ('ªº', '1'), ('¡', '1'), ('A¤', '1'), ('@', '1'), ('¤ë¥÷ªº', '1'), ('?', '1'), ('¥', '1'), ('##v', '1'), ('##¥', '1'), ('§', '1'), ('½', '1'), ('?', '1'), ('?', '1'), ('¬O31', '1'), ('«', '1'), ('×', '1'), ('¡', '1'), ('C', '1'), ('¤W', '1'), ('?', '1'), ('¤', '0'), ('@', '1'), ('¤ë¥÷¬O39', '1'), ('.', '1'), ('5', '1'), ('«', '1'), ('×', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ?¦b¡A§Úª¾¹D§A·Q­n§ó¦hªº?¤_?¹Òªº§¥·s? --¥u¬O?ª±¯º-- ¦ý?¨Ç¥u¬O­«­z¤Û?¤ù¡A µM¦Z§Ú?????¤_§A¯à°µ¤°¤\ªº·s?®e¡C


Processing NER:  92%|████████████████████████████████████████████████████████████▌     | 45/49 [00:07<00:00,  5.94it/s]

zh_entities =  [('?', '1'), ('¦b', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Úª¾¹D', '1'), ('§', '1'), ('A', '1'), ('·', '1'), ('Q\xadn', '1'), ('§', '1'), ('ó¦hªº', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('?', '1'), ('¹Òªº', '1'), ('§', '1'), ('¥', '0'), ('·', '1'), ('s', '1'), ('?', '1'), ('-', '1'), ('-', '1'), ('¥u¬O', '1'), ('?', '1'), ('ª±¯º', '1'), ('-', '1'), ('-', '1'), ('¦ý', '1'), ('?', '1'), ('¨Ç¥u¬O', '1'), ('«', '1'), ('z¤Û', '1'), ('?', '1'), ('¤ù', '1'), ('¡', '0'), ('A', '1'), ('µM¦Z', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('§', '1'), ('A¯à°µ¤°¤', '1'), ('\\', '1'), ('ªº', '1'), ('·', '1'), ('s', '1'), ('?', '1'), ('®', '1'), ('##e', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¦ý§Ú·Q¥ý?©ú¤U?¨Ç?¦è¡C
zh_entities =  [('¦ý', '1'), ('§', '1'), ('Ú', '1'), ('·', '1'), ('Q¥ý', '1'), ('?', '1'), ('©ú¤U', '1'), ('?', '1'), ('¨Ç', '1'), ('?', '0'), ('¦è', '1'), ('¡', '1'), ('C', '1')]


Processing NER:  94%|█████████████████████████████████████████████████████████████▉    | 46/49 [00:07<00:00,  5.02it/s]

zh_sentence =  ­º¥ý¡A?¬O§Ú?¦ô?¨ìªº?¤_¬ü??¥þ²y?«Ç®Ä?ªº??¡A ¦b´¶³q±¡?¤U?¤_?¤O©M¯à·½ªº³Ì?¨Ï¥Îªº®Ä²v ´N¹³ç÷?´N¯à??ªºªG?¡A«D±`§Cªº¡C ®Ä²v©M?¦ÛµM?¹Òªº«O?¡G
zh_entities =  [('º¥ý', '1'), ('¡', '1'), ('A', '1'), ('?', '1'), ('¬O', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('¦ô', '1'), ('?', '1'), ('¨ìªº', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('¬ü', '1'), ('?', '1'), ('?', '1'), ('¥þ²y', '1'), ('?', '1'), ('«', '1'), ('Ç®Ä', '1'), ('?', '1'), ('ªº', '0'), ('?', '1'), ('?', '0'), ('¡', '1'), ('A', '1'), ('¦b´', '1'), ('¶', '1'), ('³', '1'), ('##q', '1'), ('##±', '1'), ('¡', '1'), ('?', '1'), ('¤U', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('?', '1'), ('¤O©M¯à', '0'), ('·', '1'), ('½ªº³Ì', '1'), ('?', '1'), ('¨Ï¥Îªº®Ä²v', '1'), ('´N¹³ç÷', '1'), ('?', '1'), ('´N¯à', '0'), ('?', '1'), ('?', '0'), ('ªºªG', '0'), ('?', '0'), ('¡', '1'), ('A', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('§', '1'), ('Cªº', '1'), ('¡', '1'), ('C', '1'), ('®Ä²v©M', '1'), ('?', '1'), ('¦ÛµM', '1'), ('?', '1'), ('¹Òª

Processing NER:  96%|███████████████████████████████████████████████████████████████▎  | 47/49 [00:07<00:00,  5.05it/s]

zh_sentence =  ¤£¬O¦¨¥»¡A¦Ó¬O§Q?¡C «H?¬O??ªº¡C
zh_entities =  [('¤£¬O¦¨¥', '1'), ('»', '1'), ('¡', '1'), ('A¦Ó¬O', '1'), ('§', '1'), ('Q', '1'), ('?', '1'), ('¡', '1'), ('C', '1'), ('«', '0'), ('H', '1'), ('?', '1'), ('¬O', '1'), ('?', '1'), ('?', '0'), ('ªº', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¤£¬O?­±®Ä?¡A¦Ó¬O¥¿­±ªº¡C?¨Ç³£¬O?¦Û¤v¤ä¥Iªº§ë?¡C


Processing NER: 100%|██████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  5.29it/s]

zh_entities =  [('¤£¬O', '1'), ('?', '1'), ('±®Ä', '1'), ('?', '1'), ('¡', '1'), ('A¦Ó¬O¥', '1'), ('¿', '1'), ('±ªº', '1'), ('¡', '1'), ('C', '1'), ('?', '1'), ('¨Ç³£¬O', '1'), ('?', '1'), ('¦Û¤v¤ä¥Iªº', '1'), ('§', '1'), ('ë', '1'), ('?', '1'), ('¡', '1'), ('C', '1')]
zh_sentence =  ¦ý¬O¥¦?¦b?²¾§Ú?«ä·Qªº?­Ô¤]¬O«D±`¦³®Äªº¡C
zh_entities =  [('¦ý¬O¥¦', '1'), ('?', '1'), ('¦b', '1'), ('?', '1'), ('²¾', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('«', '1'), ('ä', '1'), ('·', '1'), ('Qªº', '1'), ('?', '1'), ('Ô¤', '1'), (']', '1'), ('¬O', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('¦³®Äªº', '1'), ('¡', '1'), ('C', '1')]


Processing NER: 100%|██████████████████████████████████████████████████████████████████| 49/49 [00:08<00:00,  5.93it/s]

chinese_ner_results =  [[('«', '1'), ('D±', '1'), ('`', '1'), ('?', '1'), ('?', '0'), ('¡', '1'), ('A', '1'), ('§', '1'), ('J¨½´µ', '1'), ('¡', '1'), ('CªºÚÌ', '1'), ('«', '1'), ('D±', '0'), ('`', '1'), ('?', '1'), ('©¯', '1'), ('¯à¦³²Ä¤G¦¸¯¸¦b', '0'), ('?', '1'), ('?', '0'), ('¥x¤WªºÉó', '0'), ('?', '0'), ('¡', '1'), ('A', '0'), ('§', '1'), ('Ú¯u¬O', '0'), ('«', '1'), ('D±', '1'), ('`', '1'), ('·', '1'), ('P', '1'), ('¿', '1'), ('E', '1'), ('¡', '1'), ('C', '1')], [('?', '1'), ('?', '1'), ('?', '1'), ('?', '1'), ('¯u¬O', '1'), ('?', '1'), ('§', '1'), ('Ú', '1'), ('·', '1'), ('P¨ìÕa', '1'), ('?', '1'), ('¤', '1'), ('##£', '1'), ('##¤', '1'), ('##w', '1'), ('¡', '1'), ('A', '1'), ('§', '1'), ('Ú', '1'), ('?', '1'), ('n', '1'), ('?', '1'), ('?', '1'), ('§', '1'), ('A', '1'), ('?', '1'), ('¯d¤Uªº', '1'), ('?', '1'), ('¤', '1'), ('_', '1'), ('§', '1'), ('Ú¤W¦¸ºt', '1'), ('?', '1'), ('ªººë±m', '1'), ('?', '1'), ('?', '1')], [('§', '1'), ('Ú¬O', '1'), ('«', '1'), ('D±', '1'), ('`', '1'), ('¯




In [19]:
import chardet
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 偵測檔案編碼
with open(r"./TEST_Train.csv", 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"Detected encoding: {encoding}")

# 設置 Stanford NER jar 和模型的完整路徑
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

# 初始化 Stanford NER Tagger
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# 讀取 CSV 文件中的 'translation.en' 和 'translation.zh' 列，使用偵測到的編碼
train_df = pd.read_csv(r"./TEST_Train.csv", encoding='Big5-HKSCS', errors='replace')
english_sentences = train_df['translation.en'].dropna().tolist()  # 去除空值並轉換為列表
chinese_sentences = train_df['translation.zh'].dropna().tolist()  # 去除空值並轉換為列表

print("chinese_sentences =", chinese_sentences)


Detected encoding: Big5


TypeError: read_csv() got an unexpected keyword argument 'errors'

In [22]:
import chardet
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 偵測檔案編碼
with open(r"./TEST_Train.csv", 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']
print(f"Detected encoding: {encoding}")

# 使用檢測到的編碼讀取檔案內容並解決亂碼問題
with open(r"./TEST_Train.csv", 'r', encoding=encoding, errors='replace') as f:
    data = f.read()

# 將內容轉換為 DataFrame
from io import StringIO
train_df = pd.read_csv(StringIO(data))

# 確認中文列的內容
chinese_sentences = train_df['translation.zh'].dropna().tolist()
print("chinese_sentences =", chinese_sentences)


Detected encoding: Big5
chinese_sentences = ['非常??，克里斯。的确非常?幸 能有第二次站在??台上的机?，我真是非常感激。', '????真是?我感到惊?不已，我?要??你?留下的 ?于我上次演?的精彩??', '我是非常真?的，部分原因是因?----我的确非常需要！ 你?身?地?我想想！', '我坐了8年的空?二?。', '不??在上?机前我?要?掉我的鞋子', '我?你??小故事?形容我?在是什么?的', '?是?真?的故事--每一?都是如此。', '就在我和Tipper离?白?稍后不久 -- -- 我?????什??的家?到 我?在??50里外的一?小??--', '自己??', '我知道??你???是小事一?，但是 ---- 正?我看?后??的?候，突然感到非常惊奇。', '后面竟然?有??。', '你?听?幻肢痛么？', '??福特Taurus是租?的，???好是晚?的?候 我?就?始找吃?的地方', '我?在40?洲?公路上，要在238出口下到田?西州的黎巴嫩市', '我??出口下?，?始?找一家--我?找到了Shoney餐?。', '你?有些人也?不知道，?是家低成本的家庭??餐?', '我?走?去后坐在小亭子里，女服??也走了??。 我跟Tipper引起了一???。 她??后走到我?旁?亭子里的一?情??， 然后她的?音?得非常小以至于我要?起耳朵才能听到她在?什么', '接?她?”是的，他?就是前副??艾?·戈?和他的妻子Tipper。“', '那?男的接??，“他已?非常努力了，不是?？“', '已?算是有些??了。', '第二天，??一?完全真?的故事。 我登上G-5就?到非洲的尼?利?做演?去了， 在首都拉各斯，主??是?于能源。', '而演??始我?把前一天?生在?什??州 的故事?了一遍。', "就如同??我?你??的故事一模一?。 我妻子和我自己??，Shoney's餐?，低成本家庭??餐? ?店那?男人的?--然后??笑了", '演?完后就?到?机??回老家了', '接?我就在?机上睡?了，直到三更半夜才醒?， ?机在?速?群?降落加油。', '我醒了??，走出?机去呼吸?新?空气， 然后我看到有?人跑??机跑道', '他一??舞?一??，一?大喊， ”打????盛?！打????盛?！“

In [16]:
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 設置 Stanford NER jar 和模型的完整路徑
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

# 初始化 Stanford NER Tagger
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# 讀取 CSV 文件中的 'translation.en' 和 'translation.zh' 列，使用完整路徑
train_df = pd.read_csv(r"./TEST_Train.csv", encoding='utf-8-sig', encoding_errors='ignore')

english_sentences = train_df['translation.en'].dropna().tolist()  # 去除空值並轉換為列表
chinese_sentences = train_df['translation.zh'].dropna().tolist()  # 去除空值並轉換為列表

print("chinese_sentences = " , chinese_sentences)
# # 使用 tqdm 查看進度並進行英文命名實體識別
# english_ner_results = []
# for sentence in tqdm(english_sentences, desc="Processing English NER"):
#     words = word_tokenize(sentence)
#     tagged_words = english_ner_tagger.tag(words)
#     english_ner_results.append(tagged_words)

# # 使用 tqdm 查看進度並進行中文命名實體識別
# chinese_ner_results = []
# for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER"):
#     words = list(sentence)  # 中文不需要分詞，直接逐字標記
#     tagged_words = chinese_ner_tagger.tag(words)
#     chinese_ner_results.append(tagged_words)

# # 將結果轉換為 DataFrame 便於檢視
# ner_df = pd.DataFrame({
#     "English Sentence": english_sentences,
#     "English NER Tagged": english_ner_results,
#     "Chinese Sentence": chinese_sentences,
#     "Chinese NER Tagged": chinese_ner_results
# })

# # 顯示結果
# ner_df.head()

# # 保存 NER 標記結果到 CSV 文件（可選）
# ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\ner_tagged_train.csv", index=False)


chinese_sentences =  ['D`??AJC̫D`? \u09b3ĤGb??xW?AگuOD`PEC', '????uO?ڷPa?wA?n??A?dU ?_ڤWt?m??', 'ڬOD`u?A]O]?----ڪ̫D`ݭnI A??a?ڷQQI', 'ڧF8~?G?C', '??bW?e?n?ڪcl', '?A??pG?ήe?bO\\?', '?O?u?G--C@?OpC', 'NbکMTipper??yZ[ -- -- ???????a? ?b??50~@?p??--', 'ۤv??', 'ڪD??A???OpƤ@?AO ---- ?ڬ?Z???ԡAMPD`a_C', 'ZM???C', 'A?v?۪ϵh\\H', '??֯STaurusO?A???nO?? ?N?lY?a', '?b40?w?WAnb238XfU?{ڹ५', '??XfU?A?l?@a--?FShoney\\?C', 'A?ǤH]?DA?OaCax??\\?', '??hZbpFlAkA??]F??C ڸTipperް_F@???C o??Z??Fl@???A MZo??oD`pHܤ_ڭn?_զ~vob?\\', '?o?OAL?NOe???P?MLdlTipperC', '?k??ALw?D`VOFAO?H', 'w?O??FC', 'ĤGѡA??@?u?GơC ڵnWG-5N?Dw?Q?t?hFA bԦUAD??O?_\u0dfdC', 'Ӻt??l?e@?ͦb???{ G?F@MC', "NpP???A??GƤ@Ҥ@?C کdlMڦۤv??AShoney's\\?ACax??\\? ??kH?--MZ??F", 't?ZN????^ѮaF', '?ڴNb?W?FATb]~?A ?b?t?s?[oC', 'ڿF??AX?hIl?s?aA MZڬݨ즳?H]??]D', 'L@??R?@??A@?jۡA ?????I?????I', 'MZڦۤvbҡATb]bjv \\??\u09b3\\X?OH', 'U??_???OܦhƱC', 'O?G??Oڪ?u?ļ ]??Q?@aq?w?ڪt??@?G }Bw?LX?b?F', "-- ڬd?FObXSQ?LꪺA?G?O????A e???P?eѦb?Q??A OکdlTipper?F@aCax??\\?AW?Shoney's, ӥBO?

In [31]:
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# 設置 Stanford NER jar 和模型的完整路徑
jar = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\stanford-ner-4.2.0.jar"
english_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\english.all.3class.distsim.crf.ser.gz"
chinese_model = r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\stanford-ner-2020-11-17\classifiers\chinese.misc.distsim.crf.ser.gz"

# 初始化 Stanford NER Tagger
english_ner_tagger = StanfordNERTagger(english_model, jar, encoding='utf-8')
chinese_ner_tagger = StanfordNERTagger(chinese_model, jar, encoding='utf-8')

# 試試不同編碼
# encodings = ['utf-8-sig']
# encodings = [ 'Big5']
# encodings = ['GB2312']
encodings = ['GB18030']
for encoding in encodings:
    try:
        train_df = pd.read_csv(r"./TEST_Train.csv", encoding=encoding, encoding_errors='ignore')
        print(f"Successfully loaded with encoding: {encoding}")
        break
    except UnicodeDecodeError as e:
        print(f"Failed with encoding {encoding}: {e}")
print("chinese_sentences =", chinese_sentences)        
# 讀取英文和中文列
english_sentences = train_df['translation.en'].dropna().tolist()  # 去除空值並轉換為列表
chinese_sentences = train_df['translation.zh'].dropna().tolist()  # 去除空值並轉換為列表



# # 使用 tqdm 查看進度並進行英文命名實體識別
# english_ner_results = []
# for sentence in tqdm(english_sentences, desc="Processing English NER"):
#     words = word_tokenize(sentence)
#     tagged_words = english_ner_tagger.tag(words)
#     english_ner_results.append(tagged_words)

# # 使用 tqdm 查看進度並進行中文命名實體識別
# chinese_ner_results = []
# for sentence in tqdm(chinese_sentences, desc="Processing Chinese NER"):
#     words = list(sentence)  # 中文不需要分詞，直接逐字標記
#     tagged_words = chinese_ner_tagger.tag(words)
#     chinese_ner_results.append(tagged_words)

# # 將結果轉換為 DataFrame 便於檢視
# ner_df = pd.DataFrame({
#     "English Sentence": english_sentences,
#     "English NER Tagged": english_ner_results,
#     "Chinese Sentence": chinese_sentences,
#     "Chinese NER Tagged": chinese_ner_results
# })

# # 顯示結果
# ner_df.head()

# # 保存 NER 標記結果到 CSV 文件（可選）
# ner_df.to_csv(r"C:\Users\USER\Downloads\NLP-Courses\NLP243\Projects\ner_tagged_train.csv", index=False)


Successfully loaded with encoding: GB18030
chinese_sentences = ['D`??AJ酱怠C黑太D`?┋ 唳巢膜GΩ甫b??xW荷?AиuOD`PEC', '????uO?иP煺a?ぃwAи?n??A?dU ?_иWΩt?汉氡m??', 'иOD`u?骸A场だ歃]O]?----и黑太D`惠nI A?ō?a?иQQI', 'иГF8~邯?G?C', 'ぃ??bW?审eи?n?奔и壕cl', 'и?A??pGㄆ?萎eи?bOぐ\\?', '?O?u?含Gㄆ--C@?常Op埂C', 'NbиMTipper置?フ?yZぃ[ -- -- и?????ぐ??寒a? и?b??50渐~氦@?p??--', '郅v??', 'и竟D??A???Opㄆ@?AO ---- タ?и?Z???浴A鸬MP飓D`a_C', 'Z背旱M?Τ??C', 'A?v?ほ系h\\H', '??褐STaurusO?骸A???nO边?? и?N?lтY?害aよ', 'и?b40?w?そ隔WAnb238XfU欹?瑕{壕い诠啷', 'и??XfU?A?l?т@a--и?т欷FShoney\\?C', 'A?ΤㄇH]?ぃ竟DA?OaCΘセ寒ax??\\?', 'и?ǐ?hZГbpFl健AkA??]ǐF??C и蛤Tipperま_F@???C o??Zǐ歆??Fl姜氦@?薄??A MZo??oD`pH埭_иn?_φΨ~唰v歃ob?ぐ\\', '钡?o?〃O骸AL?NOe捌???Pむ?ML憨dlTipperC¨', 'ê?k罕??A¨Lw?D`VOFAぃO?H¨', 'w?衡OΤㄇ??FC', '材GぱA??@?Чu?含GㄆC иnWG-5N?飓Dw亥?Q?暗t?hFA b撼）驭U吹AD??O?_喾健C', 'τt??lи?рe@ぱ?ネb?ぐ??{ 含Gㄆ?F@MC', "NpP??и?A??含Gㄆ@家@?C иdlMи郅v??AShoney's\\?ACΘセax??\\? ?┍ê?kH?--MZ??氦F", 't?ЧZN??审??^ρaF', '钡?иNb?审W何?FA建欷T螗b]~眶?A ?审b?t?s?案ē[oC', 'и眶F??AǐX?审hIl?s?派aA MZи莰歃?H]??审]D', 'L@??R?@??A@?j驰A 〃ゴ????脖?Iゴ????脖?