In [1]:
import ast
import os

from ckiptagger import (
    data_utils, construct_dictionary, 
    WS, POS, NER
)

import pandas as pd
import spacy
import stanza

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load data

In [2]:
# Labelled training table. Its "name" column stores each row's person names
# as the string form of a Python list (parsed further down with
# ast.literal_eval).
data_path = "data/tbrain_train_final_0610.csv"
df = pd.read_csv(data_path)

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,news_ID,hyperlink,content,name
0,1,https://news.cnyes.com/news/id/4352432,0理財基金量化交易追求絕對報酬有效對抗牛熊市鉅亨網記者鄭心芸2019/07/05 22:35...,[]
1,2,https://udn.com/news/story/120775/4112519,10月13日晚間發生Uber Eats黃姓外送人員職災死亡案件 ### 省略內文 ### 北...,[]
2,3,https://www.mirrormedia.mg/story/20191008soc011/,2019.10.08 01:53【法拍有詭4】飯店遭管委會斷水斷電員工怒吼：生計何去何從？文...,[]
3,4,https://www.chinatimes.com/realtimenews/201909...,58歲林姓男子昨凌晨與朋友聚餐結束後 ### 省略內文 ### 而地點就距離林家僅30公尺。...,[]
4,5,http://domestic.judicial.gov.tw/abbs/wkw/WHD9H...,例稿名稱：臺灣屏東地方法院公示催告公告發文日期：中華民國108年9月20日發文字號：屏院進家...,[]


In [3]:
# Directory that holds one plain-text news article per file.
DIR_PATH = os.path.abspath("./data/news")

# Order file names by length first and alphabetically within the same length.
# This is what the former "sorted() then stable sort by len" produced, in a
# single pass. Presumably the files are named by article number, so that
# shorter names ("2...") come before longer ones ("10...") — confirm.
FILES = sorted(os.listdir(DIR_PATH), key=lambda name: (len(name), name))
FILES = [os.path.join(DIR_PATH, name) for name in FILES]

# corpus[k] is the text of the k-th file, with its lines joined by one space.
corpus = []

for p in FILES:
    # The articles are Chinese text: decode explicitly as UTF-8 instead of
    # relying on the platform's default encoding, which is not UTF-8
    # everywhere and would otherwise raise or silently produce mojibake.
    with open(p, "r", encoding="utf-8") as f:
        text = " ".join(line.strip("\n") for line in f)
    corpus.append(text)

# The "name" column stores lists as their string representation
# (e.g. "[]"); parse each cell back into a real Python list.
name_list = [ast.literal_eval(name) for name in df["name"].tolist()]

In [11]:
# List only the rows that carry at least one labelled name, together with
# their position in `name_list`.
for idx, names in enumerate(name_list):
    if len(names) == 0:
        continue
    print(f"{idx}: {names}")

14: ['周麗真', '張志偉', '陳逢璿']
17: ['王派宏']
37: ['王桂霜', '李威儀', '藍秀琪']
40: ['陳鏡如', '陳星佑']
65: ['朱小蓉']
72: ['廖泰宇']
95: ['鄭博文', '徐金龍', '李大彰']
107: ['彭振源', '王澤生', '楚瑞芳']
120: ['林良琪', '戴吟曲']
139: ['吳宗憲', '張恒嘉', '邱彰信', '于堯', '黃川禎', '劉尊彰', '李宗原', '羅雅美', '沈珉', '白梓佑', '田佳宜', '陳穎彥', '徐世立']
149: ['徐少東', '劉明冠', '張永昌']
150: ['張淑晶']
165: ['許正雄']
174: ['王羿雄', '黃彥儒', '黃哲修', '陳震歐']
176: ['黃睿靚', '陳敏薰', '陳水扁', '陳致中']
192: ['許志堅']
197: ['黃振榮', '陳武騰']
208: ['陳淳伍']
219: ['劉進福']
224: ['連千毅', '鄭又仁']
245: ['李士綸', '吳哲瑋']
268: ['林昱伯', '周漢祥', '林煒智', '林睿耆']
271: ['鄭淑珠']
279: ['楊正平', '蔡思庭']
280: ['林文章']
290: ['王春容', '蔡登裕', '郭再旺', '郭義禮', '吳勝夫', '王協模', '廖聰昇']
314: ['李深淵', '黃文焱', '賴建誠', '陳西元']
325: ['葉添洽']
326: ['許祈文']
344: ['阮氏秋雲', '劉明楊', '朱明俊', '蘇嘉美']
351: ['高盟傑']
352: ['王延順']
359: ['陳建湘', '徐文龍', '吳京哲']
391: ['張慶龍', '楊政錦']
397: ['林嘉凌', '林茂樹', '陳國帥']
415: ['顏志峰']
418: ['陳世坤', '陳素娟', '祁興國']
422: ['陳慶男']
443: ['張簡復中']
471: ['鄭銘坤', '李岳怡']
502: ['邱俊銘', '邱水成', '邱秀芬']
519: ['蘇怡寧']
543: ['胡珍綾', '陳玉瑛', '葉永和', '葉仲琦']
546: ['徐洪貴']
5

## Prepare Models

In [4]:
# Model locations / identifiers, named once so they are easy to change.
CKIP_MODEL_DIR = "./ckip"
SPACY_MODEL_NAME = "zh_core_web_lg"

# CKIP: word segmenter, NER tagger and POS tagger, all loaded from the
# same model directory.
ws = WS(CKIP_MODEL_DIR)
ner = NER(CKIP_MODEL_DIR)
pos = POS(CKIP_MODEL_DIR)

# spaCy pipeline, loaded by package name.
nlp = spacy.load(SPACY_MODEL_NAME)

# Stanza pipeline restricted to tokenization and NER.
# NOTE(review): the load log reports that "zh" is an alias for "zh-hans"
# (Simplified Chinese), while the corpus appears to be Traditional Chinese —
# confirm this model choice is intended.
snlp = stanza.Pipeline(lang="zh", processors="tokenize,ner")


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



Building prefix dict from the default dictionary ...
2020-07-05 15:27:32 DEBUG: Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2020-07-05 15:27:32 DEBUG: Loading model from cache /tmp/jieba.cache
Loading model cost 0.431 seconds.
2020-07-05 15:27:32 DEBUG: Loading model cost 0.431 seconds.
Prefix dict has been built successfully.
2020-07-05 15:27:32 DEBUG: Prefix dict has been built successfully.
2020-07-05 15:27:33 INFO: "zh" is an alias for "zh-hans"
2020-07-05 15:27:33 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package   |
-------------------------
| tokenize  | gsdsimp   |
| ner       | ontonotes |

2020-07-05 15:27:33 INFO: Use device: cpu
2020-07-05 15:27:33 INFO: Loading: tokenize
2020-07-05 15:27:33 INFO: Loading: ner
2020-07-05 15:27:34 INFO: Done loading processors!


## NER Tagger

In [5]:
def get_ckip_ner(doc):
    """Extract the PERSON entities from one sentence of CKIP NER output.

    Args:
        doc: Iterable of entity records. Only index 2 (the entity label) and
            index 3 (the entity text) are read for the result. Per the
            ckiptagger README each record is a ``(start, end, label, text)``
            tuple and the records of a sentence come as a *set* — confirm
            against the installed version.

    Returns:
        A list of ``[text, label]`` pairs, one per record whose label is
        exactly ``"PERSON"``, in the sorted order of the input records.
    """
    # A set has no stable iteration order (string hashing is randomized per
    # process), so sort the records first; otherwise the same article could
    # list its names in a different order on every kernel restart.
    return [
        [entity[3], entity[2]]
        for entity in sorted(doc)
        if entity[2] == "PERSON"
    ]

In [6]:
def get_spacy_ner(doc):
    """Collect the PERSON entities of a spaCy document.

    Args:
        doc: Object exposing ``ents``, an iterable of entity spans that each
            provide ``text`` and ``label_``.

    Returns:
        A list of ``[text, label_]`` pairs for every span whose ``label_``
        equals ``"PERSON"``, in the order ``doc.ents`` yields them.
    """
    people = []
    for span in doc.ents:
        if span.label_ != "PERSON":
            continue
        people.append([span.text, span.label_])
    return people

In [7]:
def get_stanza_ner(doc):
    """Collect the tokens of a Stanza document that are tagged as a person.

    Args:
        doc: Object exposing ``sentences``; each sentence exposes ``tokens``,
            and each token provides ``text`` and ``ner``.

    Returns:
        A list of ``[text, ner]`` pairs, one per token whose ``ner`` tag
        contains the substring ``"PERSON"``, in document order. The match is
        a substring test, presumably because the tags carry a positional
        prefix (e.g. ``B-``/``E-``/``S-``) — confirm against the Stanza docs.
    """
    people = []
    for sentence in doc.sentences:
        for tok in sentence.tokens:
            if "PERSON" in tok.ner:
                people.append([tok.text, tok.ner])
    return people

In [8]:
def _print_ner_section(title, entities):
    """Print one tagger's entity count, then one line per ``[text, label]`` pair."""
    print(f"{title}: {len(entities)}")
    # With no entities the unpacked print still emits a single blank line,
    # which keeps the three sections visually aligned.
    print(*[f'token: {entity[0]}\t\tner: {entity[1]}' for entity in entities], sep="\n")


def print_NER(corpus):
    """Run the CKIP, spaCy and Stanza taggers on one text and print their PERSON hits.

    All three taggers are run before anything is printed; the output is three
    sections (CKIP, SPACY, STANZA) separated by a dashed line, each showing
    the number of hits followed by one ``token / ner`` line per hit.

    Args:
        corpus: A single text string. Despite the name it is one document,
            not a collection: it is wrapped in a one-element list for CKIP
            and passed directly to the spaCy and Stanza pipelines.

    Returns:
        None. The results are written to standard output only.
    """
    # CKIP works on batches: segment, POS-tag, then tag entities.
    word_sentence_list = ws([corpus])
    pos_sentence_list = pos(word_sentence_list)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)

    spacy_doc = nlp(corpus)
    stanza_doc = snlp(corpus)

    # The batch had exactly one text, so its result sits at index 0.
    sections = [
        ("CKIP", get_ckip_ner(entity_sentence_list[0])),
        ("SPACY", get_spacy_ner(spacy_doc)),
        ("STANZA", get_stanza_ner(stanza_doc)),
    ]

    for position, (title, entities) in enumerate(sections):
        if position > 0:
            print("-------------------------")
        _print_ner_section(title, entities)

In [15]:
# Position of the article to inspect; it indexes both `corpus` and `name_list`.
i = 290

# Show the raw article text and its labelled names in one call. The newline
# separator reproduces the line break that two separate print() calls emit.
print(
    f"Corpus:\n{corpus[i]}\n\n",
    f"Labels: {name_list[i]}\n\n",
    sep="\n",
)
print_NER(corpus[i])

Corpus:
為達最佳瀏覽效果,建議使用 Chrome、Firefox 或 Microsoft Edge 的瀏覽器。 自由時報版權所有不得轉載© 2020 The Liberty Times. All Rights Reserved.


Labels: ['王春容', '蔡登裕', '郭再旺', '郭義禮', '吳勝夫', '王協模', '廖聰昇']


CKIP: 0

-------------------------
SPACY: 0

-------------------------
STANZA: 0

