In [1]:
import tensorflow as tf

In [2]:
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
!pip install -U ckip-transformers

Collecting ckip-transformers
  Using cached ckip_transformers-0.3.2-py3-none-any.whl (26 kB)
Collecting torch>=1.5.0
  Using cached torch-1.12.1-cp39-none-macosx_11_0_arm64.whl (49.1 MB)
Installing collected packages: torch, ckip-transformers
Successfully installed ckip-transformers-0.3.2 torch-1.12.1


In [3]:
import torch
print(torch.__version__)
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())
device = torch.device("mps")

1.13.0.dev20220930
True
True


In [35]:
ws_driver = CkipWordSegmenter(device=device)
pos_driver = CkipPosTagger(device=device)
ner_driver = CkipNerChunker(device=device)

In [50]:
def ckipped_ws(input):
    input = [input]
    ws = ws_driver(input, use_delim = False)
    return ' '.join(ws[0])

def ckipped_pos(input):
    input = [input]
    ws = ws_driver(input, use_delim = False)
    pos = pos_driver(ws, use_delim = False)

    assert len(ws) == len(pos)
    res = []
    for word_ws, word_pos in zip(ws, pos):
       for wws, wpos in zip(word_ws, word_pos):
         res.append((wws,wpos))
    
    wp = [' '.join(r) for r in res]

    return ' '.join(wp)

def ckipped_ner(input):
    input = [input]
    ner = ner_driver(input, use_delim=False)
    ner_word = [i[0]  for n in ner for i in n]
    ner_type = [i[1]  for n in ner for i in n]
    return ner_word, ner_type
            
    

In [51]:
sent = '今天的社科院圖書館好熱'
ws = ckipped_ws(sent)
pos = ckipped_pos(ws)
ner_w, ner_t = ckipped_ner(sent)
print(pos)
print(ner_w)
print(ner_t)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 9362.29it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  3.90it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 15887.52it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00, 32.35it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 3785.47it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  4.74it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 10205.12it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  4.91it/s]

今天 Nd   WHITESPACE 的 DE   WHITESPACE 社科院 Nc   WHITESPACE 圖書館 Nc   WHITESPACE 好 Dfa   WHITESPACE 熱 VHC
['今天', '社科院']
['DATE', 'ORG']





In [30]:
input = ['美國參議院針對今天總統布什所提名的勞工部長趙小蘭展開認可聽證會，預料她將會很順利通過參議院支持，成為該國有史以來第一位的華裔女性內閣成員。']
ner = ner_driver(input, use_delim=False)
ner

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 9098.27it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.49it/s]


[[NerToken(word='美國參議院', ner='ORG', idx=(0, 5)),
  NerToken(word='今天', ner='LOC', idx=(7, 9)),
  NerToken(word='布什', ner='PERSON', idx=(11, 13)),
  NerToken(word='勞工部長', ner='ORG', idx=(17, 21)),
  NerToken(word='趙小蘭', ner='PERSON', idx=(21, 24)),
  NerToken(word='認可聽證會', ner='EVENT', idx=(26, 31)),
  NerToken(word='參議院', ner='ORG', idx=(42, 45)),
  NerToken(word='第一', ner='ORDINAL', idx=(56, 58)),
  NerToken(word='華裔', ner='NORP', idx=(60, 62))]]

In [57]:
def ckipped(input):
    input = [input]    
    ws = ws_driver(input, use_delim = False)
    pos = pos_driver(ws, use_delim = False)

    assert len(ws) == len(pos)
    res = []
    for word_ws, word_pos in zip(ws, pos):
       for wws, wpos in zip(word_ws, word_pos):
         res.append((wws,wpos))
    print('ckipped done')
    return res
  
def cwn_tagged(lemma):
    if cwn_tagger is None:
       print('re-initializing ckip...')
       warmup()
    
    tagged = cwn_tagger.find_lemma(lemma)
    senses = tagged[0].senses
    num_of_sense = len(senses)

    return senses, num_of_sense

In [54]:
input= '三角函數很棒'
res = ckipped(input)

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 4120.14it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  3.13it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 8905.10it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  4.22it/s]

ckipped done





In [44]:
from CwnGraph import CwnImage

In [64]:
word = [r[0] for r in res]
pos = [r[1] for r in res]

senses, num_of_sense = [], []
for w in word:
    s, ns = cwn_tagged(w)
    senses.append(s)
    num_of_sense.append(ns)
print(senses)
print(num_of_sense)


[[], [<CwnSense[04040801](很少，VH): 形容數量少。>, <CwnSense[04040802](很少，Neqa): 數量少。>, <CwnSense[04040803](很少，VH): 形容低於預期程度。>, <CwnSense[04040804](很少，D): 表事件發生的頻率比預期低。>], [<CwnSense[05130301](接棒，VB,nom): 接力賽跑中，接受他人傳交的棒子。>, <CwnSense[05130302](接棒，VB,nom): 接力賽跑中，接交棒子的過程。>, <CwnSense[05130303](接棒，VB,nom): 接替責任或所有權。>, <CwnSense[05130304](接棒，VB,nom): 延續承接股盤漲勢。>]]
[0, 4, 4]


In [60]:
cwn_tagger = CwnImage.latest()
lemmas = cwn_tagger.find_lemma("電腦")
senses = lemmas[0].senses
senses

[<CwnSense[06613601](電腦，Na): 一種資料處理裝置，能自動接受並儲存、處理輸入的資料，然後經由一組預先存放在機器內的指令逐步引導下產生輸出結果。>,
 <CwnSense[06613602](電腦，Na): 研究或操作電腦的知識。>,
 <CwnSense[06613603](電腦，Na): 比喻計算或記憶能力很強的人。>]

In [54]:
ptt = pd.read_csv('../../nlp_web/assignments/ptt-crawler/data/Soft_job/2019/Soft_job_2019_10.csv')

In [60]:
ptt.head()

Unnamed: 0,author,alias,title,date,ip,city,country,ups,downs,comments,url
0,eacdpizzy,I love baseball,[徵才] 成醫臨醫中心徵資訊演算法博士級研究員,2019-10-07 12:56:03,140.116.253.41,Tainan City,Taiwan,1,1,1,https://www.ptt.cc/bbs/Soft_Job/M.1570424165.A...
1,ting8489,Allie [33mapolkingg8: m,Re: [討論] Positive Grid 佳格科技 (板主代Po),2019-10-07 11:28:47,122.116.28.34,New Taipei,Taiwan,19,27,8,https://www.ptt.cc/bbs/Soft_Job/M.1570418929.A...
2,ting8489,Allie [33mapolkingg8: m,Re: [討論] Positive Grid 佳格科技 (板主代Po),2019-10-06 21:53:37,122.116.20.215,New Taipei,Taiwan,18,24,29,https://www.ptt.cc/bbs/Soft_Job/M.1570370019.A...
3,s89227,Kei,[討論] Positive Grid 佳格科技 (板主代Po),2019-10-06 18:38:07,101.13.131.211,Sanchong District,Taiwan,16,21,22,https://www.ptt.cc/bbs/Soft_Job/M.1570358292.A...
4,vu04y94,今,[討論] 比薪網站ursalary被消失了?,2019-10-06 17:53:53,27.242.103.249,New Taipei,Taiwan,8,0,1,https://www.ptt.cc/bbs/Soft_Job/M.1570355635.A...


In [64]:
title = ptt.title.to_list()
title

['[徵才] 成醫臨醫中心徵資訊演算法博士級研究員',
 'Re: [討論] Positive Grid 佳格科技 (板主代Po)',
 'Re: [討論] Positive Grid 佳格科技 (板主代Po)',
 '[討論] Positive Grid 佳格科技 (板主代Po)',
 '[討論] 比薪網站ursalary被消失了?',
 '[心得] 如何開始內部易用性測試？',
 '[請益] 各種軟體授權的商業使用',
 '[徵才] 新創徵專任研究助理',
 '[徵才]  Shinli 鑫利 徵多位工程師 50K-200K',
 '[請益] 有沒有推薦的自然語言處理課程?',
 '[請益] offer選擇 金融 博弈',
 '[請益] 當SVN update後，檔案遇到三方衝突...',
 '[情報] 金融科技自動交易訓練班第04期(政府補助)',
 '[請益] 如何向老闆證明Server 100 GB不夠用？',
 '[心得] 由田面試心得',
 '[請益] OFFER 請益',
 '[請益] AI人工智慧 vs VR虛擬實境',
 '[請益] 從中國回台的半路出家工程師找工作疑問',
 '[徵才] 成醫臨醫中心徵資訊演算法碩士級研究員',
 'Offer 請益(公部門/安永）']

In [63]:
import re

In [65]:
search_word = '徵才'
for t in title:
    if re.search(search_word, t):
        print(t)

[徵才] 成醫臨醫中心徵資訊演算法博士級研究員
[徵才] 新創徵專任研究助理
[徵才]  Shinli 鑫利 徵多位工程師 50K-200K
[徵才] 成醫臨醫中心徵資訊演算法碩士級研究員
