# 使用NER、SRL自動回覆

## NER 工具：spaCy

In [1]:
import spacy
#python -m spacy download zh_core_web_sm
#python -m spacy download zh_core_web_lg

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0
2022-09-14 15:08:28.208789: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-14 15:08:29.128168: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-14 15:08:29.512027: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-14 15:08:31.862296: W tensorflow/stream

In [2]:
# Load the small Chinese spaCy pipeline; the model has to be installed first
# (`python -m spacy download zh_core_web_sm`).
nlp = spacy.load('zh_core_web_sm')

In [3]:
# Show the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner']

In [4]:
# Run the pipeline on a first sample sentence; recognised entities end up in doc.ents.
doc = nlp('今天天氣真好，颱風沒有登陸台灣本島。')

In [5]:
# Print one line per recognised entity: text | label, then spaCy's description of the label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

今天 | DATE Absolute or relative dates or periods
台灣 | GPE Countries, cities, states


In [6]:
from spacy import displacy
# Visualise the entities of `doc` using displaCy's entity ('ent') style.
displacy.render(doc, style='ent')

In [7]:
# Show the label set of the pipeline's 'ner' component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [8]:
# Second sample sentence: contains a person name and a date.
doc = nlp('孫中山在10月10日建立了中華民國')

In [9]:
# Print one line per recognised entity: text | label, then spaCy's description of the label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

孫中山 | PERSON People, including fictional
10月10日 | DATE Absolute or relative dates or periods


In [10]:
# Third sample sentence: a person name, a place of origin, a birth date and a university.
doc = nlp('鄭中遠是石牌人，出生於1990年3月8日，目前就讀陽明交通大學。')

In [11]:
# Print one line per recognised entity: text | label, then spaCy's description of the label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

石牌人 | PERSON People, including fictional
1990年3月8日 | DATE Absolute or relative dates or periods


In [12]:
# Tokens 0-1 make up the person name, which the model did not tag as an entity.
doc[0:2]

鄭中遠

In [13]:
# Tokens 14-16 make up the university name, which the model also left untagged.
doc[14:17]

陽明交通大學

## 如何自己新增Label

In [14]:
from spacy.tokens import Span

In [15]:
# Build the two missing entity spans by hand (end index is exclusive):
# doc[0:2] is the person name, doc[14:17] is the university name.
s1 = Span(doc, 0, 2, label='PERSON')
s2 = Span(doc, 14, 17, label='ORG')

In [16]:
# Register the hand-made spans as entities; default='unmodified' leaves the
# entities the model already predicted in place (mutates `doc` in place).
doc.set_ents([s1, s2], default='unmodified')

In [17]:
# Print one line per entity (model-predicted and manually added): text | label,
# then spaCy's description of the label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

鄭中遠 | PERSON People, including fictional
石牌人 | PERSON People, including fictional
1990年3月8日 | DATE Absolute or relative dates or periods
陽明交通大學 | ORG Companies, agencies, institutions, etc.


In [18]:
# Visualise the entities again, now including the manually added spans.
displacy.render(doc, style='ent')

## 使用規則來新增實體

In [19]:
# Sentence containing a phone number; the model's NER label set has no phone-number label.
doc = nlp('鄭中遠的手機號碼是 0912-884-930 。')

In [20]:
# Visualise what the statistical model tags in the phone-number sentence.
displacy.render(doc, style='ent')

In [21]:
# Start from a blank English pipeline for the rule-based demo.
# NOTE(review): this rebinds `nlp`, discarding the zh_core_web_sm pipeline loaded
# earlier; any earlier cell re-run after this one will silently use the blank
# pipeline instead. Consider a separate name (e.g. `rule_nlp`).
# NOTE(review): presumably 'en' is used so that the tokenizer splits
# "0912-884-930" into digit groups and '-' tokens, as the pattern needs — confirm.
nlp = spacy.blank('en')

In [22]:
# Add a rule-based 'entity_ruler' component and keep a handle to it for registering patterns.
ruler = nlp.add_pipe('entity_ruler')

In [23]:
# Token-pattern reference: https://spacy.io/usage/rule-based-matching
# A phone number is matched as five consecutive tokens:
# four digits, '-', three digits, '-', three digits.
phone_number_tokens = [
    {'SHAPE': 'dddd'},
    {'ORTH': '-'},
    {'SHAPE': 'ddd'},
    {'ORTH': '-'},
    {'SHAPE': 'ddd'},
]
patterns = [{'label': 'PHONE_NUMBER', 'pattern': phone_number_tokens}]

In [24]:
# Register the phone-number rule with the entity ruler.
ruler.add_patterns(patterns)

In [25]:
# Re-process the phone-number sentence, this time with the rule-based pipeline.
doc = nlp('鄭中遠的手機號碼是 0912-884-930 。')

In [26]:
# Visualise the entities produced by the entity ruler.
displacy.render(doc, style='ent')

## Semantic Role Labeling
### 參考文獻：<https://aclanthology.org/L16-1262.pdf>

In [27]:
import os

# Hide all GPUs from this kernel so the parsers loaded below run on CPU.
# The previous `!export CUDA_VISIBLE_DEVICES=""` had no effect: `!` runs the
# command in a throwaway subshell, so the variable never reached the kernel
# process. Setting it on os.environ changes the kernel's own environment.
# NOTE(review): the variable is only honoured if it is set before CUDA is first
# initialised in this process; if an earlier import already touched CUDA,
# restart the kernel and run this cell first — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

from supar import Parser
import torch

In [28]:
# Load the pretrained 'biaffine-dep-zh' model (used below to produce a dependency parse).
parser = Parser.load('biaffine-dep-zh')

In [35]:
# Parse a raw (untokenised) sentence; prob=True also keeps the arc probabilities.
# NOTE: the model's training set is in Simplified Chinese, which presumably
# explains the poor segmentation of this Traditional Chinese input.
dataset = parser.predict('今天天氣很好，颱風沒有登陸台灣本島。', lang='zh', prob=True, verbose=False)

In [36]:
# Show the parse of the first (and only) sentence.
dataset[0]

1	今天	_	_	_	_	4	tmod	_	_
2	天	_	_	_	_	4	nsubj	_	_
3	氣	_	_	_	_	4	nsubj	_	_
4	很好	_	_	_	_	0	root	_	_
5	，	_	_	_	_	4	punct	_	_
6	颱	_	_	_	_	7	nn	_	_
7	風	_	_	_	_	8	nn	_	_
8	沒	_	_	_	_	9	top	_	_
9	有	_	_	_	_	4	conj	_	_
10	登陸	_	_	_	_	12	vmod	_	_
11	台	_	_	_	_	12	nn	_	_
12	灣	_	_	_	_	13	nn	_	_
13	本島	_	_	_	_	9	dobj	_	_
14	。	_	_	_	_	4	punct	_	_

In [32]:
# For the first sentence, show the predicted head index and relation of every
# token, plus the probability the model assigned to each predicted head.
# NOTE(review): the saved output of this cell lists 10 tokens while the
# `dataset[0]` output above has 14, so it comes from an earlier run with a
# different input; re-run the notebook top to bottom to refresh it.
arcs = dataset.arcs[0]
rels = dataset.rels[0]
# Pick, per token (row), the probability in the column of its predicted head.
head_index = torch.tensor(arcs).unsqueeze(1)
arc_probs = dataset.probs[0].gather(1, head_index).squeeze(-1)
print(f"arcs:  {arcs}\n"
      f"rels:  {rels}\n"
      f"probs: {arc_probs}")

arcs:  [3, 3, 0, 3, 7, 7, 3, 9, 7, 3]
rels:  ['tmod', 'nsubj', 'root', 'punct', 'nsubj', 'neg', 'conj', 'nn', 'dobj', 'punct']
probs: tensor([0.9944, 0.9999, 0.9976, 0.9989, 0.9999, 1.0000, 0.9982, 0.9998, 0.9996,
        0.9959])


In [34]:
# Constituency parse of the same sentence, supplied as an already tokenised
# word list, printed as an ASCII tree.
con = Parser.load('crf-con-zh')
tokens = ['今天', '天氣', '很好', '，', '颱風', '没有', '登陸', '台灣', '本島', '。']
con_trees = con.predict(tokens, verbose=False)
con_trees[0].pretty_print()

                                                 
                 |                                
                 IP                              
      ___________|_____________________________   
     |       |            IP                   | 
     |       |    ________|___                 |  
     |       |   |            VP               | 
     |       |   |    ________|___             |  
     |       |   |   |            VP           | 
     |       |   |   |     _______|____        |  
     IP      |   |   |    |            NP      | 
  ___|___    |   |   |    |        ____|___    |  
 NP  NP  VP  |   NP ADVP  |      ADJP      NP  | 
 |   |   |   |   |   |    |       |        |   |  
 _   _   _   _   _   _    _       _        _   _ 
 |   |   |   |   |   |    |       |        |   |  
 今天  天氣  很好  ，   颱風  没有   登陸      台灣       本島  。 

