# 使用NER、SRL自動回覆

## NER工具 spacy

In [1]:
import spacy
# The pretrained Chinese models must be downloaded once from a shell before
# spacy.load() can find them:
#python -m spacy download zh_core_web_sm
#python -m spacy download zh_core_web_lg

2022-09-12 12:01:40.384211: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-12 12:01:40.534960: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-12 12:01:40.561868: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-12 12:01:41.011278: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [2]:
# Load the small pretrained Chinese pipeline.
nlp = spacy.load('zh_core_web_sm')

In [3]:
# List the components of the loaded pipeline, in order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'ner']

In [4]:
# Run the full pipeline on a first sample sentence.
doc = nlp('今天天氣真好，颱風沒有登陸台灣本島。')

In [5]:
# Print every recognised entity as "text | LABEL description".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

今天 | DATE Absolute or relative dates or periods
台灣 | GPE Countries, cities, states


In [6]:
from spacy import displacy
# Visualise the entities found in `doc` with displaCy's entity style.
displacy.render(doc, style='ent')

In [7]:
# Show the label set of the pipeline's 'ner' component.
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [8]:
# Second sample sentence: a person name plus a date.
doc = nlp('孫中山在10月10日建立了中華民國')

In [9]:
# Print every recognised entity as "text | LABEL description".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

孫中山 | PERSON People, including fictional
10月10日 | DATE Absolute or relative dates or periods


In [10]:
# Third sample sentence. As the entity listing for this doc shows, the small
# model tags 石牌人 as PERSON and does not pick up the name or the university.
doc = nlp('鄭中遠是石牌人，出生於1990年3月8日，目前就讀陽明交通大學。')

In [11]:
# Print every recognised entity as "text | LABEL description".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

石牌人 | PERSON People, including fictional
1990年3月8日 | DATE Absolute or relative dates or periods


In [12]:
# Tokens 0-1 together spell the person's name that the NER step missed.
doc[0:2]

鄭中遠

In [13]:
# Tokens 14-16 together spell the university name that the NER step missed.
doc[14:17]

陽明交通大學

## 如何自己新增Label

In [14]:
from spacy.tokens import Span

In [15]:
# Build the missing entities by hand from token offsets (end index exclusive):
# tokens 0-1 as a PERSON and tokens 14-16 as an ORG.
s1 = Span(doc, 0, 2, label='PERSON')
s2 = Span(doc, 14, 17, label='ORG')

In [16]:
# Attach the two manual spans to the doc. default='unmodified' leaves the
# entity annotations of all other tokens as the model predicted them.
doc.set_ents([s1, s2], default='unmodified')

In [17]:
# Print every entity (model-predicted and manually added) as
# "text | LABEL description".
for entity in doc.ents:
    label = entity.label_
    print(entity.text, '|', label, spacy.explain(label))

鄭中遠 | PERSON People, including fictional
石牌人 | PERSON People, including fictional
1990年3月8日 | DATE Absolute or relative dates or periods
陽明交通大學 | ORG Companies, agencies, institutions, etc.


In [18]:
# Visualise the entities again, now including the manually added spans.
displacy.render(doc, style='ent')

## 使用規則（EntityRuler）來新增實體

In [19]:
# Sample sentence containing a mobile phone number, processed with the
# pipeline currently bound to `nlp`.
doc = nlp('鄭中遠的手機號碼是 0912-884-930 。')

In [20]:
# Visualise whatever entities the current pipeline found in the sentence.
displacy.render(doc, style='ent')

In [33]:
# Replace `nlp` with an empty English pipeline (no trained components).
# NOTE(review): this rebinds the name that previously held the Chinese model,
# and the text processed with it is Chinese. Presumably 'en' was chosen so the
# English tokenizer splits the phone number at the hyphens - confirm, and
# consider a distinct variable name so earlier cells stay re-runnable.
nlp = spacy.blank('en')

In [34]:
# Add a rule-based entity component and keep a handle to it so patterns can
# be registered on it.
ruler = nlp.add_pipe('entity_ruler')

In [35]:
# Reference: https://spacy.io/usage/rule-based-matching
# Token-by-token description of a phone number written as dddd-ddd-ddd:
# a 4-digit token, a hyphen, a 3-digit token, a hyphen, a 3-digit token.
phone_number_tokens = [
    {'SHAPE': 'dddd'},
    {'ORTH': '-'},
    {'SHAPE': 'ddd'},
    {'ORTH': '-'},
    {'SHAPE': 'ddd'},
]

# One rule: any token sequence matching the shape above is a PHONE_NUMBER.
patterns = [
    {'label': 'PHONE_NUMBER', 'pattern': phone_number_tokens},
]

In [36]:
# Register the phone-number rule with the entity ruler.
ruler.add_patterns(patterns)

In [37]:
# Process the same phone-number sentence with the rule-based pipeline.
doc = nlp('鄭中遠的手機號碼是 0912-884-930 。')

In [38]:
# Visualise the entities produced by the entity ruler.
displacy.render(doc, style='ent')

## Semantic Role Labeling（以 SuPar 進行依存句法與成分句法分析）

In [3]:
import os

# Hide every GPU from this kernel so the parsers run on CPU.
# The variable has to be set in the kernel's own process environment:
# `!export CUDA_VISIBLE_DEVICES=""` only changes a short-lived subshell and
# leaves the Python process untouched. It is set before torch is imported so
# that it is already in place whenever CUDA gets initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import torch
from supar import Parser

In [4]:
# Load SuPar's pretrained 'biaffine-dep-zh' model.
parser = Parser.load('biaffine-dep-zh')

  return torch._C._cuda_getDeviceCount() > 0


In [9]:
# Parse one sentence; prob=True is requested because dataset.probs is read
# when the per-arc probabilities are printed.
dataset = parser.predict('今天天气很好，台风没有登陆台湾本岛。', lang='zh', prob=True, verbose=False) # the training set uses Simplified Chinese...

In [10]:
# Show the first (only) parsed sentence: one row per token with its head
# index and dependency relation.
dataset[0]

1	今天	_	_	_	_	3	tmod	_	_
2	天气	_	_	_	_	3	nsubj	_	_
3	很好	_	_	_	_	0	root	_	_
4	，	_	_	_	_	3	punct	_	_
5	台风	_	_	_	_	7	nsubj	_	_
6	没有	_	_	_	_	7	neg	_	_
7	登陆	_	_	_	_	3	conj	_	_
8	台湾	_	_	_	_	9	nn	_	_
9	本岛	_	_	_	_	7	dobj	_	_
10	。	_	_	_	_	3	punct	_	_

In [11]:
# Predicted head index and relation label for each token of the first sentence.
arcs = dataset.arcs[0]
rels = dataset.rels[0]

# For every token, pick out of probs[0] the entry at that token's predicted
# head index, giving one probability per predicted arc.
head_index = torch.tensor(arcs).unsqueeze(1)
arc_probs = dataset.probs[0].gather(1, head_index).squeeze(-1)

print(f"arcs:  {arcs}\nrels:  {rels}\nprobs: {arc_probs}")

arcs:  [3, 3, 0, 3, 7, 7, 3, 9, 7, 3]
rels:  ['tmod', 'nsubj', 'root', 'punct', 'nsubj', 'neg', 'conj', 'nn', 'dobj', 'punct']
probs: tensor([0.9944, 0.9999, 0.9976, 0.9989, 0.9999, 1.0000, 0.9982, 0.9998, 0.9996,
        0.9959])


In [27]:
# Load SuPar's pretrained 'crf-con-zh' model.
con = Parser.load('crf-con-zh')

# The sentence is passed already split into tokens.
tokens = ['今天', '天气', '很好', '，', '台风', '没有', '登陆', '台湾', '本岛', '。']

# Parse, take the tree of the first (only) sentence and draw it as ASCII art.
tree = con.predict(tokens, verbose=False)[0]
tree.pretty_print()

                                                
                 |                               
                 IP                             
      ___________|____________________________   
     |       |            IP                  | 
     |       |    ________|___                |  
     |       |   |            VP              | 
     |       |   |    ________|___            |  
     |       |   |   |            VP          | 
     |       |   |   |     _______|___        |  
     IP      |   |   |    |           NP      | 
  ___|___    |   |   |    |        ___|___    |  
 NP  NP  VP  |   NP ADVP  |       NP      NP  | 
 |   |   |   |   |   |    |       |       |   |  
 _   _   _   _   _   _    _       _       _   _ 
 |   |   |   |   |   |    |       |       |   |  
 今天  天气  很好  ，   台风  没有   登陆      台湾      本岛  。 

