In [37]:
# 關係抽取
# https://github.com/percent4/spo_extract_platform
# https://geek.digiasset.org/pages/nlp/nlpinfo/spacy-python-information-extraction_22Jul07203415699453/

In [38]:
# 匯入套件
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 

# Matcher 用於定義和匹配自訂的文字模式。它可以協助您搜尋文字中的特定結構或模式。
from spacy.matcher import Matcher 
# spaCy 庫中的一個類別，表示文字中的一個片段或子字串。它通常用於提取或標記文字中的特定部分。
from spacy.tokens import Span 
# displacy is spaCy's built-in visualizer; it renders dependency parses and named entities, e.g. inline in a Jupyter Notebook.
from spacy import displacy 



In [39]:
# Load the small English spaCy pipeline.
# https://www.projectpro.io/recipes/install-and-use-spacy-models
#
# The model is only downloaded when it is not installed yet, so re-running the
# notebook does not re-install it on every pass. The download goes through
# spacy.cli.download instead of a bare `!python ...` shell call, which may
# resolve to a different interpreter than the one running this kernel.
MODEL_NAME = "en_core_web_sm"

try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    from spacy.cli import download as download_model

    download_model(MODEL_NAME)
    # NOTE(review): if this load still fails right after a fresh install,
    # restart the kernel so the newly installed package becomes importable.
    nlp = spacy.load(MODEL_NAME)

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 10.5 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


2023-09-12 21:44:34.522897: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2023-09-12 21:44:34.524418: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-09-12 21:44:42.650081: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'nvcuda.dll'; dlerror: nvcuda.dll not found
2023-09-12 21:44:42.650929: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-09-12 21:44:42.662546: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: SUNNY
2023-09-12 21:44:42.662921: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: SUNNY


In [40]:
# Example sentence that the following cells analyse.
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."

# Run the loaded spaCy pipeline over the sentence; the result is the parsed
# document that later cells iterate over and match against.
doc = nlp(text)

In [41]:
# Print each token with its dependency label and part-of-speech tag.
#
# To extract the information we need from the sentence above, it is important
# to understand its syntactic structure first - e.g. the subject, object,
# modifiers and parts of speech (POS) in the sentence.
for tok in doc:
    print(tok.text, tok.dep_, tok.pos_, sep=" --> ")

GDP --> nsubj --> NOUN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> ADP
Vietnam --> pobj --> PROPN
will --> aux --> AUX
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


## Y such as X 規則

In [44]:
# Define the pattern "Y such as X".
#
# In English, the words "such" and "as" are preceded by a noun ("countries")
# and followed by a proper noun ("Vietnam"), which is the hyponym.
# The token pattern below encodes exactly that structure.
pattern = [
    {"POS": "NOUN"},     # Y: the general noun, e.g. "countries"
    {"LOWER": "such"},
    {"LOWER": "as"},
    {"POS": "PROPN"},    # X: the proper noun, e.g. "Vietnam"
]

# Create a Matcher bound to the pipeline's vocabulary and register the pattern.
matcher = Matcher(nlp.vocab)
matcher.add("matching", [pattern])

# Run the matcher over the document; each match is a
# (match_id, start_token, end_token) tuple.
matches = matcher(doc)
print(matches)

# Extract the text of the first match. Guard against an empty result so a
# sentence without the pattern reports that instead of raising IndexError.
if matches:
    match_id, start, end = matches[0]
    span = doc[start:end]
    print(span.text)
else:
    print("No 'Y such as X' match found.")

[(1221037237276548748, 3, 7)]
countries such as Vietnam


In [63]:
# Subtree matching for relation extraction.

# Re-parse the sample sentence so this cell does not depend on an earlier `doc`.
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."
doc = nlp(text)

# Draw the dependency graph of the sentence inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

## 修飾語 +  Y such as X 規則

In [45]:
# Fresh Matcher so the pattern registered in the previous section is not reused.
matcher = Matcher(nlp.vocab)

# Pattern: optional adjectival modifier + "Y such as X".
pattern = [
    {"DEP": "amod", "OP": "?"},  # adjectival modifier, zero or one
    {"POS": "NOUN"},
    {"LOWER": "such"},
    {"LOWER": "as"},
    {"POS": "PROPN"},
]

# Because the modifier is optional, the pattern matches both with and without
# it, producing overlapping matches. greedy="LONGEST" keeps only the longest
# of the overlapping candidates, so the selected span does not depend on the
# order in which the matcher returns them.
matcher.add("matching", [pattern], greedy="LONGEST")

# NOTE(review): `doc` is whatever document the previously executed cell left
# in the kernel - confirm the intended sentence when running out of order.
matches = matcher(doc)

# Guard against an empty result instead of raising IndexError on matches[0].
if matches:
    match_id, start, end = matches[0]
    span = doc[start:end]
    print(span.text)
else:
    print("No 'modifier + Y such as X' match found.")

developing countries such as Vietnam


## X, especially Y 規則

In [60]:
# Parse an example sentence containing the "X, especially Y" construction.
doc = nlp("A healthy eating pattern includes fruits, especially whole fruits.")

# Show every token together with its dependency label and POS tag.
for token in doc:
    fields = (token.text, "-->", token.dep_, "-->", token.pos_)
    print(*fields)

A --> det --> DET
healthy --> amod --> ADJ
eating --> compound --> NOUN
pattern --> nsubj --> NOUN
includes --> ROOT --> VERB
fruits --> dobj --> NOUN
, --> punct --> PUNCT
especially --> advmod --> ADV
whole --> amod --> ADJ
fruits --> appos --> NOUN
. --> punct --> PUNCT


In [61]:
# Fresh Matcher so patterns registered in earlier sections are not reused.
matcher = Matcher(nlp.vocab)

# Pattern: "X, especially Y", where both X and Y are nouns that may carry an
# optional numeric modifier and an optional adjectival modifier.
pattern = [
    {"DEP": "nummod", "OP": "?"},
    {"DEP": "amod", "OP": "?"},
    {"POS": "NOUN"},              # X
    {"IS_PUNCT": True},           # any punctuation token (a comma in the example)
    {"LOWER": "especially"},
    {"DEP": "nummod", "OP": "?"},
    {"DEP": "amod", "OP": "?"},
    {"POS": "NOUN"},              # Y
]

# The optional tokens can yield overlapping matches of different lengths;
# greedy="LONGEST" keeps only the longest of the overlapping candidates.
matcher.add("matching", [pattern], greedy="LONGEST")

matches = matcher(doc)

# Guard against an empty result instead of raising IndexError on matches[0].
if matches:
    match_id, start, end = matches[0]
    span = doc[start:end]
    print(span.text)
else:
    print("No 'X, especially Y' match found.")

fruits, especially whole fruits
