In [1]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Notebook-wide display configuration.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used below; consider narrowing it.
warnings.filterwarnings("ignore")

# Render DataFrames without truncating rows or columns.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [2]:
# Load the English spaCy pipeline once; the cells below reuse it as `nlp`.
nlp=spacy.load('en')

In [5]:
# Sample sentence containing a "<noun> such as <proper noun>" construction.
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."

# Run the spaCy pipeline on the sentence; `doc` is iterated token by token below.
doc = nlp(text)

In [12]:
# One row per token: text, dependency label and coarse POS tag in fixed-width columns.
row_template = "{:12s}--->  {:6s}--->  {:6s}"
for token in doc:
    print(row_template.format(token.text, token.dep_, token.pos_))

GDP         --->  nsubj --->  NOUN  
in          --->  prep  --->  ADP   
developing  --->  amod  --->  VERB  
countries   --->  pobj  --->  NOUN  
such        --->  amod  --->  ADJ   
as          --->  prep  --->  SCONJ 
Vietnam     --->  pobj  --->  PROPN 
will        --->  aux   --->  VERB  
continue    --->  ROOT  --->  VERB  
growing     --->  xcomp --->  VERB  
at          --->  prep  --->  ADP   
a           --->  det   --->  DET   
high        --->  amod  --->  ADJ   
rate        --->  pobj  --->  NOUN  
.           --->  punct --->  PUNCT 


In [13]:
# Token pattern: <noun> "such" "as" <proper noun>
pattern = [
    {"POS": "NOUN"},
    {"LOWER": "such"},
    {"LOWER": "as"},
    {"POS": "PROPN"},
]

In [25]:
# Create a matcher on the pipeline's vocabulary and register the pattern on it.
matcher = Matcher(nlp.vocab)
matcher.add('matching1', None, pattern)

# Positions 1 and 2 of a match entry are used as the slice bounds into the doc.
matches = matcher(doc)
first_match = matches[0]
span = doc[first_match[1]:first_match[2]]

span.text

'countries such as Vietnam'

In [26]:
# Same "such as" pattern, now also capturing an adjectival modifier before the noun.
matcher = Matcher(nlp.vocab)

pattern = [
    {'DEP': 'amod', 'OP': "?"},  # adjectival modifier; "?" lets it occur once or not at all
    {'POS': 'NOUN'},
    {'LOWER': 'such'},
    {'LOWER': 'as'},
    {'POS': 'PROPN'},
]

matcher.add("matching2", None, pattern)
matches = matcher(doc)

first_match = matches[0]
span = doc[first_match[1]:first_match[2]]
print(span.text)

developing countries such as Vietnam


In [27]:
# Parse a sentence containing an "X and other Y" construction.
doc = nlp("Here is how you can keep your car and other vehicles clean.")

# One row per token: text, dependency label and coarse POS tag.
row_template = "{:12s}--->  {:6s}--->  {:6s}"
for token in doc:
    print(row_template.format(token.text, token.dep_, token.pos_))

Here        --->  advmod--->  ADV   
is          --->  ROOT  --->  AUX   
how         --->  advmod--->  ADV   
you         --->  nsubj --->  PRON  
can         --->  aux   --->  VERB  
keep        --->  ccomp --->  VERB  
your        --->  poss  --->  DET   
car         --->  dobj  --->  NOUN  
and         --->  cc    --->  CCONJ 
other       --->  amod  --->  ADJ   
vehicles    --->  conj  --->  NOUN  
clean       --->  oprd  --->  ADJ   
.           --->  punct --->  PUNCT 


In [28]:
# Bind the fresh Matcher to `matcher` (the name used by the calls below) so the
# pattern is registered on a clean instance rather than on a matcher that still
# carries patterns added by earlier cells.
matcher = Matcher(nlp.vocab)

# Pattern: [optional adjectival modifier] NOUN [and] [or] "other" NOUN
pattern = [{'DEP':'amod', 'OP':"?"},
           {'POS':'NOUN'},
           {'LOWER': 'and', 'OP':"?"},
           {'LOWER': 'or', 'OP':"?"},
           {'LOWER': 'other'},
           {'POS': 'NOUN'}]

matcher.add("matching_1", None, pattern)

# Print the text covered by the first match.
matches = matcher(doc)
span = doc[matches[0][1]:matches[0][2]]
print(span.text)

car and other vehicles


In [29]:
# Subtree matching for relation extraction

In [34]:
# Example sentence for the dependency-tree visualisation.
text='Honda was recently acquired by TVS'
# Parse the sentence, then render its dependency graph inline (jupyter=True).
doc = nlp(text)
displacy.render(doc, style='dep',jupyter=True)

In [36]:
# Same sentence with an extra descriptive phrase inserted after "Honda".
text='Honda,a major two wheeler company was recently acquired by TVS'
# Parse the sentence, then render its dependency graph inline (jupyter=True).
doc = nlp(text)
displacy.render(doc, style='dep',jupyter=True)

In [63]:
text = 'Honda was recently acquired by TVS'
# Parse the sentence (nothing is plotted here; the tokens are listed as text).
doc = nlp(text)

# One row per token: text, dependency label and coarse POS tag, 10 characters each.
row_template = "{:10s}{:10s}{:10s}"
for token in doc:
    print(row_template.format(token.text, token.dep_, token.pos_))

Honda     nsubjpass PROPN     
was       auxpass   AUX       
recently  advmod    ADV       
acquired  ROOT      VERB      
by        agent     ADP       
TVS       pobj      PROPN     


In [65]:
def subtree_matcher(doc):
    """Pick two token texts from a parsed sentence by dependency label.

    Written for passive-voice sentences: scans every token of ``doc`` and
    returns ``(x, y)`` where

    * ``x`` is the text of the last token whose ``dep_`` ends with ``'pobj'``
    * ``y`` is the text of the last token whose ``dep_`` ends with ``'nsubjpass'``

    Either element is the empty string when no token carries that label.
    """
    pobj_text, passive_subject_text = "", ""

    for token in doc:
        label = token.dep_
        # Passive subject (remember: the sentence is expected in passive voice).
        if label.endswith('nsubjpass'):
            passive_subject_text = token.text
        # Object of a preposition.
        if label.endswith('pobj'):
            pobj_text = token.text

    return pobj_text, passive_subject_text
       

In [67]:
# Passive-voice sentence: parse it and extract the pobj / passive-subject pair.
text='Honda was recently acquired by TVS'
doc = nlp(text)
subtree_matcher(doc)

('TVS', 'Honda')

In [70]:
# Same passive sentence with an extra descriptive phrase after "Honda".
text='Honda, a major two wheeler company was recently acquired by TVS'
doc = nlp(text)
subtree_matcher(doc)

('TVS', 'Honda')

In [71]:
# Active-voice version of the sentence, passed to the same extraction function.
text = "TVS recently acquired Honda."
doc = nlp(text)
subtree_matcher(doc)

('', '')

In [72]:
# List the dependency label and POS tag of every token in the current doc.
row_template = "{:10s}{:10s}{:10s}"
for token in doc:
    print(row_template.format(token.text, token.dep_, token.pos_))

TVS       nsubj     PROPN     
recently  advmod    ADV       
acquired  ROOT      VERB      
Honda     dobj      PROPN     
.         punct     PUNCT     


In [75]:
# Extraction function handling both active- and passive-voice sentences.
def subtree_matcher(doc):
    """Pick two token texts from a parsed sentence, for active or passive voice.

    The sentence is treated as passive when any token's ``dep_`` ends with
    ``'subjpass'``. The function returns ``(x, y)``:

    * passive: ``x`` is the last token whose ``dep_`` ends with ``'pobj'``,
      ``y`` the last token whose ``dep_`` ends with ``'subjpass'``
    * active: ``x`` is the last token whose ``dep_`` ends with ``'subj'``,
      ``y`` the last token whose ``dep_`` ends with ``'obj'`` (this suffix
      also covers labels such as ``dobj`` and ``pobj``)

    Either element is the empty string when no token carries that label.
    """
    # First pass: does any token carry a passive-subject label?
    passive_flags = [token.dep_.endswith('subjpass') for token in doc]
    is_passive = any(passive_flags)

    # Choose which label suffix feeds which slot of the result.
    if is_passive:
        x_suffix, y_suffix = 'pobj', 'subjpass'
    else:
        x_suffix, y_suffix = "subj", "obj"

    x_text, y_text = "", ""

    # Second pass: the last token matching each suffix wins.
    for token in doc:
        label = token.dep_
        if label.endswith(x_suffix):
            x_text = token.text
        if label.endswith(y_suffix):
            y_text = token.text

    return x_text, y_text

In [76]:
# Apply the current definition of subtree_matcher to the doc parsed above.
subtree_matcher(doc)

('TVS', 'Honda')

In [100]:
# Bind the fresh Matcher to `matcher` (the name used by the calls below) so the
# pattern is registered on a clean instance rather than on a matcher that still
# carries patterns added by earlier cells.
matcher = Matcher(nlp.vocab)

# Pattern: a token with dependency label ccomp, a noun, then an optional second noun.
pattern = [{'dep':'ccomp'},
           {'POS': 'NOUN'},
           {'POS': 'NOUN', 'OP':"?"}
          ]

matcher.add("matching_1", None, pattern)

# NOTE(review): `doc2` is not defined in the visible part of this notebook;
# confirm it is created by an earlier cell before this one runs.
matches = matcher(doc2)

# Print every matched span followed by a blank separator.
for match_id, start, end in matches:
    span = doc2[start:end]
    print(span.text)
    print('\n')

facing fuel


facing fuel leakage




In [347]:
# Token patterns tried one after another by the extraction cells below.

# Dependency label matching the regex "ccomp", a noun, then an optional second noun.
pattern1 = [
    {"dep": {"REGEX": "ccomp"}},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]

# Dependency label matching the regex "comp", a noun, optional aux, optional
# particle, then a ccomp token.
pattern2 = [
    {"dep": {"REGEX": "comp"}},
    {"POS": "NOUN"},
    {"dep": "aux", "OP": "?"},
    {"POS": "PART", "OP": "?"},
    {"dep": "ccomp"},
]

# Adjectival modifier directly followed by the sentence root.
pattern3 = [
    {"dep": {"IN": ["amod"]}},  # 'NOT_IN' can also be used here
    {"dep": "ROOT"},
]

# Noun, preposition, noun.
pattern4 = [
    {"POS": "NOUN"},
    {"dep": "prep"},
    {"POS": "NOUN"},
]

# Compound token, nominal subject, preposition.
pattern5 = [
    {"dep": "compound"},
    {"dep": "nsubj"},
    {"dep": "prep"},
]

# Compound token followed by a noun.
pattern6 = [
    {"dep": "compound"},
    {"POS": "NOUN"},
]

# Noun, optional aux, optional particle, then a verb or a noun.
pattern6b = [
    {"POS": "NOUN"},
    {"dep": "aux", "OP": "?"},
    {"POS": "PART", "OP": "?"},
    {"POS": {"IN": ["VERB", "NOUN"]}},
]

# Adjective, optional preposition, noun, optional second noun.
pattern7 = [
    {"POS": "ADJ"},
    {"dep": "prep", "OP": "?"},
    {"POS": "NOUN"},
    {"POS": "NOUN", "OP": "?"},
]

In [348]:
# Phrases containing any of these whitespace-separated words are discarded by
# info_search. Built once instead of on every loop iteration.
_INFO_SEARCH_STOPWORDS = frozenset([
    'said', 'state', 'stating', 'tell', 'told', 'stated', 'people', 'he', 'they', 'i', 'we',
    'this', 'that', 'them', 'those', 'cs', 'cust', 'customer', 'service', 'vehicle',
    'showroom', 'automobiles', 'motors', 'center', 'executives', 'his', 'him', 'purchase',
    'tvs', 'proper', 'solution', 'number', 'month', 'day',
])


def info_search(doc, pattern):
    """Return the distinct phrases of a text that match one spaCy token pattern.

    Parameters
    ----------
    doc : str
        Raw text (despite the name, a string rather than a parsed document).
        Runs of whitespace are collapsed to single spaces before parsing with
        the module-level ``nlp`` pipeline.
    pattern : list of dict
        A single ``Matcher`` token pattern, e.g. one of ``pattern1``..``pattern7``.

    Returns
    -------
    list of str or float
        The matched phrases in order of first appearance, without duplicates
        and without any phrase that contains a stop word. ``np.nan`` is
        returned when nothing remains or when ``doc`` is not a string.
    """
    # Non-string input (e.g. a missing value) cannot be split or parsed.
    if not isinstance(doc, str):
        return np.nan

    parsed = nlp(" ".join(doc.split()))  # collapse extra whitespace, then parse

    matcher = Matcher(nlp.vocab)
    matcher.add("matching_1", None, pattern)
    spans = [parsed[start:end] for _, start, end in matcher(parsed)]

    # A pattern ending in an optional token (such as pattern1) yields both the
    # two-token match and its longer extension. Drop a span when it equals the
    # first two tokens of the span that follows it. The filtered list is built
    # separately so no neighbour is skipped and no index can run out of range.
    kept = []
    for idx, span in enumerate(spans):
        has_next = idx + 1 < len(spans)
        if has_next and span == spans[idx + 1][:2]:
            continue
        kept.append(span)

    # Convert to text, drop duplicates (keeping first-seen order) and discard
    # phrases that contain an unwanted word.
    phrases = []
    seen = set()
    for span in kept:
        phrase = str(span)
        if phrase in seen:
            continue
        seen.add(phrase)
        if _INFO_SEARCH_STOPWORDS.isdisjoint(phrase.split()):
            phrases.append(phrase)

    if not phrases:
        return np.nan
    return phrases

In [349]:
# Run every pattern against the same text and print what each one extracts
# (info_search returns nan when a pattern yields nothing).
# NOTE(review): `text` must already hold the text to analyse; confirm which
# earlier cell assigns it, since this cell relies on the current kernel state.
# The loop variable also rebinds the global name `pattern`.
for pattern in [pattern1,pattern2,pattern3,pattern4,pattern5,pattern6,pattern6b,pattern7]:
    print(info_search(text,pattern))

nan
nan
nan
nan
nan
nan
['problem not solved', 'speedometer is not working']
nan
