In [1]:
claim = """A decoding system, comprising:

a decoding engine running on a mobile device, the decoding engine in operation decoding signals produced from a read of a buyer's financial transaction card, the decoding engine in operation accepting and initializing incoming signals from the read of the buyer's financial transaction card until the signals reach a steady state, detecting the read of the buyer's financial transaction card once the incoming signals are in a steady state, identifying peaks in the incoming signals and digitizing the identified peaks in the incoming signals into bits;

and

a transaction engine running on the mobile device and coupled to the decoding engine, the transaction engine in operation receiving as its input decoded buyer's financial transaction card information from the decoding engine and serving as an intermediary between the buyer and a merchant, so that the buyer does not have to share his/her financial transaction card information with the merchant."""

In [2]:
import nltk

In [3]:
text = nltk.word_tokenize(claim)

In [4]:
print(text, end="")

['A', 'decoding', 'system', ',', 'comprising', ':', 'a', 'decoding', 'engine', 'running', 'on', 'a', 'mobile', 'device', ',', 'the', 'decoding', 'engine', 'in', 'operation', 'decoding', 'signals', 'produced', 'from', 'a', 'read', 'of', 'a', 'buyer', "'s", 'financial', 'transaction', 'card', ',', 'the', 'decoding', 'engine', 'in', 'operation', 'accepting', 'and', 'initializing', 'incoming', 'signals', 'from', 'the', 'read', 'of', 'the', 'buyer', "'s", 'financial', 'transaction', 'card', 'until', 'the', 'signals', 'reach', 'a', 'steady', 'state', ',', 'detecting', 'the', 'read', 'of', 'the', 'buyer', "'s", 'financial', 'transaction', 'card', 'once', 'the', 'incoming', 'signals', 'are', 'in', 'a', 'steady', 'state', ',', 'identifying', 'peaks', 'in', 'the', 'incoming', 'signals', 'and', 'digitizing', 'the', 'identified', 'peaks', 'in', 'the', 'incoming', 'signals', 'into', 'bits', ';', 'and', 'a', 'transaction', 'engine', 'running', 'on', 'the', 'mobile', 'device', 'and', 'coupled', 'to',

In [5]:
# Tag every token with its part of speech; `pos` is a list of (word, tag) tuples.
# This needs the perceptron POS tagger model: download it once from "Models"
# via nltk.download() before running this cell.
pos = nltk.pos_tag(text)

In [6]:
print(pos, end="")

[('A', 'DT'), ('decoding', 'NN'), ('system', 'NN'), (',', ','), ('comprising', 'VBG'), (':', ':'), ('a', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('running', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('mobile', 'JJ'), ('device', 'NN'), (',', ','), ('the', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('in', 'IN'), ('operation', 'NN'), ('decoding', 'NN'), ('signals', 'NNS'), ('produced', 'VBD'), ('from', 'IN'), ('a', 'DT'), ('read', 'NN'), ('of', 'IN'), ('a', 'DT'), ('buyer', 'NN'), ("'s", 'POS'), ('financial', 'JJ'), ('transaction', 'NN'), ('card', 'NN'), (',', ','), ('the', 'DT'), ('decoding', 'VBG'), ('engine', 'NN'), ('in', 'IN'), ('operation', 'NN'), ('accepting', 'NN'), ('and', 'CC'), ('initializing', 'VBG'), ('incoming', 'VBG'), ('signals', 'NNS'), ('from', 'IN'), ('the', 'DT'), ('read', 'NN'), ('of', 'IN'), ('the', 'DT'), ('buyer', 'NN'), ("'s", 'POS'), ('financial', 'JJ'), ('transaction', 'NN'), ('card', 'NN'), ('until', 'IN'), ('the', 'DT'), ('signals', 'NNS'), ('reach', 'VBP'),

In [7]:
# Keep only the words whose POS tag contains "NN" (the noun tags, e.g. NN, NNS).
# Nothing is printed here; the list is displayed in the next cell.
nouns = [token for token, tag in pos if "NN" in tag]

In [8]:
print(nouns, end="")

['decoding', 'system', 'engine', 'device', 'engine', 'operation', 'decoding', 'signals', 'read', 'buyer', 'transaction', 'card', 'engine', 'operation', 'accepting', 'signals', 'read', 'buyer', 'transaction', 'card', 'signals', 'state', 'read', 'buyer', 'transaction', 'card', 'incoming', 'signals', 'state', 'peaks', 'incoming', 'signals', 'peaks', 'incoming', 'signals', 'bits', 'transaction', 'engine', 'device', 'decoding', 'engine', 'transaction', 'engine', 'operation', 'receiving', 'input', 'buyer', 'transaction', 'card', 'information', 'engine', 'intermediary', 'buyer', 'merchant', 'buyer', 'share', 'transaction', 'card', 'information', 'merchant']

---
Playing with SpaCy to perform NLP

In [9]:
#import spacy

#from spacy.en import English
#parser = English()
# NOTE: running the code above crashed and restarted the kernel, so the spaCy experiment stays commented out.

Back to the NLTK...
***

Now we try some chunking using Regular Expressions.

In [10]:
# Chunk grammar for nltk.RegexpParser. Each line names a chunk label and gives a
# tag pattern over the POS tags in `pos`:
#   NP - noun phrase, P - preposition, V - verb,
#   PP - prepositional phrase, VP - verb phrase.
# PP and VP are built from the P, V and NP chunks defined on the lines above
# them, so the order of the rules matters.
# NOTE(review): every element of the NP pattern is optional, and <NN> matches
# only the exact tag NN, so plural nouns (NNS) are not covered by this rule.
# The later grammars in this notebook use <NN.*>+ instead.
grammar = '''
    NP: {<DT>? <JJ>* <NN>*} # NP
    P: {<IN>}           # Preposition
    V: {<V.*>}          # Verb
    PP: {<P> <NP>}      # PP -> P NP
    VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
'''
# Compile the grammar into a chunk parser.
cp = nltk.RegexpParser(grammar)

In [13]:
result = cp.parse(pos)

In [14]:
result.draw()

Not bad for an out-of-the-box attempt. We need some tweaks to recognise verbs that are used as adjectives (e.g. "decoding" in "a decoding engine"). Maybe we can add a new noun-phrase rule such as `{<DT>+ <JJ>* <V>+ <NN>+}`...

In [34]:
# Second attempt: chunk noun phrases only, using two alternative NP patterns.
#   1. one or more determiners, an optional gerund (VBG), then one or more nouns
#      - this is what lets "a decoding engine" become a single NP;
#   2. optional determiner, any adjectives, then one or more nouns of any kind.
grammar = '''
    NP: {<DT>+ <VBG>? <NN.?>+} 
        {<DT>? <JJ>* <NN.*>+} # NP
'''

# Rebuild the parser, chunk the tagged claim and show the resulting tree.
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos)
# NOTE(review): draw() renders the tree graphically - presumably in a separate
# window, so it needs a display; confirm before running headless.
result.draw()

This works better and picks up "a decoding engine" as a noun phrase. We also need to add a rule to identify "the buyer's financial transaction card" as a noun phrase. Currently the presence of the possessive "'s" (tagged `POS`) is throwing things off.

In [74]:
# Third attempt: three alternative NP patterns.
#   1. determiner or possessive pronoun, a gerund, then nouns
#      ("the decoding engine");
#   2. determiner or possessive pronoun, a noun, the possessive marker (POS),
#      optional adjectives, then nouns ("the buyer 's financial transaction card");
#   3. optional determiner or possessive pronoun, any adjectives, then nouns.
# The tag for possessive pronouns is PRP$, so the "$" is escaped as "\$" in the
# tag pattern. The grammar is written as a raw string so that "\$" reaches NLTK
# unchanged without being an invalid Python escape sequence (which newer Python
# versions warn about); the resulting string value is identical to before.
# NOTE(review): rule 2 requires a leading determiner or possessive pronoun, which
# looks like the reason "buyer" and "financial transaction card information" come
# out as separate phrases in the printed output further down - confirm before
# relaxing it.
grammar = r'''
    NP: {<DT|PRP\$> <VBG> <NN.*>+} 
        {<DT|PRP\$> <NN.*> <POS> <JJ>* <NN.*>+}
        {<DT|PRP\$>? <JJ>* <NN.*>+ }
'''

# Rebuild the parser and chunk the tagged claim again.
cp = nltk.RegexpParser(grammar)
result = cp.parse(pos)
# Drawing is disabled here; the tree is printed as text in the next cell.
#result.draw()

In [75]:
print(result)

(S
  (NP A/DT decoding/NN system/NN)
  ,/,
  comprising/VBG
  :/:
  (NP a/DT decoding/VBG engine/NN)
  running/VBG
  on/IN
  (NP a/DT mobile/JJ device/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN decoding/NN signals/NNS)
  produced/VBD
  from/IN
  (NP a/DT read/NN)
  of/IN
  (NP a/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN accepting/NN)
  and/CC
  initializing/VBG
  incoming/VBG
  (NP signals/NNS)
  from/IN
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  until/IN
  (NP the/DT signals/NNS)
  reach/VBP
  (NP a/DT steady/JJ state/NN)
  ,/,
  detecting/VBG
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  once/IN
  (NP the/DT incoming/NN signals/NNS)
  are/VBP
  in/IN
  (NP a/DT steady/JJ state/NN)
  ,/,
  identifying/VBG
  (NP peaks/NNS)
  in/IN
  (NP the/DT incoming/NN signals/NNS)

In [76]:
result.draw()

Now we define a tree traversal function to print all the noun phrases.

In [77]:
result.label()

'S'

In [139]:
def traverse(t, np=False):
    """Print every noun-phrase chunk of a chunked tree, one phrase per line.

    Args:
        t: Either a tree node exposing ``label()`` and iteration over its
            children (such as the result of ``cp.parse(pos)``), or a leaf
            such as the ``('A', 'DT')`` tuple, whose first item is the word.
        np: True when ``t`` is being visited from inside an "NP" chunk;
            only then are leaf words printed.

    Returns:
        None. The words of each NP chunk are written to stdout, each word
        followed by a space and each chunk terminated by a newline.
    """
    try:
        tag = t.label()
    except AttributeError:
        # No label() method: this is a leaf, e.g. ('A', 'DT').
        if np:
            print(t[0], end=" ")
        return

    # From here on t is a labelled tree node.
    if "S" in tag:
        # Sentence root: descend without printing.
        for node in t:
            traverse(node)
    elif "NP" in tag:
        # Noun phrase: print its words on one line, then end the line.
        for node in t:
            traverse(node, np=True)
        print("")

In [140]:
traverse(result)

A decoding system 
a decoding engine 
a mobile device 
the decoding engine 
operation decoding signals 
a read 
a buyer 's financial transaction card 
the decoding engine 
operation accepting 
signals 
the read 
the buyer 's financial transaction card 
the signals 
a steady state 
the read 
the buyer 's financial transaction card 
the incoming signals 
a steady state 
peaks 
the incoming signals 
the identified peaks 
the incoming signals 
bits 
a transaction engine 
the mobile device 
the decoding engine 
the transaction engine 
operation receiving 
its input 
buyer 
financial transaction card information 
the decoding engine 
an intermediary 
the buyer 
a merchant 
the buyer 
share 
his/her financial transaction card information 
the merchant 


It would be good to get the unique entities in these noun phrases. We can ignore the determiner (although it could be used to check antecedents, e.g. that "the decoding engine" was first introduced as "a decoding engine"). A good grammar parser should do this: if a noun phrase is repeated, that could be used to avoid ambiguity.

We can adapt our traversal function:

In [None]:
 n      def traverse(t, np=False):
    try:
        t.label()
    except AttributeError:
        # This is then a tuple e.g. ('A', 'DT') which is a leaf of the tree
        if np:
            print(t[0], end=" ")
    else:
        # Now we know that t.node is defined
        if "S" in t.label():
            for child in t:
                traverse(child)
        elif "NP" in t.label():
            for child in t:
                traverse(child, np=True)
            print("")

In [130]:
result[0][0]

('A', 'DT')

In [133]:
result[0][0]

('A', 'DT')

In [None]:
" ".join([])

In [116]:
result.pprint()

(S
  (NP A/DT decoding/NN system/NN)
  ,/,
  comprising/VBG
  :/:
  (NP a/DT decoding/VBG engine/NN)
  running/VBG
  on/IN
  (NP a/DT mobile/JJ device/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN decoding/NN signals/NNS)
  produced/VBD
  from/IN
  (NP a/DT read/NN)
  of/IN
  (NP a/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  ,/,
  (NP the/DT decoding/VBG engine/NN)
  in/IN
  (NP operation/NN accepting/NN)
  and/CC
  initializing/VBG
  incoming/VBG
  (NP signals/NNS)
  from/IN
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  until/IN
  (NP the/DT signals/NNS)
  reach/VBP
  (NP a/DT steady/JJ state/NN)
  ,/,
  detecting/VBG
  (NP the/DT read/NN)
  of/IN
  (NP the/DT buyer/NN 's/POS financial/JJ transaction/NN card/NN)
  once/IN
  (NP the/DT incoming/NN signals/NNS)
  are/VBP
  in/IN
  (NP a/DT steady/JJ state/NN)
  ,/,
  identifying/VBG
  (NP peaks/NNS)
  in/IN
  (NP the/DT incoming/NN signals/NNS)