In [1]:
import re 
import string 
import nltk
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

pd.set_option('display.max_colwidth', 200)

In [2]:
# load spaCy model
nlp = spacy.load("en_core_web_sm")

In [3]:
# sample text 
IFRS_STD_15 = {
    9: {
        "category": "Recognition",
        "standards": """
                        An entity shall account for a contract with a customer that is within the
                        scope of this Standard only when all of the following criteria are met:
                        (a) the parties to the contract have approved the contract (in writing,
                        orally or in accordance with other customary business practices)
                        and are committed to perform their respective obligations;
                        
                        (b) the entity can identify each party’s rights regarding the goods or
                        services to be transferred;
                        
                        (c) the entity can identify the payment terms for the goods or services
                        to be transferred;
                        
                        (d) the contract has commercial substance (ie the risk, timing or
                        amount of the entity’s future cash flows is expected to change as a
                        result of the contract); and
                        
                        (e) it is probable that the entity will collect the consideration to which
                        it will be entitled in exchange for the goods or services that will be
                        transferred to the customer. In evaluating whether collectability of
                        an amount of consideration is probable, an entity shall consider
                        only the customer’s ability and intention to pay that amount of
                        consideration when it is due. The amount of consideration to which
                        the entity will be entitled may be less than the price stated in the
                        contract if the consideration is variable because the entity may offer
                        the customer a price concession (see paragraph 52).                        
                    """
    },
    22: {
            "category": "Identifying performance obligations",
            "standards": """
                            At contract inception, an entity shall assess the goods or services promised
                            in a contract with a customer and shall identify as a performance
                            obligation each promise to transfer to the customer either:
                            
                            (a) a good or service (or a bundle of goods or services) that is distinct; or
                            
                            (b) a series of distinct goods or services that are substantially the same
                            and that have the same pattern of transfer to the customer (see paragraph 23).
                        """
    },
    23: {
            "category": "Identifying performance obligations",
            "standards": """
                            A series of distinct goods or services has the same pattern of transfer to
                            the customer if both of the following criteria are met:
                            
                            (a) each distinct good or service in the series that the entity promises to
                            transfer to the customer would meet the criteria in paragraph 35 to be
                            a performance obligation satisfied over time; and
                            
                            (b) in accordance with paragraphs 39–40, the same method would be used
                            to measure the entity’s progress towards complete satisfaction of the
                            performance obligation to transfer each distinct good or service in the
                            series to the customer.            
                        """
    },
    24: {
            "category": "Promises in contracts with customers",
            "standards": """
                            A contract with a customer generally explicitly states the goods or services
                            that an entity promises to transfer to a customer. However, the performance
                            obligations identified in a contract with a customer may not be limited to the
                            goods or services that are explicitly stated in that contract. This is because a
                            contract with a customer may also include promises that are implied by an
                            entity’s customary business practices, published policies or specific statements
                            if, at the time of entering into the contract, those promises create a valid
                            expectation of the customer that the entity will transfer a good or service to
                            the customer.
                        """
    },
    27: {
            "category": "Distinct goods or services",
            "standards": """
                            A good or service that is promised to a customer is distinct if both of the
                            following criteria are met:
                            
                            (a) the customer can benefit from the good or service either on its own or
                            together with other resources that are readily available to the
                            customer (ie the good or service is capable of being distinct); and
                            
                            (b) the entity’s promise to transfer the good or service to the customer is
                            separately identifiable from other promises in the contract (ie the
                            promise to transfer the good or service is distinct within the context of
                            the contract).
                        """
    },
    31: {
            "category": "Satisfaction of performance obligations",
            "standards": """
                            An entity shall recognise revenue when (or as) the entity satisfies a
                            performance obligation by transferring a promised good or service (ie an
                            asset) to a customer. An asset is transferred when (or as) the customer
                            obtains control of that asset.
                        """
    },
    35: {
            "category": "Satisfaction of performance obligations", 
            "standards": """
                            An entity transfers control of a good or service over time and, therefore,
                            satisfies a performance obligation and recognises revenue over time, if one of the following criteria is met:
                            
                            (a) the customer simultaneously receives and consumes the benefits provided by the entity’s performance as the entity performs.
                            
                            (b) the entity’s performance creates or enhances an asset that the customer controls as the asset is created or enhanced.
                            
                            (c) the entity’s performance does not create an asset with an alternative use to the entity and the entity has an enforceable
                            right to payment for performance completed to date.
                        """
    }
}

# create a spaCy object 
doc = nlp(IFRS_STD_15[35]['standards'])
print(doc)


                            An entity transfers control of a good or service over time and, therefore,
                            satisfies a performance obligation and recognises revenue over time, if one of the following criteria is met:
                            
                            (a) the customer simultaneously receives and consumes the benefits provided by the entity’s performance as the entity performs.
                            
                            (b) the entity’s performance creates or enhances an asset that the customer controls as the asset is created or enhanced.
                            
                            (c) the entity’s performance does not create an asset with an alternative use to the entity and the entity has an enforceable
                            right to payment for performance completed to date.
                        


In [4]:
# print token, dependency, POS tag 
for tok in doc:
    if (tok.pos_ == "NOUN"):
        print(tok.text, "-->", tok.pos_)

entity --> NOUN
transfers --> NOUN
control --> NOUN
good --> NOUN
service --> NOUN
time --> NOUN
performance --> NOUN
obligation --> NOUN
revenue --> NOUN
time --> NOUN
criteria --> NOUN
customer --> NOUN
benefits --> NOUN
entity --> NOUN
’s --> NOUN
performance --> NOUN
entity --> NOUN
entity --> NOUN
’s --> NOUN
performance --> NOUN
asset --> NOUN
customer --> NOUN
asset --> NOUN
entity --> NOUN
’s --> NOUN
performance --> NOUN
asset --> NOUN
use --> NOUN
entity --> NOUN
entity --> NOUN
right --> NOUN
payment --> NOUN
performance --> NOUN
date --> NOUN


In [5]:
spacy.explain("GPE")

'Countries, cities, states'

In [6]:
spacy.explain("NNP")

'noun, proper singular'

In [7]:
spacy.explain("cc")

'coordinating conjunction'

In [8]:
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

In [9]:
# print token, dependency, POS tag 
for tok in doc:
    print(tok.text, "-->", tok.dep_, "-->", tok.pos_)


                             --> dep --> SPACE
An --> det --> DET
entity --> compound --> NOUN
transfers --> compound --> NOUN
control --> nsubj --> NOUN
of --> prep --> ADP
a --> det --> DET
good --> pobj --> NOUN
or --> cc --> CCONJ
service --> conj --> NOUN
over --> prep --> ADP
time --> pobj --> NOUN
and --> cc --> CCONJ
, --> punct --> PUNCT
therefore --> advmod --> ADV
, --> punct --> PUNCT

                             --> dep --> SPACE
satisfies --> conj --> VERB
a --> det --> DET
performance --> compound --> NOUN
obligation --> dobj --> NOUN
and --> cc --> CCONJ
recognises --> conj --> VERB
revenue --> dobj --> NOUN
over --> prep --> ADP
time --> pobj --> NOUN
, --> punct --> PUNCT
if --> mark --> SCONJ
one --> nsubjpass --> NUM
of --> prep --> ADP
the --> det --> DET
following --> amod --> VERB
criteria --> pobj --> NOUN
is --> auxpass --> AUX
met --> advcl --> VERB
: --> punct --> PUNCT

                            
                             --> dep --> SPACE
( --> punct

In [10]:
#define the pattern 
patterns = [[{'POS':'NOUN'}, 
           {'LOWER': 'such'}, 
           {'LOWER': 'as'}, 
           {'POS': 'PROPN'} #proper noun
          ], 
           # [{'DEP':'amod', 'OP':"?"}, # adjectival modifier Note: The key ‘OP’: ‘?’ in the pattern above means that the modifier (‘amod’) can occur once or not at all.
           # {'POS':'NOUN'},
           # {'LOWER': 'such'},
           # {'LOWER': 'as'},
           # {'POS': 'PROPN'}]
           ]

In [11]:
# Matcher class object 
matcher = Matcher(nlp.vocab) 
matcher.add("matching_1", patterns) 

matches = matcher(doc) 
span = doc[matches[0][1]:matches[0][2]]

print(span.text)

IndexError: list index out of range

In [12]:
##another

In [46]:
from spacy.matcher import Matcher

In [47]:
nine_a = """the parties to the contract have approved the contract (in writing, orally or in accordance with other customary business practices) and are committed to perform their respective obligations."""
nine_b = "the entity can identify each party’s rights regarding the goods or services to be transferred."
nine_c = "the entity can identify the payment terms for the goods or services to be transferred."
nine_d = """the contract has commercial substance (ie the risk, timing or amount of the entity’s future cash flows is expected to change as a result of the contract)."""

In [48]:
def clean_string(raw_string):
    """
    clean_string: 
    input_params:
        raw_string: 
    output_params:
        Returns
    """
    # Find all the round brackets substring indexes
#     pattern = '\((.*?)\)'
#     bracket_idxs = [(m.start(0)+1, m.end(0)-1) for m in re.finditer(pattern, raw_string)]
#     print(bracket_idxs)
    
#     nlp = spacy.load("en_core_web_sm")
#     doc = nlp(raw_string)
#     for np in doc.noun_chunks:
#         print(np.start, np.end, np.text)
#     return 1

    nlp = spacy.load("en_core_web_sm")
    matcher = Matcher(nlp.vocab)
    # Add match ID "HelloWorld" with no callback and one pattern
    pattern = [{"POS": "DET",  "OP": "?"}, {"POS": "NOUN"}, {"TEXT": '(', }, {"TEXT": {"NOT_IN": [")"]}, "OP": "*"}, {"TEXT": ')'}]
    matcher.add("FindSubstringInBrackets", [pattern])

    doc = nlp(raw_string)
    matches = matcher(doc)
    high = 0
    for match_id, start, end in matches:
        if (high == 0) or (start >= high):
            string_id = nlp.vocab.strings[match_id]  # Get string representation
            span = doc[start:end]  # The matched span
            noun_chunks = list(span.noun_chunks)            
            print(start, end, span.text)
            print('noun_chunks', noun_chunks)
            high = end

In [49]:
clean_string(nine_a)
clean_string(nine_b)
clean_string(nine_c)
clean_string(nine_d)    

7 23 the contract (in writing, orally or in accordance with other customary business practices)
noun_chunks [the contract, writing, accordance, other customary business practices]
4 31 substance (ie the risk, timing or amount of the entity’s future cash flows is expected to change as a result of the contract)
noun_chunks [ie the risk, timing, amount, the entity’s future cash flows, a result, the contract]


In [50]:
pattern = '\((.*?)\)'
bracket_idxs = [(m.start(0)+1, m.end(0)-1) for m in re.finditer(pattern, nine_a)]
print(bracket_idxs)

[(56, 131)]


In [51]:
nine_a_rm_brac = nine_a[:bracket_idxs[0][0]-2] + nine_a[bracket_idxs[0][1]+1:]
print(nine_a_rm_brac)

the parties to the contract have approved the contract and are committed to perform their respective obligations.


In [52]:
doc = nlp(nine_a_rm_brac)
print(doc)

the parties to the contract have approved the contract and are committed to perform their respective obligations.


In [53]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_ == 'nsubj':
        print("SUB:" , tok.text, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.dep_, "-->", tok.pos_)        
    else:
        pass
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: parties --> nsubj --> NOUN
OBJ: contract --> pobj --> NOUN
OBJ: contract --> dobj --> NOUN
OBJ: obligations --> dobj --> NOUN


In [54]:
def merge_phrases(doc):
    with doc.retokenize() as retokenizer:
        for np in list(doc.noun_chunks):
            attrs = {
                "tag": np.root.tag_,
                "lemma": np.root.lemma_,
                "ent_type": np.root.ent_type_,
            }
            retokenizer.merge(np, attrs=attrs)
    return doc

In [55]:
def merge_punct(doc):
    spans = []
    for word in doc[:-1]:
        if word.is_punct or not word.nbor(1).is_punct:
            continue
        start = word.i
        end = word.i + 1
        while end < len(doc) and doc[end].is_punct:
            end += 1
        span = doc[start:end]
        spans.append((span, word.tag_, word.lemma_, word.ent_type_))
    with doc.retokenize() as retokenizer:
        for span, tag, lemma, ent_type in spans:
            attrs = {"tag": tag, "lemma": lemma, "ent_type": ent_type}
            retokenizer.merge(span, attrs=attrs)
    return doc

In [56]:
# Merge noun phrases into one token.
doc = merge_phrases(doc)
# Attach punctuation to tokens
doc = merge_punct(doc)

In [57]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_ == 'nsubj':
        print("SUB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.dep_ == 'agent':
        print("Agent:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
#     else:
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: the parties --> party --> nsubj --> NOUN
OBJ: the contract --> contract --> pobj --> NOUN
OBJ: the contract --> contract --> dobj --> NOUN
OBJ: their respective obligations. --> obligation --> dobj --> NOUN


In [80]:
doc = nlp("A body has been found by police.")
print(doc)

A body has been found by police.


In [59]:
# Merge noun phrases into one token.
doc = merge_phrases(doc)
# Attach punctuation to tokens
doc = merge_punct(doc)

In [60]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_.startswith('nsubj'):
        print("SUB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.dep_ == 'agent':
        print("Agent:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'VERB':
        print("VERB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
#     else:
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: A body --> body --> nsubjpass --> NOUN
VERB: found --> find --> ROOT --> VERB
Agent: by --> by --> agent --> ADP
OBJ: police. --> police --> pobj --> NOUN


In [77]:
doc = nlp("Fallujah is an Iraqi city.")
print(doc)
# Merge noun phrases into one token.
# doc = merge_phrases(doc)
# Attach punctuation to tokens
# doc = merge_punct(doc)

Fallujah is an Iraqi city.


In [62]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_.startswith('nsubj'):
        print("SUB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.dep_ == 'agent':
        print("Agent:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'VERB':
        print("VERB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.pos_ == 'AUX':
        print("AUX:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'NOUN':
        print("NOUN:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
#     else:
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: Fallujah --> Fallujah --> nsubj --> PROPN
AUX: is --> be --> ROOT --> AUX
NOUN: city --> city --> attr --> NOUN


In [72]:
doc = nlp("Ghazi al-Yawar is new president of Iraq.")
print(doc)
# Merge noun phrases into one token.
# doc = merge_phrases(doc)
# Attach punctuation to tokens
# doc = merge_punct(doc)

Ghazi al-Yawar is new president of Iraq.


In [64]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_.startswith('nsubj'):
        print("SUB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.dep_ == 'agent':
        print("Agent:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'VERB':
        print("VERB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.pos_ == 'AUX':
        print("AUX:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'NOUN':
        print("NOUN:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'ADP':
        print("ADP:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
#     else:
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: Yawar --> Yawar --> nsubj --> PROPN
AUX: is --> be --> ROOT --> AUX
NOUN: president --> president --> attr --> NOUN
ADP: of --> of --> prep --> ADP
OBJ: Iraq --> Iraq --> pobj --> PROPN


In [65]:
doc = nlp("10,000 people in Africa died of Ebola.")
print(doc)
# Merge noun phrases into one token.
# doc = merge_phrases(doc)
# Attach punctuation to tokens
# doc = merge_punct(doc)

10,000 people in Africa died of Ebola.


In [66]:
# print token, dependency, POS tag 
for tok in doc:
    if tok.dep_.startswith('nsubj'):
        print("SUB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.dep_.endswith('obj'):
        print("OBJ:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.dep_ == 'agent':
        print("Agent:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'VERB':
        print("VERB:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)
    elif tok.pos_ == 'AUX':
        print("AUX:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'NOUN':
        print("NOUN:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
    elif tok.pos_ == 'ADP':
        print("ADP:" , tok.text, "-->", tok.lemma_, "-->", tok.dep_, "-->", tok.pos_)        
#     else:
#         print(tok.text, "-->", tok.dep_, "-->", tok.pos_)

SUB: people --> people --> nsubj --> NOUN
ADP: in --> in --> prep --> ADP
OBJ: Africa --> Africa --> pobj --> PROPN
VERB: died --> die --> ROOT --> VERB
ADP: of --> of --> prep --> ADP
OBJ: Ebola --> Ebola --> pobj --> PROPN


In [112]:
doc = nlp("10,000 people in Africa died of Ebola.")


In [113]:
from nltk import Tree
import json

In [114]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_

In [115]:
[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

     died                     
  ____|____________________    
 |         people          |  
 |     ______|______       |   
 |    |             in     of 
 |    |             |      |   
 .  10,000        Africa Ebola



[None]

In [144]:
def tok_format(tok):
    return "(".join([tok.orth_, tok.pos_]) + ")"


def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)

In [145]:
for sent in doc.sents:
    to_nltk_tree(sent.root).pretty_print(unicodelines=True, nodedist=3)

                                                                                         states(VERB)                                                                                                            
      ┌─────────────────┬─────────────┬──────────────────────┬────────────────────────────────┴───────────────────────────────────────────┐                                                                         
      │                 │             │                      │                                                                       goods(NOUN)                                                                 
      │                 │             │                      │                                ┌─────────────┬─────────────┬───────────────┴────────────────────────────┐                                            
      │                 │             │                      │                                │             │             │                               

In [127]:
[to_nltk_tree(sent.root).pretty_print(unicodelines=True, nodedist=3) for sent in doc.sents]

                               died(VERB)                
                   ┌───────────────┴──────────────┐         
              people(NOUN)                        │      
     ┌─────────────┴───────────────┐              │         
     │                          in(ADP)        of(ADP)   
     │                             │              │         
10,000(NUM)                  Africa(PROPN)   Ebola(PROPN)



[None]

In [126]:
# doc = nlp("the parties to the contract have approved in writing, orally or in accordance with other customary business practices and are committed to perform their respective obligations")
doc = nlp("10,000 people in Africa died of Ebola")

In [110]:
# tree = nltk.tree.Tree.fromstring()

In [103]:
import nltk
s = '(ROOT (S (NP (NNP Europe)) (VP (VBZ is) (PP (IN in) (NP (DT the) (JJ same) (NNS trends)))) (. .)))'
tree = nltk.tree.Tree.fromstring(s)
def traverse_tree(tree):
    print("tree:", tree)
    for subtree in tree:
        if type(subtree) == nltk.tree.Tree:
            traverse_tree(subtree)
traverse_tree(tree)

tree: (ROOT
  (S
    (NP (NNP Europe))
    (VP (VBZ is) (PP (IN in) (NP (DT the) (JJ same) (NNS trends))))
    (. .)))
tree: (S
  (NP (NNP Europe))
  (VP (VBZ is) (PP (IN in) (NP (DT the) (JJ same) (NNS trends))))
  (. .))
tree: (NP (NNP Europe))
tree: (NNP Europe)
tree: (VP (VBZ is) (PP (IN in) (NP (DT the) (JJ same) (NNS trends))))
tree: (VBZ is)
tree: (PP (IN in) (NP (DT the) (JJ same) (NNS trends)))
tree: (IN in)
tree: (NP (DT the) (JJ same) (NNS trends))
tree: (DT the)
tree: (JJ same)
tree: (NNS trends)
tree: (. .)


In [131]:
doc = nlp("A contract with a customer generally explicitly states the goods or services that an entity promises to transfer to a customer.")

In [133]:
[to_nltk_tree(sent.root).pretty_print(unicodelines=True, nodedist=1) for sent in doc.sents]

                                                                             states(VERB)                                                                                              
      ┌───────────────┬───────────┬──────────────────┬────────────────────────────┴─────────────────────────────────────┐                                                               
      │               │           │                  │                                                             goods(NOUN)                                                         
      │               │           │                  │                            ┌───────────┬───────────┬─────────────┴────────────────────────┐                                      
      │               │           │                  │                            │           │           │                                promises(VERB)                              
      │               │           │                  │                        

[None]

In [141]:
from nltk.draw.tree import TreeView
import os 
for sent in doc.sents:
    to_nltk_tree(sent.root).pretty_print(unicodelines=True, nodedist=1) 

                                                                             states(VERB)                                                                                              
      ┌───────────────┬───────────┬──────────────────┬────────────────────────────┴─────────────────────────────────────┐                                                               
      │               │           │                  │                                                             goods(NOUN)                                                         
      │               │           │                  │                            ┌───────────┬───────────┬─────────────┴────────────────────────┐                                      
      │               │           │                  │                            │           │           │                                promises(VERB)                              
      │               │           │                  │                        

In [151]:
import os
from nltk.draw.tree import TreeView

doc = nlp("My Name is Anuj!")
for sent in doc.sents:
    GPRAH = to_nltk_tree(sent.root)
    TreeView(GRAPH)._cframe.print_to_file('tree.ps')
    os.system('convert tree.ps tree.png')

None


In [1]:
text = """
An entity shall account for a contract with a customer that is within the scope of this Standard only when all of the following criteria are met:  the parties to the contract have approved the contract and are committed to perform their respective obligations; the entity can identify each party’s rights regarding the goods or services to be transferred; the entity can identify the payment terms for the goods or services to be transferred; the contract has commercial substance; and it is probable that the entity will collect the consideration to which it will be entitled in exchange for the goods or services that will be transferred to the customer. In evaluating whether collectability of an amount of consideration is probable, an entity shall consider only the customer’s ability and intention to pay that amount of consideration when it is due. The amount of consideration to which the entity will be entitled may be less than the price stated in the contract if the consideration is variable because the entity may offer the customer a price concession. At contract inception, an entity shall assess the goods or services promised in a contract with a customer and shall identify as a performance obligation each promise to transfer to the customer either: a good or service that is distinct; or a series of distinct goods or services that are substantially the same and that have the same pattern of transfer to the customer. A series of distinct goods or services has the same pattern of transfer to the customer if both of the following criteria are met: each distinct good or service in the series that the entity promises to transfer to the customer would meet the criteria to be a performance obligation satisfied over time; and in accordance, the same method would be used to measure the entity’s progress towards complete satisfaction of the performance obligation to transfer each distinct good or service in the series to the customer. A good or service that is promised to a customer is distinct if both of the following criteria are met: the customer can benefit from the good or service either on its own or together with other resources that are readily available to the customer; and the entity’s promise to transfer the good or service to the customer is separately identifiable from other promises in the contract. An entity transfers control of a good or service over time and, therefore, satisfies a performance obligation and recognises revenue over time, if one of the following criteria is met: the customer simultaneously receives and consumes the benefits provided by the entity’s performance as the entity performs; the entity’s performance creates or enhances an asset that the customer controls as the asset is created or enhanced; or the entity’s performance does not create an asset with an alternative use to the entity and the entity has an enforceable right to payment for performance completed to date.
"""

In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [6]:
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
patterns = [
    {"label": "ENTITY", "pattern": [{"LOWER": "entity"}]},
    {"label": "CONTRACT", "pattern": [{"LOWER": "contract"}]},
    {"label": "CUSTOMER", "pattern": [{"LOWER": "customer"}]},
    {"label": "PARTY", "pattern": [{"LOWER": "party"}]},
    {"label": "OBLIGATION", "pattern": [{"LOWER": "obligation"}]},
    {"label": "GOODS", "pattern": [{"LOWER": "goods"}]},
    {"label": "SERVICES", "pattern": [{"LOWER": "services"}]},
    {"label": "PAYMENT", "pattern": [{"LOWER": "payment"}]},
    {"label": "COMMERICIAL SUBSTANCE", "pattern": [{"LOWER": "commercial substance"}]},
    {"label": "PRICE", "pattern": [{"LOWER": "price"}]},
    {"label": "CONCESSION", "pattern": [{"LOWER": "concession"}]},
    {"label": "REVENUE", "pattern": [{"LOWER": "revenue"}]},
    {"label": "ASSET", "pattern": [{"LOWER": "asset"}]},
    {"label": "ENFORCEABLE", "pattern": [{"LOWER": "enforceable"}]},
    {"label": "PERFORMANCE", "pattern": [{"LOWER": "performance|"}]},
   ]

ruler.add_patterns(patterns)
doc = nlp(text)
print([(ent.text, ent.label_) for ent in doc.ents])

[('entity', 'ENTITY'), ('contract', 'CONTRACT'), ('customer', 'CUSTOMER'), ('contract', 'CONTRACT'), ('contract', 'CONTRACT'), ('entity', 'ENTITY'), ('party', 'PARTY'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('entity', 'ENTITY'), ('payment', 'PAYMENT'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('contract', 'CONTRACT'), ('entity', 'ENTITY'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('customer', 'CUSTOMER'), ('entity', 'ENTITY'), ('customer', 'CUSTOMER'), ('entity', 'ENTITY'), ('price', 'PRICE'), ('contract', 'CONTRACT'), ('entity', 'ENTITY'), ('customer', 'CUSTOMER'), ('price', 'PRICE'), ('concession', 'CONCESSION'), ('contract', 'CONTRACT'), ('entity', 'ENTITY'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('contract', 'CONTRACT'), ('customer', 'CUSTOMER'), ('obligation', 'OBLIGATION'), ('customer', 'CUSTOMER'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('customer', 'CUSTOMER'), ('goods', 'GOODS'), ('services', 'SERVICES'), ('customer', 'CUSTOMER'), ('entity', 'E