## Templates for Information Extraction Using spaCy
##### 1) Extracting named entities and their relationships (revenue, loss, currency, etc.)
##### 2) Decomposing sentences to find what each verb is about

In [1]:
from __future__ import unicode_literals, print_function

import plac
import spacy

"""
A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Compatible with: spaCy v2.0.0+
"""

def _drop_overlapping_spans(spans):
    """Return the spans that do not overlap, preferring longer ones.

    Spans are ranked longest first (ties go to the span that starts earlier);
    a span is kept only if neither its first nor its last token has already
    been claimed by a kept span. The result is ordered by start position.
    """
    ranked = sorted(spans, key=lambda s: (s.end - s.start, -s.start), reverse=True)
    kept = []
    claimed = set()
    for span in ranked:
        # `end` is exclusive, so the last token of the span is `end - 1`.
        if span.start not in claimed and span.end - 1 not in claimed:
            kept.append(span)
            claimed.update(range(span.start, span.end))
    return sorted(kept, key=lambda s: s.start)


def extract_currency_relations(doc):
    """Pair every MONEY token in `doc` with the token it refers to.

    Entities and noun chunks are first merged into single tokens (this
    modifies `doc` in place). Then, for each token whose entity type is
    MONEY:

    * if it is an ``attr`` or ``dobj``, it is paired with the first ``nsubj``
      found to the left of its head (skipped when there is none);
    * if it is a ``pobj`` whose head is a ``prep``, it is paired with the
      head of that preposition.

    Args:
        doc: a parsed spaCy ``Doc`` providing ``ents`` and ``noun_chunks``.

    Returns:
        A list of ``(related_token, money_token)`` tuples in document order.
    """
    spans = list(doc.ents) + list(doc.noun_chunks)
    if hasattr(doc, 'retokenize'):
        # `Span.merge()` is gone from current spaCy releases; the retokenizer
        # replaces it but rejects overlapping spans, and entities routinely
        # overlap noun chunks, so overlaps are filtered out first.
        with doc.retokenize() as retokenizer:
            for span in _drop_overlapping_spans(spans):
                retokenizer.merge(span)
    else:
        # Older spaCy without the retokenizer: merge every span directly.
        for span in spans:
            span.merge()

    relations = []
    for money in doc:
        if money.ent_type_ != 'MONEY':
            continue
        if money.dep_ in ('attr', 'dobj'):
            subjects = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations


In [2]:
# Sample sentences that mention monetary amounts.
TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
]

# Pretrained English pipeline (multi-task CNN trained on OntoNotes) that
# assigns POS tags, a dependency parse and named entities.
# NOTE(review): the description originally attached here also mentioned GloVe
# word vectors trained on Common Crawl; that wording appears to belong to the
# larger model packages rather than the "sm" one -- confirm against the
# model's release notes.
model = "en_core_web_sm"
nlp = spacy.load(model)

# For every sentence, print one report block per extracted relation. The
# sentence, the parsed doc and the full relation list are repeated in each
# block; line [4] shows the individual pair.
for sentence in TEXTS:
    parsed = nlp(sentence)
    found = extract_currency_relations(parsed)
    for related_token, money_token in found:
        components = '{:<10}\t    --    {}\t    --    {}'.format(
            related_token.text, money_token.ent_type_, money_token.text)
        print("[1]:  ", sentence)
        print("[2]:  ", parsed)
        print("[3]:  relationships: ", found)
        print("[4]:  relationships components: ", components)
        print("*" * 70)


# Step-by-step walkthrough of the same extraction logic on a single sentence,
# printing the intermediate values along the way.
text = 'Revenue exceeded twelve billion dollars, with a loss of $1b.'
doc = nlp(text)
print("[5]:  doc.ents: ",doc.ents, " -- doc.noun_chunks:", list(doc.noun_chunks))
spans = list(doc.ents) + list(doc.noun_chunks)

# Show the last collected span. This is printed before merging (so the span
# still points at valid tokens) and only when at least one span exists,
# instead of relying on a loop variable that is undefined for an empty list.
if spans:
    print("[6]:  span: ", spans[-1])

# Merge entities and noun chunks into single tokens.
if hasattr(spacy.util, "filter_spans"):
    # Current spaCy: `Span.merge()` no longer exists. The retokenizer rejects
    # overlapping spans, so overlaps between entities and noun chunks are
    # filtered out first.
    with doc.retokenize() as retokenizer:
        for span in spacy.util.filter_spans(spans):
            retokenizer.merge(span)
else:
    # Older spaCy releases that predate `filter_spans`.
    for span in spans:
        span.merge()

relations = []
for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
    if money.dep_ in ('attr', 'dobj'):
        # Amount is the attribute/object of a verb: pair it with the subject.
        subjects = [w for w in money.head.lefts if w.dep_ == 'nsubj']
        if subjects:
            subject = subjects[0]
            relations.append((subject, money))
            print("[7]:  money: ", money," -- subject: ", subject)
    elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
        # Amount is the object of a preposition: pair it with the
        # preposition's head.
        relations.append((money.head.head, money))
    # NOTE(review): this unconditional break means only the first MONEY token
    # is examined; remove it to trace every amount in the sentence.
    break

[1]:   Net income was $9.4 million compared to the prior year of $2.7 million.
[2]:   Net income was $9.4 million compared to the prior year of $2.7 million.
[3]:  relationships:  [(Net income, $9.4 million), (the prior year, $2.7 million)]
[4]:  relationships components:  Net income	    --    MONEY	    --    $9.4 million
**********************************************************************
[1]:   Net income was $9.4 million compared to the prior year of $2.7 million.
[2]:   Net income was $9.4 million compared to the prior year of $2.7 million.
[3]:  relationships:  [(Net income, $9.4 million), (the prior year, $2.7 million)]
[4]:  relationships components:  the prior year	    --    MONEY	    --    $2.7 million
**********************************************************************
[1]:   Revenue exceeded twelve billion dollars, with a loss of $1b.
[2]:   Revenue exceeded twelve billion dollars, with a loss of $1b.
[3]:  relationships:  [(Revenue, twelve billion dollars), (a loss, 1b)

In [3]:
model = 'en_core_web_sm'
# Bind the pipeline to `nlp`, the name used below. It was previously assigned
# to `lp`, so this cell only ran when an `nlp` object happened to be left over
# from an earlier cell.
nlp = spacy.load(model)

doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
          "understand language")

# To get the text governed by a word, find the head of the subtree and use
# one of the `.subtree`, `.children`, `.lefts` or `.rights` iterators.
# `.subtree` yields the head together with all of its descendants, so joining
# the tokens' `text_with_ws` reproduces the clause text.
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print("[1]:  ", word, " ---- :", ''.join(w.text_with_ws for w in word.subtree))
print("Another way to parse out the text for clausal complement is: ")
# Alternative: slice the doc from the subtree's leftmost token to its
# rightmost token (inclusive) to obtain the clause as a span.
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print("[2]:  ",subtree_span.text, '|', subtree_span.root.text)

[1]:   show  ---- : to show you how computers understand language
[1]:   understand  ---- : how computers understand language
Another way to parse out the text for clausal complement is: 
[2]:   to show you how computers understand language | show
[2]:   how computers understand language | understand
