In [1]:
import os
from termcolor import colored
from texar.torch import HParams

from nlp.pipeline import Pipeline
from nlp.pipeline.data.ontology import conll03_ontology
from nlp.pipeline.data.ontology.conll03_ontology import Token, Sentence, EntityMention, PredicateLink
from nlp.pipeline.data.readers import StringReader
from nlp.pipeline.processors.impl import (
    NLTKWordTokenizer, NLTKSentenceSegmenter, NLTKPOSTagger, SRLPredictor, CoNLLNERPredictor)

# Create the pipeline here:

## In a pipeline, processors should follow a consistent ontology.

In [2]:
# Build the pipeline object, passing the imported conll03_ontology module as
# the ontology shared by the reader and processors attached to `pl` below.
pl = Pipeline(ontology=conll03_ontology)

# Set the reader of the pipeline

In [3]:
# Attach a freshly constructed StringReader as the pipeline's reader.
# NOTE(review): presumably this is what lets pl.process() accept a raw Python
# string later in the notebook -- confirm against StringReader's documentation.
pl.set_reader(StringReader())

# Add processors
## The processors can wrap any external tools. For example, we are wrapping some NLTK tools.

In [4]:
# Register the NLTK-backed processors in the order they are listed:
# sentence segmentation, then word tokenization, then POS tagging.
nltk_processor_classes = (
    NLTKSentenceSegmenter,
    NLTKWordTokenizer,
    NLTKPOSTagger,
)
for processor_cls in nltk_processor_classes:
    pl.add_processor(processor_cls())

# We now load our own NER predictor

In [5]:
# Location of the pickled resources for the NER model, relative to the
# notebook's working directory.
ner_resource_path = os.path.join('../NER_model', 'resources.pkl')

# Override only 'storage_path'; CoNLLNERPredictor.default_hparams() is passed
# as the second (defaults) argument of HParams.
ner_configs = HParams({'storage_path': ner_resource_path},
                      CoNLLNERPredictor.default_hparams())

pl.add_processor(CoNLLNERPredictor(), ner_configs)

# And here is our SRL predictor

In [6]:
# Only the model directory is overridden; SRLPredictor.default_hparams() is
# passed as the second (defaults) argument of HParams.
srl_overrides = {'storage_path': '../SRL_model/'}
srl_configs = HParams(srl_overrides, SRLPredictor.default_hparams())

pl.add_processor(SRLPredictor(), srl_configs)

In [7]:
# Initialize every processor that was added to the pipeline above.
# NOTE(review): presumably this is where the NER/SRL resources under the
# configured 'storage_path' values get loaded -- confirm in the Pipeline source.
pl.initialize_processors()

# Our pipeline is ready, now let's try out some text snippets.

In [8]:
# Sample news snippets used to exercise the pipeline.
#
# Each text is assembled from adjacent string literals inside parentheses
# (implicit concatenation). This replaces backslash continuations, including a
# dangling one after the last fragment of `search_engine_text` that only parsed
# because a blank line happened to follow it.
search_engine_text = (
    "A Scottish firm is looking to attract web surfers with a search engine that reads out results."
    " Called Speegle, it has the look and feel of a normal search engine, with the added feature of being able to read"
    " out the results. Scottish speech technology firm CEC Systems launched the site in November. But experts have"
    " questioned whether talking search engines are of any real benefit to people with visual impairments. The"
    " Edinburgh-based firm CEC has married speech technology with ever-popular internet search. The ability to search is"
    " becoming increasingly crucial to surfers baffled by the huge amount of information available on the web."
)

win_medal_text = (
    "British hurdler Sarah Claxton is confident she can win her first major medal at next "
    "month's European Indoor Championships in Madrid. Claxton will see if her new training "
    "regime pays dividends at the European Indoors which take place on 5-6 March."
)

# Process this snippet with one simple command.

In [9]:
# Run the whole pipeline (reader plus every added processor) on the raw text.
# The returned `pack` is the object the following cells query via pack.get(...).
pack = pl.process(win_medal_text)

  self.num_layers, self.dropout, self.training, self.bidirectional)


# Now all the results are ready.
## We have added the results as "entries" into our data.
## Let's first take a look at the sentences.

In [10]:
# Print the text of every Sentence entry in the pack, one per output block.
sentence_label = colored("Sentence:", 'red')
for sent in pack.get(Sentence):  # pack.get yields the sentences in this pack
    print(sentence_label, sent.text, "\n")

[31mSentence:[0m British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid. 

[31mSentence:[0m Claxton will see if her new training regime pays dividends at the European Indoors which take place on 5-6 March. 



# We can access more fine-grained data in the sentences using our magical "get" function.
## Let's get all the tokens in the first sentence and print out their Part-of-Speech tags.

In [11]:
# Show (text, POS tag) pairs for the tokens of the first sentence only.
first_sentence = next(iter(pack.get(Sentence)), None)
if first_sentence is not None:
    tagged_tokens = []
    # Passing the sentence restricts the query to tokens within its span.
    for tok in pack.get(Token, first_sentence):
        tagged_tokens.append((tok.text, tok.pos_tag))
    print(colored("Tokens:", 'red'), tagged_tokens, "\n")

[31mTokens:[0m [('British', 'JJ'), ('hurdler', 'NN'), ('Sarah', 'NNP'), ('Claxton', 'NNP'), ('is', 'VBZ'), ('confident', 'JJ'), ('she', 'PRP'), ('can', 'MD'), ('win', 'VB'), ('her', 'PRP$'), ('first', 'JJ'), ('major', 'JJ'), ('medal', 'NN'), ('at', 'IN'), ('next', 'JJ'), ('month', 'NN'), ("'s", 'POS'), ('European', 'JJ'), ('Indoor', 'NNP'), ('Championships', 'NNP'), ('in', 'IN'), ('Madrid', 'NNP'), ('.', '.')] 



## Similarly, we can get all the named entities in the first sentence. Let's look at their types.

In [12]:
# List each entity mention of the first sentence together with its NER type.
first_sentence = next(iter(pack.get(Sentence)), None)
if first_sentence is not None:
    for mention in pack.get(EntityMention, first_sentence):
        mention_label = colored("EntityMention:", 'red')
        type_label = colored(mention.ner_type, 'blue')
        print(mention_label, mention.text, 'has type', type_label, "\n")

[31mEntityMention:[0m British has type [34mMISC[0m 

[31mEntityMention:[0m Sarah Claxton has type [34mPER[0m 

[31mEntityMention:[0m European Indoor Championships has type [34mMISC[0m 

[31mEntityMention:[0m Madrid has type [34mLOC[0m 



## With this simple "get" function we can do a lot more. Let's see how one can play with semantic role labeling and NER at the same time.

In [13]:
# For the first sentence only: print every predicate link and the entity
# mentions found inside the span of the link's argument (child).
first_sentence = next(iter(pack.get(Sentence)), None)
if first_sentence is not None:
    print(colored("Semantic role labels:", 'red'))
    # All predicate links located within this sentence.
    for srl_link in pack.get(PredicateLink, first_sentence):
        predicate = srl_link.get_parent()
        argument = srl_link.get_child()
        print(f"  - \"{argument.text}\" is role {srl_link.arg_type} of predicate \"{predicate.text}\"")
        # Entity mentions within the span of the predicate argument.
        nested_entities = []
        for mention in pack.get(EntityMention, argument):
            nested_entities.append(mention.text)
        print("      Has entities:", nested_entities, "\n")

[31mSemantic role labels:[0m
  - "at next month's European Indoor Championships in Madrid" is role AM-LOC of predicate "win"
      Has entities: ['European Indoor Championships', 'Madrid'] 

  - "can" is role AM-MOD of predicate "win"
      Has entities: [] 

  - "she" is role A0 of predicate "win"
      Has entities: [] 

  - "her first major medal" is role A1 of predicate "win"
      Has entities: [] 



In [14]:
# For every sentence, print each entity mention followed by the tokens that
# fall inside that mention's span.
mention_label = colored("EntityMention:", 'red')
for sent in pack.get(Sentence):
    for mention in pack.get(EntityMention, sent):
        print(mention_label, mention.text)
        mention_tokens = []
        for tok in pack.get(Token, mention):
            mention_tokens.append(tok.text)
        print("    Has tokens:", mention_tokens, "\n")

[31mEntityMention:[0m British
    Has tokens: ['British'] 

[31mEntityMention:[0m Sarah Claxton
    Has tokens: ['Sarah', 'Claxton'] 

[31mEntityMention:[0m European Indoor Championships
    Has tokens: ['European', 'Indoor', 'Championships'] 

[31mEntityMention:[0m Madrid
    Has tokens: ['Madrid'] 

[31mEntityMention:[0m Claxton
    Has tokens: ['Claxton'] 

[31mEntityMention:[0m European Indoors
    Has tokens: ['European', 'Indoors'] 

