In [1]:
import spacy

In [3]:
# Load the 'en_core_web_sm' pipeline by name; the model package must already be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')

In [4]:
# Sample sentence that is run through the pipeline in the next cell.
text = 'Apple is looking to buy a UK startup for $1 billion'

In [5]:
# Run the pipeline on the sample text; `doc` is what the following sections inspect.
doc = nlp(text)

## Tokenization

In [7]:
# Iterating over `doc` yields its tokens in order; show the raw text of each one.
for tok in doc:
    print(tok.text)

Apple
is
looking
to
buy
a
UK
startup
for
$
1
billion


## Part of Speech (POS) tagging

In [10]:
# Print each token's text left-aligned in a 15-character column, followed by
# its part-of-speech tag (`token.pos_`).
for token in doc:
    # A literal width in the format spec is equivalent to the nested `{15}` field.
    print(f'{token.text:15} {token.pos_}')

Apple           PROPN
is              AUX
looking         VERB
to              PART
buy             VERB
a               DET
UK              PROPN
startup         NOUN
for             ADP
$               SYM
1               NUM
billion         NUM


## Visualization

In [11]:
from spacy import displacy

In [13]:
# Render the dependency parse of `doc` with displaCy, using a word distance
# of 100 and compact mode.
displacy.render(
    doc,
    style='dep',
    options={'distance': 100, 'compact': True},
)

## Named Entity Recognition 

In [15]:
# List the named entities found in the text: `doc.ents` yields one entry per
# entity (which may cover several tokens, e.g. "$1 billion"), not one per token.
for entity in doc.ents:
    print(entity.text, entity.label_)

Apple ORG
UK GPE
$1 billion MONEY


In [17]:
# Show the same named entities visually, using displaCy's 'ent' style.
displacy.render(doc, style='ent')

## Sentence Segmentation

In [20]:
# Rebind `text` and `doc` to a two-sentence example for the segmentation demo;
# this overwrites the single-sentence `doc` used by the earlier cells.
text = 'Apple is looking to buy a UK startup for $1 billion. Government has approved the transaction.'
doc = nlp(text)

In [22]:
# `doc.sents` yields one span per detected sentence; printing a span shows its text.
for sentence in doc.sents:
    print(sentence)

Apple is looking to buy a UK startup for $1 billion.
Government has approved the transaction.


## Rule-based Matcher

In [24]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [25]:
# Sample containing the greeting twice: capitalised with punctuation, then
# lowercase without punctuation.
text = 'Hello, World! hello world'

In [26]:
# Re-run the pipeline so that `doc` now holds the greeting sample.
doc = nlp(text)

In [37]:
# Token pattern, one dict per token position: "hello" compared on its
# lowercase form, then an optional punctuation token, then "world" compared
# on its lowercase form.
pattern = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True, 'OP': '?'},
    {'LOWER': 'world'},
]

In [38]:
# Build a matcher on the pipeline's vocabulary and register the pattern under
# the key 'hw'. Re-running this cell recreates the matcher from scratch.
matcher = Matcher(nlp.vocab)
matcher.add('hw', [pattern])

In [39]:
# Run the matcher over `doc` and keep the result for inspection below.
matches = matcher(doc)

In [40]:
# Display the raw result: a list of (match_id, start, end) tuples.
matches

[(17790654416186116455, 0, 3), (17790654416186116455, 4, 6)]

In [41]:
# For every match, look up the name behind the match id in the vocabulary's
# string store and show the text of the matched token range.
for match_id, start, end in matches:
    rule_name = nlp.vocab.strings[match_id]
    matched = doc[start:end]
    print(match_id, rule_name, matched.text)

17790654416186116455 hw Hello, World
17790654416186116455 hw hello world
