In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
nlp

<spacy.lang.en.English at 0x78b0c1c0c8b0>



---



###The Doc Object for Processed Text

In [None]:
introduction_doc = nlp(
...     "This tutorial is about Natural Language Processing in spaCy."
... )
type(introduction_doc)
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.']

In [None]:
import pathlib
file_name = "/content/introduction.txt"
introduction_doc = nlp(pathlib.Path(file_name).read_text(encoding="utf-8"))
print ([token.text for token in introduction_doc])

['"', 'This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'spaCy', '.', '"']




---



###Sentence Detection

In [None]:
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

for sentence in sentences:
    print(f"{sentence[:5]}...")

Gus Proto is a Python...
He is interested in learning...


In [None]:
ellipsis_text = (
    "Gus, can you, ... never mind, I forgot"
    " what I was saying. So, do you think"
    " we should ..."
)

from spacy.language import Language
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Add support to use `...` as a delimiter for sentence detection"""
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i + 1].is_sent_start = True
    return doc


custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


* We used the @Language.component("set_custom_boundaries") decorator to define a new function that takes a Doc object as an argument. The job of this function is to identify tokens in Doc that are the beginning of sentences and mark their .is_sent_start attribute to True. Once done, the function must return the Doc object again



---



###Tokens in spaCy

Building the Doc container involves tokenizing the text. The process of tokenization breaks a text down into its basic units—or tokens—which are represented in spaCy as Token objects.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)

for token in about_doc:
    print (token, token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [None]:
print(f'{"Text with Whitespace":22}'
      f'{"Is Alphanumeric?":15}'
      f'{"Is Punctuation?":18}'
      f'{"Is Stop Word?"}'
)

for token in about_doc:
    print(
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):15}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    )

Text with Whitespace  Is Alphanumeric?Is Punctuation?   Is Stop Word?
Gus                   True           False             False
Proto                 True           False             False
is                    True           False             True
a                     True           False             True
Python                True           False             False
developer             True           False             False
currently             True           False             False
working               True           False             False
for                   True           False             True
a                     True           False             True
London                True           False             False
-                     False          True              False
based                 True           False             False
Fintech               True           False             False
company               True           False             False
.                  

* text_with_ws prints the token text along with any trailing space, if present
* is_alpha indicates whether the token consists of alphabetic characters or not
* is_punct indicates whether the token is a punctuation symbol or not
* is_stop indicates whether the token is a stop word or not

In [None]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

print([token.text for token in nlp(custom_about_text)[8:15]])

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


* As with many aspects of spaCy, we can also customize the tokenization process to detect tokens on custom characters. This is often used for hyphenated words such as London-based.

* To customize tokenization, we need to update the tokenizer property on the callable Language object with a new Tokenizer object.

* To see what's involved, imagine we had some text that used the @ symbol instead of the usual hyphen (-) as an infix to link words together. So, instead of London-based, we had London@based

* In this example, the default parsing read the London@based text as a single token, but if we had used a hyphen instead of the @ symbol, then we'd get three tokens.

* To include the @ symbol as a custom infix, we need to build our own Tokenizer object

In [None]:
import re
from spacy.tokenizer import Tokenizer

custom_nlp = spacy.load("en_core_web_sm")
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)
suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)

custom_infixes = [r"@"]

infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)

print([token.text for token in custom_tokenizer_about_doc[8:15]])

['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


###Stop Words

Stop words are typically defined as the most common words in a language. In the English language, some examples of stop words are the, are, but, and they. Most sentences need to contain stop words in order to be full sentences that make grammatical sense.

With NLP, stop words are generally removed because they aren’t significant, and they heavily distort any word frequency analysis. spaCy stores a list of stop words for the English language

In [None]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
    print(stop_word)

across
've
hers
beforehand
re
six
namely
is
’ve
we


In [None]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


###Lemmatization

Lemmatization is the process of reducing inflected forms of a word while still ensuring that the reduced form belongs to the language. This reduced form, or root word, is called a lemma.

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
conference_help_text = (
    "Gus is helping organize a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
    if str(token) != str(token.lemma_):
        print(f"{str(token):>20} : {str(token.lemma_)}")

###Word Embeddings

Word embedding techniques - Word2Vec, GloVe, and FastText using Gensim library in Python.

In [None]:
from gensim.models import Word2Vec
sentences = [['I', 'love', 'machine', 'learning'],
             ['Word', 'embeddings', 'are', 'powerful'],
             ['Natural', 'language', 'processing', 'is', 'interesting']]

model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
word_vector = model_w2v.wv['learning']
print(word_vector)


[-0.00515624 -0.00666834 -0.00777684  0.00831073 -0.00198234 -0.00685496
 -0.00415439  0.00514413 -0.00286914 -0.00374966  0.00162143 -0.00277629
 -0.00158436  0.00107449 -0.00297794  0.00851928  0.00391094 -0.00995886
  0.0062596  -0.00675425  0.00076943  0.00440423 -0.00510337 -0.00211067
  0.00809548 -0.00424379 -0.00763626  0.00925791 -0.0021555  -0.00471943
  0.0085708   0.00428334  0.00432484  0.00928451 -0.00845308  0.00525532
  0.00203935  0.00418828  0.0016979   0.00446413  0.00448629  0.00610452
 -0.0032021  -0.00457573 -0.00042652  0.00253373 -0.00326317  0.00605772
  0.00415413  0.00776459  0.00256927  0.00811668 -0.00138721  0.00807793
  0.00371702 -0.00804732 -0.00393361 -0.00247188  0.00489304 -0.00087216
 -0.00283091  0.00783371  0.0093229  -0.00161493 -0.00515925 -0.00470176
 -0.00484605 -0.00960283  0.00137202 -0.00422492  0.00252671  0.00561448
 -0.00406591 -0.00959658  0.0015467  -0.00670012  0.00249517 -0.00378063
  0.00707842  0.00064022  0.00356094 -0.00273913 -0

In [None]:
from gensim.models import FastText
sentences = [['I', 'love', 'machine', 'learning'],
             ['Word', 'embeddings', 'are', 'powerful'],
             ['Natural', 'language', 'processing', 'is', 'interesting']]

model_fasttext = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
word_vector = model_fasttext.wv['learning']
print(word_vector)


[ 9.1823016e-04 -3.0900480e-04 -1.4241622e-03  3.6915974e-04
 -1.6989922e-03 -4.7368856e-04 -9.6330949e-04 -8.1712339e-04
  1.4180044e-03  3.7115382e-04 -1.1983200e-04  2.9081755e-04
 -9.9326216e-04  1.0794357e-03  5.1063707e-04 -1.1530268e-03
 -1.5505445e-03 -1.9562184e-03  5.7489559e-04 -2.0934530e-03
 -1.8944317e-03 -2.0447427e-03  1.0821981e-03 -2.2537062e-04
 -5.0764071e-04 -1.0467614e-03 -6.9561160e-05 -7.7934202e-04
 -3.2007642e-04 -1.4727070e-03 -1.4016990e-03  1.8091477e-03
  7.5201882e-04  4.1703676e-04 -1.6488129e-04  1.2592709e-03
 -4.0854918e-04  5.9077330e-04 -1.3326405e-03 -8.8910357e-04
 -1.0974610e-03 -1.0485642e-03 -7.7783951e-04  6.8859063e-04
 -1.1026027e-03  3.0985163e-04 -8.7897031e-05  4.3270687e-04
  2.0680524e-04  7.2805583e-04  1.5761263e-03 -4.8463579e-04
 -3.9394345e-04  1.9359115e-03  9.9428324e-04  7.9648633e-04
 -2.5278129e-04  2.9308398e-04 -6.4702188e-05 -9.9841051e-04
  3.2134666e-04 -6.8074651e-04 -1.5585172e-03  1.4566220e-03
 -1.5276216e-03 -1.03553

FastText with spacy:

In [None]:
from gensim.models import FastText
import spacy

# Sample sentences
sentences = ['I love machine learning.',
             'Word embeddings are powerful.',
             'Natural language processing is interesting.']

# Tokenize and preprocess the sentences using spacy
nlp = spacy.load('en_core_web_sm')
preprocessed_sentences = [[token.text.lower() for token in nlp(sentence) if token.is_alpha] for sentence in sentences]

# Train FastText model
model_fasttext = FastText(sentences=preprocessed_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Get word vector for 'learning'
word_vector = model_fasttext.wv['learning']
print(word_vector)


[ 9.1823016e-04 -3.0900480e-04 -1.4241622e-03  3.6915974e-04
 -1.6989922e-03 -4.7368856e-04 -9.6330949e-04 -8.1712339e-04
  1.4180044e-03  3.7115382e-04 -1.1983200e-04  2.9081755e-04
 -9.9326216e-04  1.0794357e-03  5.1063707e-04 -1.1530268e-03
 -1.5505445e-03 -1.9562184e-03  5.7489559e-04 -2.0934530e-03
 -1.8944317e-03 -2.0447427e-03  1.0821981e-03 -2.2537062e-04
 -5.0764071e-04 -1.0467614e-03 -6.9561160e-05 -7.7934202e-04
 -3.2007642e-04 -1.4727070e-03 -1.4016990e-03  1.8091477e-03
  7.5201882e-04  4.1703676e-04 -1.6488129e-04  1.2592709e-03
 -4.0854918e-04  5.9077330e-04 -1.3326405e-03 -8.8910357e-04
 -1.0974610e-03 -1.0485642e-03 -7.7783951e-04  6.8859063e-04
 -1.1026027e-03  3.0985163e-04 -8.7897031e-05  4.3270687e-04
  2.0680524e-04  7.2805583e-04  1.5761263e-03 -4.8463579e-04
 -3.9394345e-04  1.9359115e-03  9.9428324e-04  7.9648633e-04
 -2.5278129e-04  2.9308398e-04 -6.4702188e-05 -9.9841051e-04
  3.2134666e-04 -6.8074651e-04 -1.5585172e-03  1.4566220e-03
 -1.5276216e-03 -1.03553

###Word Frequency

We can now convert a given text into tokens and perform statistical analysis on it. This analysis can give us various insights, such as common words or unique words in the text:

In [None]:
import spacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)
complete_doc = nlp(complete_text)

words = [
    token.text
    for token in complete_doc
    if not token.is_stop and not token.is_punct
]

print(Counter(words).most_common(5))


[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]


By looking just at the common words, we can probably assume that the text is about Gus, London, and Natural Language Processing. That’s a significant finding! If we can just look at the most common words, that may save us a lot of reading, because we can immediately tell if the text is about something that interests us or not.

In [None]:
Counter(
    [token.text for token in complete_doc if not token.is_punct]
).most_common(5)

[('is', 10), ('a', 5), ('in', 5), ('Gus', 4), ('of', 4)]

Four out of five of the most common words are stop words that don’t really tell us much about the summarized text. This is why stop words are often considered noise for many applications.

###Part of Speech Tagging

Part of speech or POS is a grammatical role that explains how a particular word is used in a sentence. There are typically eight parts of speech:

* Noun
* Pronoun
* Adjective
* Verb
* Adverb
* Preposition
* Conjunction
* Interjection

Part-of-speech tagging is the process of assigning a POS tag to each token depending on its usage in the sentence. POS tags are useful for assigning a syntactic category like noun or verb to each word

In spaCy, POS tags are available as an attribute on the Token object:

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)
for token in about_doc:
    print(
        f"""
TOKEN: {str(token)}
=====
TAG: {str(token.tag_):10} POS: {token.pos_}
EXPLANATION: {spacy.explain(token.tag_)}"""
    )



TOKEN: Gus
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: Proto
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: is
=====
TAG: VBZ        POS: AUX
EXPLANATION: verb, 3rd person singular present

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: Python
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: developer
=====
TAG: NN         POS: NOUN
EXPLANATION: noun, singular or mass

TOKEN: currently
=====
TAG: RB         POS: ADV
EXPLANATION: adverb

TOKEN: working
=====
TAG: VBG        POS: VERB
EXPLANATION: verb, gerund or present participle

TOKEN: for
=====
TAG: IN         POS: ADP
EXPLANATION: conjunction, subordinating or preposition

TOKEN: a
=====
TAG: DT         POS: DET
EXPLANATION: determiner

TOKEN: London
=====
TAG: NNP        POS: PROPN
EXPLANATION: noun, proper singular

TOKEN: -
=====
TAG: HYPH       POS: PUNCT
EXPLANATION: punctuation mark, hyphen

TOKEN: based
=====
TAG

By using POS tags, we can extract a particular category of words:

In [None]:
nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == "NOUN":
        nouns.append(token)
    if token.pos_ == "ADJ":
        adjectives.append(token)


In [None]:
nouns

[developer, company]

In [None]:
adjectives

[interested]

###Visualization: Using displaCy

spaCy comes with a built-in visualizer called displaCy. We can use it to visualize a dependency parse or named entities in a browser. We can use displaCy to find POS tags for tokens:

In [None]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

about_interest_text = (
    "He is interested in learning Natural Language Processing."
)
about_interest_doc = nlp(about_interest_text)
# displacy.serve(about_interest_doc, style="dep")
displacy.render(about_interest_doc, style="dep", jupyter=True)

###Preprocessing Functions

A preprocessing function converts text to an analyzable format. It's typical for most NLP tasks. Here's an example:

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

In [None]:
complete_doc = nlp(complete_text)
def is_token_allowed(token):
    return bool(
        token
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
    )

def preprocess_token(token):
    return token.lemma_.strip().lower()

complete_filtered_tokens = [
    preprocess_token(token)
    for token in complete_doc
    if is_token_allowed(token)
]

complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

###Rule-Based Matching Using spaCy

Rule-based matching is one of the steps in extracting information from unstructured text. It's used to identify and extract tokens and phrases according to patterns (such as lowercase) and grammatical features (such as part of speech).

While you can use regular expressions to extract entities (such as phone numbers), rule-based matching in spaCy is more powerful than regex alone, because you can include semantic or grammatical filters.

For example, with rule-based matching, you can extract a first name and a last name, which are always proper nouns:

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)
about_doc = nlp(about_text)


In [None]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    matcher.add("FULL_NAME", [pattern])
    matches = matcher(nlp_doc)
    for _, start, end in matches:
        span = nlp_doc[start:end]
        yield span.text



In [None]:
next(extract_full_name(about_doc))

'Gus Proto'

###Dependency Parsing Using spaCy

Dependency parsing is the process of extracting the dependency graph of a sentence to represent its grammatical structure. It defines the dependency relationship between headwords and their dependents. The head of a sentence has no dependency and is called the root of the sentence. The verb is usually the root of the sentence. All other words are linked to the headword.

The dependencies can be mapped in a directed graph representation where:

Words are the nodes.
Grammatical relationships are the edges.
Dependency parsing helps you know what role a word plays in the text and how different words relate to each other.

Here's how you can use dependency parsing to find the relationships between words:

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
piano_text = "Gus is learning piano"
piano_doc = nlp(piano_text)
for token in piano_doc:
    print(
        f"""
TOKEN: {token.text}
=====
{token.tag_ = }
{token.head.text = }
{token.dep_ = }"""
    )


TOKEN: Gus
=====
token.tag_ = 'NNP'
token.head.text = 'learning'
token.dep_ = 'nsubj'

TOKEN: is
=====
token.tag_ = 'VBZ'
token.head.text = 'learning'
token.dep_ = 'aux'

TOKEN: learning
=====
token.tag_ = 'VBG'
token.head.text = 'learning'
token.dep_ = 'ROOT'

TOKEN: piano
=====
token.tag_ = 'NN'
token.head.text = 'learning'
token.dep_ = 'dobj'


In this example, the sentence contains three relationships:

nsubj is the subject of the word, and its headword is a verb.
aux is an auxiliary word, and its headword is a verb.
dobj is the direct object of the verb, and its headword is also a verb.
The list of relationships isn't particular to spaCy. Rather, it's an evolving field of linguistics research.

You can also use displaCy to visualize the dependency tree of the sentence:

In [None]:
displacy.serve(piano_doc, style="dep")

Shutting down server on port 5000.


###Tree and Subtree Navigation

The dependency graph has all the properties of a tree. This tree contains information about sentence structure and grammar and can be traversed in different ways to extract relationships.

spaCy provides attributes like .children, .lefts, .rights, and .subtree to make navigating the parse tree easier. Here are a few examples of using those attributes:

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
one_line_about_text = (
    "Gus Proto is a Python developer"
    " currently working for a London-based Fintech company"
)
one_line_about_doc = nlp(one_line_about_text)

# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])


# Extract previous neighboring node of `developer`
print (one_line_about_doc[5].nbor(-1))


# Extract next neighboring node of `developer`
print (one_line_about_doc[5].nbor())


# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])


# Extract tokens on the right of `developer`
print([token.text for token in one_line_about_doc[5].rights])


# Print subtree of `developer`
print (list(one_line_about_doc[5].subtree))

['a', 'Python', 'working']
Python
currently
['a', 'Python']
['working']
[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


###Shallow Parsing

Shallow parsing, or chunking, is the process of extracting phrases from unstructured text. This involves chunking groups of adjacent tokens into phrases on the basis of their POS tags. There are some standard well-known chunks such as noun phrases, verb phrases, and prepositional phrases.

Noun Phrase Detection
A noun phrase is a phrase that has a noun as its head. It could also include other kinds of words, such as adjectives, ordinals, and determiners. Noun phrases are useful for explaining the context of the sentence. They help you understand what the sentence is about.

spaCy has the property .noun_chunks on the Doc object. You can use this property to extract noun phrases:

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

conference_text = (
    "There is a developer conference happening on 21 July 2019 in London."
)
conference_doc = nlp(conference_text)

# Extract Noun Phrases
for chunk in conference_doc.noun_chunks:
    print (chunk)

a developer conference
21 July
London


**Verb Phrase Detection**

A verb phrase is a syntactic unit composed of at least one verb. This verb can be joined by other chunks, such as noun phrases. Verb phrases are useful for understanding the actions that nouns are involved in.

spaCy has no built-in functionality to extract verb phrases, so you’ll need a library called textacy. You can use pip to install textacy:

In [1]:
!pip install textacy

Collecting textacy
  Downloading textacy-0.13.0-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting cytoolz>=0.10.1 (from textacy)
  Downloading cytoolz-0.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting floret~=0.10.0 (from textacy)
  Downloading floret-0.10.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.3/320.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jellyfish>=0.8.0 (from textacy)
  Downloading jellyfish-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyphen

In [2]:
import textacy

about_talk_text = (
    "The talk will introduce reader about use"
    " cases of Natural Language Processing in"
    " Fintech, making use of"
    " interesting examples along the way."
)

patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
about_talk_doc = textacy.make_spacy_doc(
    about_talk_text, lang="en_core_web_sm"
)
verb_phrases = textacy.extract.token_matches(
    about_talk_doc, patterns=patterns
)

# Print all verb phrases
for chunk in verb_phrases:
    print(chunk.text)



# Extract noun phrase to explain what nouns are involved
for chunk in about_talk_doc.noun_chunks:
    print (chunk)

will introduce
The talk
reader
use cases
Natural Language Processing
Fintech
use
interesting examples
the way


In this example, the verb phrase introduce indicates that something will be introduced. By looking at the noun phrases, you can piece together what will be introduced—again, without having to read the whole text.