In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [9]:
introduction_text = ('This tutorial is about Natural'
                    ' Language Processing in Spacy.')
introduction_doc = nlp(introduction_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_doc])
['This', 'tutorial', 'is', 'about', 'Natural', 'Language',
'Processing', 'in', 'Spacy', '.']

['This', 'tutorial', 'is', 'about', 'Natural', 'Language', 'Processing', 'in', 'Spacy', '.']


['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'Spacy',
 '.']

In [12]:
introduction_file_text = open(r'C:\Users\49173\Desktop\introduction.txt').read()
introduction_file_doc = nlp(introduction_file_text)
# Extract tokens for the given doc
print ([token.text for token in introduction_file_doc])
['This', 'tutorial', 'is', 'about', 'Natural', 'Language',
'Processing', 'in', 'Spacy', '.', '\n']

['This', 'tutorial', 'is', 'about', "Natural''Language", 'Processing', 'in', 'Spacy']


['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'Spacy',
 '.',
 '\n']

In [15]:
about_text = ('Gus Proto is a Python developer currently'
                  ' working for a London-based Fintech'
                  ' company. He is interested in learning'
                  ' Natural Language Processing.')
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)
for sentence in sentences:
    print (sentence)

Gus Proto is a Python developer currently working for a London-based Fintech company.
He is interested in learning Natural Language Processing.


In [18]:
def set_custom_boundaries(doc):
        # Adds support to use `...` as the delimiter for sentence detection
        for token in doc[:-1]:
            if token.text == '...':
                doc[token.i+1].is_sent_start = True
        return doc  
ellipsis_text = ('Gus, can you, ... never mind, I forgot'
                     ' what I was saying. So, do you think'
                     ' we should ...')
# Load a new model instance
custom_nlp = spacy.load('en_core_web_sm')
custom_nlp.add_pipe(set_custom_boundaries, before='parser')
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
       print(sentence)

# Sentence Detection with no customization
ellipsis_doc = nlp(ellipsis_text)
ellipsis_sentences = list(ellipsis_doc.sents)
for sentence in ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...
Gus, can you, ... never mind, I forgot what I was saying.
So, do you think we should ...


In [19]:
for token in about_doc:
    print (token, token.idx)

Gus 0
Proto 4
is 10
a 13
Python 15
developer 22
currently 32
working 42
for 50
a 54
London 56
- 62
based 63
Fintech 69
company 77
. 84
He 86
is 89
interested 92
in 103
learning 106
Natural 115
Language 123
Processing 132
. 142


In [20]:
for token in about_doc:
    print (token, token.idx, token.text_with_ws,
              token.is_alpha, token.is_punct, token.is_space,
               token.shape_, token.is_stop)

Gus 0 Gus  True False False Xxx False
Proto 4 Proto  True False False Xxxxx False
is 10 is  True False False xx True
a 13 a  True False False x True
Python 15 Python  True False False Xxxxx False
developer 22 developer  True False False xxxx False
currently 32 currently  True False False xxxx False
working 42 working  True False False xxxx False
for 50 for  True False False xxx True
a 54 a  True False False x True
London 56 London True False False Xxxxx False
- 62 - False True False - False
based 63 based  True False False xxxx False
Fintech 69 Fintech  True False False Xxxxx False
company 77 company True False False xxxx False
. 84 .  False True False . False
He 86 He  True False False Xx True
is 89 is  True False False xx True
interested 92 interested  True False False xxxx False
in 103 in  True False False xx True
learning 106 learning  True False False xxxx False
Natural 115 Natural  True False False Xxxxx False
Language 123 Language  True False False Xxxxx False
Processing 132 Pro

In [23]:
import re
import spacy
from spacy.tokenizer import Tokenizer
custom_nlp = spacy.load('en_core_web_sm')
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
infix_re = re.compile(r'''[-~]''')
def customize_tokenizer(nlp):
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                      infix_finditer=infix_re.finditer,
                     token_match=None
                      )
custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)
print([token.text for token in custom_tokenizer_about_doc])

['Gus', 'Proto', 'is', 'a', 'Python', 'developer', 'currently', 'working', 'for', 'a', 'London', '-', 'based', 'Fintech', 'company', '.', 'He', 'is', 'interested', 'in', 'learning', 'Natural', 'Language', 'Processing', '.']


In [24]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)
for stop_word in list(spacy_stopwords)[:10]:
     print(stop_word)

herein
his
is
several
very
does
twenty
through
other
thru


In [25]:
for token in about_doc:
     if not token.is_stop:
            print (token)

Gus
Proto
Python
developer
currently
working
London
-
based
Fintech
company
.
interested
learning
Natural
Language
Processing
.


In [26]:
about_no_stopword_doc = [token for token in about_doc if not token.is_stop]
print (about_no_stopword_doc)

[Gus, Proto, Python, developer, currently, working, London, -, based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


In [27]:
conference_help_text = ('Gus is helping organize a developer'
     'conference on Applications of Natural Language'
     ' Processing. He keeps organizing local Python meetups'
     ' and several internal talks at his workplace.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
     print (token, token.lemma_)

Gus Gus
is be
helping help
organize organize
a a
developerconference developerconference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He -PRON-
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his -PRON-
workplace workplace
. .


In [30]:
from collections import Counter
complete_text = ('Gus Proto is a Python developer currently'
     'working for a London-based Fintech company. He is'
     ' interested in learning Natural Language Processing.'
     ' There is a developer conference happening on 21 July'
     ' 2019 in London. It is titled "Applications of Natural'
     ' Language Processing". There is a helpline number '
     ' available at +1-1234567891. Gus is helping organize it.'
     ' He keeps organizing local Python meetups and several'
     ' internal talks at his workplace. Gus is also presenting'
     ' a talk. The talk will introduce the reader about "Use'
     ' cases of Natural Language Processing in Fintech".'
     ' Apart from his work, he is very passionate about music.'
     ' Gus is learning to play the Piano. He has enrolled '
     ' himself in the weekend batch of Great Piano Academy.'
     ' Great Piano Academy is situated in Mayfair or the City'
     ' of London and has world-class piano instructors.')

complete_doc = nlp(complete_text)
 # Remove stop words and punctuation symbols
words = [token.text for token in complete_doc
          if not token.is_stop and not token.is_punct]
word_freq = Counter(words)
 # 5 commonly occurring words with their frequencies
common_words = word_freq.most_common(5)
print (common_words)
 # Unique words
unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]
['Proto', 'currentlyworking', 'based', 'company', 'interested', 'conference', 'happening', '21', 'July', '2019', 'titled', 'Applications', 'helpline', 'number', 'available', '+1', '1234567891', 'helping', 'organize', 'keeps', 'organizing', 'local', 'meetups', 'internal', 'talks', 'workplace', 'presenting', 'introduce', 'reader', 'Use', 'cases', 'Apart', 'work', 'passionate', 'music', 'play', 'enrolled', 'weekend', 'batch', 'situated', 'Mayfair', 'City', 'world', 'class', 'piano', 'instructors']


In [31]:
words_all = [token.text for token in complete_doc if not token.is_punct]
word_freq_all = Counter(words_all)
# 5 commonly occurring words with their frequencies
common_words_all = word_freq_all.most_common(5)
print (common_words_all)

[('is', 10), ('a', 5), ('in', 5), ('Gus', 4), ('of', 4)]


In [32]:
for token in about_doc:
     print (token, token.tag_, token.pos_, spacy.explain(token.tag_))

Gus NNP PROPN noun, proper singular
Proto NNP PROPN noun, proper singular
is VBZ AUX verb, 3rd person singular present
a DT DET determiner
Python NNP PROPN noun, proper singular
developer NN NOUN noun, singular or mass
currently RB ADV adverb
working VBG VERB verb, gerund or present participle
for IN ADP conjunction, subordinating or preposition
a DT DET determiner
London NNP PROPN noun, proper singular
- HYPH PUNCT punctuation mark, hyphen
based VBN VERB verb, past participle
Fintech NNP PROPN noun, proper singular
company NN NOUN noun, singular or mass
. . PUNCT punctuation mark, sentence closer
He PRP PRON pronoun, personal
is VBZ AUX verb, 3rd person singular present
interested JJ ADJ adjective
in IN ADP conjunction, subordinating or preposition
learning VBG VERB verb, gerund or present participle
Natural NNP PROPN noun, proper singular
Language NNP PROPN noun, proper singular
Processing NNP PROPN noun, proper singular
. . PUNCT punctuation mark, sentence closer


In [35]:
nouns = []
adjectives = []
for token in about_doc:
    if token.pos_ == 'NOUN':
        nouns.append(token)
    if token.pos_ == 'ADJ':
        adjectives.append(token)

print(nouns)
print(adjectives)

[developer, company]
[interested]


In [37]:
from spacy import displacy
about_interest_text = ('He is interested in learning'
    ' Natural Language Processing.')
about_interest_doc = nlp(about_interest_text)
displacy.serve(about_interest_doc, style='dep')


[W011] It looks like you're calling displacy.serve from within a Jupyter notebook or a similar environment. This likely means you're already running a local web server, so there's no need to make displaCy start another one. Instead, you should be able to replace displacy.serve with displacy.render to show the visualization.




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [41]:
def is_token_allowed(token):
    if (not token or not token.string.strip() or token.is_stop or token.is_punct):
         return False
    return True

def preprocess_token(token):
     # Reduce token to its lowercase lemma form
    return token.lemma_.strip().lower()

complete_filtered_tokens = [preprocess_token(token)
for token in complete_doc if is_token_allowed(token)]
complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currentlyworke',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'applications',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+1',
 '1234567891',
 'gus',
 'help',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

In [45]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
def extract_full_name(nlp_doc):
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    matcher.add('FULL_NAME', None, pattern)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text

extract_full_name(about_doc)

'Gus Proto'

In [46]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
conference_org_text = ('There is a developer conference'
     'happening on 21 July 2019 in London. It is titled'
     ' "Applications of Natural Language Processing".'
     ' There is a helpline number available'
     ' at (123) 456-789')

def extract_phone_number(nlp_doc):
    pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
                {'ORTH': ')'}, {'SHAPE': 'ddd'},
                {'ORTH': '-', 'OP': '?'},
                {'SHAPE': 'ddd'}]
    matcher.add('PHONE_NUMBER', None, pattern)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text
conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)

'(123) 456-789'

In [47]:
piano_text = 'Gus is learning piano'
piano_doc = nlp(piano_text)
for token in piano_doc:
    print (token.text, token.tag_, token.head.text, token.dep_)

Gus NNP learning nsubj
is VBZ learning aux
learning VBG learning ROOT
piano NN learning dobj


In [49]:
one_line_about_text = ('Gus Proto is a Python developer'
     ' currently working for a London-based Fintech company')
one_line_about_doc = nlp(one_line_about_text)
# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])
print (one_line_about_doc[5].nbor(-1))
print (one_line_about_doc[5].nbor())
print([token.text for token in one_line_about_doc[5].lefts])
print([token.text for token in one_line_about_doc[5].rights])
print (list(one_line_about_doc[5].subtree))

['a', 'Python', 'working']
Python
currently
['a', 'Python']
['working']
[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


In [51]:
def flatten_tree(tree):
     return ''.join([token.text_with_ws for token in list(tree)]).strip
print (flatten_tree(one_line_about_doc[5].subtree))

<built-in method strip of str object at 0x00000200F865CD50>


In [53]:
conference_text = ('There is a developer conference'
     ' happening on 21 July 2019 in London.')
conference_doc = nlp(conference_text)
# Extract Noun Phrases
for chunk in conference_doc.noun_chunks:
    print (chunk)

a developer conference
21 July
London
