In [1]:
import spacy

# Load the large English pipeline once; every cell below reuses `nlp`.
nlp = spacy.load("en_core_web_lg")

* nlp(text) creates a 'Doc' object

In [2]:
# One sentence that deliberately contains a number, a currency amount,
# an e-mail address and a URL so several token attributes can be exercised.
sample_text = (
    'It is a random sentence for checking minimum 10 of features of spacy, '
    'just to check $5 for currency, '
    'aj@gmail.co.in & wwww.aj.com for checking on emails'
)

# Running the pipeline on the raw string produces the parsed document.
doc = nlp(sample_text)

type(doc)

spacy.tokens.doc.Doc

* Inside the 'Doc' object, each word (and each punctuation mark or symbol) is a 'Token'

In [3]:
# Type of every element obtained by iterating over the document.
list(map(type, doc))

[spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token]

### Checking dependencies

In [4]:
# (text, dependency label, plain-English description of that label) per token.
# spacy.explain gives None for labels it has no description for.
[
    (tok.text, tok.dep_, spacy.explain(tok.dep_))
    for tok in doc
]

[('It', 'nsubj', 'nominal subject'),
 ('is', 'ROOT', None),
 ('a', 'det', 'determiner'),
 ('random', 'amod', 'adjectival modifier'),
 ('sentence', 'attr', 'attribute'),
 ('for', 'prep', 'prepositional modifier'),
 ('checking', 'pcomp', 'complement of preposition'),
 ('minimum', 'amod', 'adjectival modifier'),
 ('10', 'dobj', 'direct object'),
 ('of', 'prep', 'prepositional modifier'),
 ('features', 'pobj', 'object of preposition'),
 ('of', 'prep', 'prepositional modifier'),
 ('spacy', 'pobj', 'object of preposition'),
 (',', 'punct', 'punctuation'),
 ('just', 'advmod', 'adverbial modifier'),
 ('to', 'aux', 'auxiliary'),
 ('check', 'advcl', 'adverbial clause modifier'),
 ('$', 'nmod', 'modifier of nominal'),
 ('5', 'dobj', 'direct object'),
 ('for', 'prep', 'prepositional modifier'),
 ('currency', 'pobj', 'object of preposition'),
 (',', 'punct', 'punctuation'),
 ('aj@gmail.co.in', 'conj', 'conjunct'),
 ('&', 'cc', 'coordinating conjunction'),
 ('wwww.aj.com', 'conj', 'conjunct'),
 ('fo

### Check whether each token is alphabetic

In [5]:
# Pair each token's text with its `is_alpha` flag.
[
    (word.text, word.is_alpha)
    for word in doc
]

[('It', True),
 ('is', True),
 ('a', True),
 ('random', True),
 ('sentence', True),
 ('for', True),
 ('checking', True),
 ('minimum', True),
 ('10', False),
 ('of', True),
 ('features', True),
 ('of', True),
 ('spacy', True),
 (',', False),
 ('just', True),
 ('to', True),
 ('check', True),
 ('$', False),
 ('5', False),
 ('for', True),
 ('currency', True),
 (',', False),
 ('aj@gmail.co.in', False),
 ('&', False),
 ('wwww.aj.com', False),
 ('for', True),
 ('checking', True),
 ('on', True),
 ('emails', True)]

### Check for currency in the text

In [6]:
# Token that directly follows each currency symbol (e.g. the `5` in `$5`).
# `token.i` is the token's position in `doc`; the bounds check skips a currency
# symbol that is the very last token, where `doc[token.i + 1]` would raise IndexError.
[doc[token.i + 1] for token in doc if token.is_currency and token.i + 1 < len(doc)]

[5]

### Check for stopwords

In [7]:
# Pair each token's text with its `is_stop` flag.
[
    (tok.text, tok.is_stop)
    for tok in doc
]

[('It', True),
 ('is', True),
 ('a', True),
 ('random', False),
 ('sentence', False),
 ('for', True),
 ('checking', False),
 ('minimum', False),
 ('10', False),
 ('of', True),
 ('features', False),
 ('of', True),
 ('spacy', False),
 (',', False),
 ('just', True),
 ('to', True),
 ('check', False),
 ('$', False),
 ('5', False),
 ('for', True),
 ('currency', False),
 (',', False),
 ('aj@gmail.co.in', False),
 ('&', False),
 ('wwww.aj.com', False),
 ('for', True),
 ('checking', False),
 ('on', True),
 ('emails', False)]

In [8]:
# Keep the text of every token that is not a stop word, in document order.
# Built directly as a list instead of calling .append inside a throwaway comprehension.
list_without_stopwords = [str(token) for token in doc if not token.is_stop]
print('Before removing the stopwords: \n', doc)
print()
# Tokens are joined with single spaces, so punctuation ends up space-separated.
print('After removing the stopwords: \n', ' '.join(list_without_stopwords))

Before removing the stopwords: 
 It is a random sentence for checking minimum 10 of features of spacy, just to check $5 for currency, aj@gmail.co.in & wwww.aj.com for checking on emails

After removing the stopwords: 
 random sentence checking minimum 10 features spacy , check $ 5 currency , aj@gmail.co.in & wwww.aj.com checking emails


### Lemmatization
* Lemmatization normalizes text by reducing each word to its root (dictionary) form

In [9]:
# One printed row per token: surface text, a tab gap, then the lemma.
for token in doc:
    row = (token.text, '\t\t', token.lemma_)
    print(*row)

It 		 -PRON-
is 		 be
a 		 a
random 		 random
sentence 		 sentence
for 		 for
checking 		 check
minimum 		 minimum
10 		 10
of 		 of
features 		 feature
of 		 of
spacy 		 spacy
, 		 ,
just 		 just
to 		 to
check 		 check
$ 		 $
5 		 5
for 		 for
currency 		 currency
, 		 ,
aj@gmail.co.in 		 aj@gmail.co.in
& 		 &
wwww.aj.com 		 wwww.aj.com
for 		 for
checking 		 check
on 		 on
emails 		 email


In [10]:
# Collect the lemma of every token, in document order.
# Built directly as a list instead of calling .append inside a throwaway comprehension.
try_list = [token.lemma_ for token in doc]
print('Original sentence:\n', doc)
print()
# Lemmas are joined with single spaces, so punctuation ends up space-separated.
print('Lemmatized or normalized sentence:\n', ' '.join(try_list))

Original sentence:
 It is a random sentence for checking minimum 10 of features of spacy, just to check $5 for currency, aj@gmail.co.in & wwww.aj.com for checking on emails

Lemmatized or normalized sentence:
 -PRON- be a random sentence for check minimum 10 of feature of spacy , just to check $ 5 for currency , aj@gmail.co.in & wwww.aj.com for check on email


### Check for emails in the text

In [11]:
# Tokens whose `like_email` flag is set.
list(filter(lambda tok: tok.like_email, doc))

[aj@gmail.co.in]

### Check for URLs in the text

In [12]:
# Tokens whose `like_url` flag is set.
list(filter(lambda tok: tok.like_url, doc))

[wwww.aj.com]

### Parts of speech for each word

In [13]:
# Pair each token's text with its coarse part-of-speech tag.
[
    (word.text, word.pos_)
    for word in doc
]

[('It', 'PRON'),
 ('is', 'AUX'),
 ('a', 'DET'),
 ('random', 'ADJ'),
 ('sentence', 'NOUN'),
 ('for', 'ADP'),
 ('checking', 'VERB'),
 ('minimum', 'NOUN'),
 ('10', 'NUM'),
 ('of', 'ADP'),
 ('features', 'NOUN'),
 ('of', 'ADP'),
 ('spacy', 'ADJ'),
 (',', 'PUNCT'),
 ('just', 'ADV'),
 ('to', 'PART'),
 ('check', 'VERB'),
 ('$', 'SYM'),
 ('5', 'NUM'),
 ('for', 'ADP'),
 ('currency', 'NOUN'),
 (',', 'PUNCT'),
 ('aj@gmail.co.in', 'PROPN'),
 ('&', 'CCONJ'),
 ('wwww.aj.com', 'X'),
 ('for', 'ADP'),
 ('checking', 'VERB'),
 ('on', 'ADP'),
 ('emails', 'NOUN')]

### Practise the features of tokens in a PDF document

In [14]:
from pdfminer.high_level import extract_text

In [15]:
# Pull the plain text out of the brochure PDF and parse it.
# NOTE: this rebinds `doc`, so the cells below operate on the brochure text,
# not on the sample sentence parsed earlier.
raw_text = extract_text(
    'Brochure for Deep Learning and Its Application13.06.2020.pdf'
)
doc = nlp(raw_text)

### Search for URLs in the text

In [16]:
# URL-like tokens found in the brochure.
[
    tok
    for tok in doc
    if tok.like_url
]

[www.nitp.ac.in/ict/,
 https://forms.gle/zJkAGnega2VajhUF7,
 www.nitp.ac.in,
 http://www.nitp.ac.in/ict/]

### Search for emails

In [17]:
# E-mail-like tokens found in the brochure.
[
    tok
    for tok in doc
    if tok.like_email
]

[mukesh.kumar@nitp.ac.in]

### Search for currency

In [18]:
# For every token whose text is exactly 'Rs', pair it with the span of up to
# three tokens that follow it (where the amount appears in this brochure).
# `tok.i` is the token's position in `doc`; slicing past the end is safe.
[
    (doc[tok.i], doc[tok.i + 1 : tok.i + 4])
    for tok in doc
    if tok.text == 'Rs'
]

[(Rs,
  . 500/- 
    ),
 (Rs, .  500/-),
 (Rs,
  . 1000/- 
  )]

### Search for dates
* This is not the best method, but with only basic knowledge of spaCy we can still detect dates; however, the extracted values are not very useful as they stand

In [19]:
# Named entities labelled DATE; items of `doc.ents` are entity spans, not single tokens.
[
    (ent.text, ent.label_)
    for ent in doc.ents
    if ent.label_ == 'DATE'
]

[('2020', 'DATE'),
 ('1886', 'DATE'),
 ('1924', 'DATE'),
 ('One-week', 'DATE'),
 ('6-8 years', 'DATE'),
 ('17th to 22nd ,', 'DATE'),
 ('June', 'DATE'),
 ('2020', 'DATE'),
 ('years', 'DATE')]

* Using only the built-in token attributes (plus a small amount of custom logic for currency), we can extract the emails, URLs and currency amounts very accurately; not a single email or URL was missed in this document.