# POS tagging examples using NLTK and spaCy

Read the accompanying blog post: https://jenniferkwentoh.com/part-of-speech-tagging-examples-in-python/

**Using NLTK library**

In [1]:
import nltk
from nltk import word_tokenize, pos_tag
import nltk.help
# Fetch the NLTK resources the cells below rely on; each call is a no-op when
# the package is already up to date.
#   punkt                      -> tokenizer models required by word_tokenize
#   averaged_perceptron_tagger -> model used by pos_tag
#   tagsets                    -> tag descriptions shown by nltk.help.upenn_tagset
# NOTE(review): recent NLTK releases may look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead -- confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JenniferKwentoh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\JenniferKwentoh\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

Tokenization

In [2]:
# Split a short example sentence into word tokens with NLTK.
text = 'Abuja is a beautiful city'
tokens = word_tokenize(text)
# Bare last expression so the notebook displays the token list.
tokens

['Abuja', 'is', 'a', 'beautiful', 'city']

In [3]:
# Tag the tokens from the previous cell; the result is a list of (token, tag) pairs.
pos_tag(tokens)

[('Abuja', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('beautiful', 'JJ'),
 ('city', 'NN')]

In [4]:
# A longer, multi-sentence passage to tag. It contains typographic punctuation
# (curly apostrophe/quotes and an em dash), which affects tokenization below.
text = '''Washing your hands is easy, and it’s one of the most effective ways to prevent the spread of germs. Clean hands can stop germs from spreading from one person to another and throughout an entire community—from your home and workplace to childcare facilities and hospitals.
Follow these five steps every time.
Wet your hands with clean, running water (warm or cold), turn off the tap, and apply soap.
Lather your hands by rubbing them together with the soap. Lather the backs of your hands, between your fingers, and under your nails.
Scrub your hands for at least 20 seconds. Need a timer? Hum the “Happy Birthday” song from beginning to end twice.
Rinse your hands well under clean, running water.
Dry your hands using a clean towel or air dry them.'''

# Tokenize, then tag. In the output, "it’s" comes out as 'it', '’', 's' and
# "community—from" stays a single token, so the tags on those pieces are unreliable.
tokens = word_tokenize(text)
pos_tag(tokens)

[('Washing', 'VBG'),
 ('your', 'PRP$'),
 ('hands', 'NNS'),
 ('is', 'VBZ'),
 ('easy', 'JJ'),
 (',', ','),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('’', 'VBD'),
 ('s', 'JJ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('most', 'RBS'),
 ('effective', 'JJ'),
 ('ways', 'NNS'),
 ('to', 'TO'),
 ('prevent', 'VB'),
 ('the', 'DT'),
 ('spread', 'NN'),
 ('of', 'IN'),
 ('germs', 'NNS'),
 ('.', '.'),
 ('Clean', 'JJ'),
 ('hands', 'NNS'),
 ('can', 'MD'),
 ('stop', 'VB'),
 ('germs', 'NNS'),
 ('from', 'IN'),
 ('spreading', 'VBG'),
 ('from', 'IN'),
 ('one', 'CD'),
 ('person', 'NN'),
 ('to', 'TO'),
 ('another', 'DT'),
 ('and', 'CC'),
 ('throughout', 'IN'),
 ('an', 'DT'),
 ('entire', 'JJ'),
 ('community—from', 'NN'),
 ('your', 'PRP$'),
 ('home', 'NN'),
 ('and', 'CC'),
 ('workplace', 'NN'),
 ('to', 'TO'),
 ('childcare', 'VB'),
 ('facilities', 'NNS'),
 ('and', 'CC'),
 ('hospitals', 'NNS'),
 ('.', '.'),
 ('Follow', 'IN'),
 ('these', 'DT'),
 ('five', 'CD'),
 ('steps', 'NNS'),
 ('every', 'DT'),
 ('time', 'NN'),
 

NLTK's built-in documentation for the Penn Treebank tagset

In [5]:
# Print the description and example words for the 'VB' tag.
nltk.help.upenn_tagset('VB')

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


**Using spaCy**

In [6]:
import spacy
from spacy import displacy
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is called on raw text in the cells below.
# NOTE(review): the model package must already be installed
# (typically `python -m spacy download en_core_web_sm`) -- confirm for your environment.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Same hand-washing passage that was tagged with NLTK in the section above.
text = '''Washing your hands is easy, and it’s one of the most effective ways to prevent the spread of germs. Clean hands can stop germs from spreading from one person to another and throughout an entire community—from your home and workplace to childcare facilities and hospitals.
Follow these five steps every time.
Wet your hands with clean, running water (warm or cold), turn off the tap, and apply soap.
Lather your hands by rubbing them together with the soap. Lather the backs of your hands, between your fingers, and under your nails.
Scrub your hands for at least 20 seconds. Need a timer? Hum the “Happy Birthday” song from beginning to end twice.
Rinse your hands well under clean, running water.
Dry your hands using a clean towel or air dry them.'''

# Run the loaded pipeline over the passage; `doc` is iterated token by token below.
doc = nlp(text)

Tokenization

In [8]:
# Text of every token in `doc`; as the output shows, '\n' line breaks are tokens too.
[tok.text for tok in doc]

['Washing',
 'your',
 'hands',
 'is',
 'easy',
 ',',
 'and',
 'it',
 '’s',
 'one',
 'of',
 'the',
 'most',
 'effective',
 'ways',
 'to',
 'prevent',
 'the',
 'spread',
 'of',
 'germs',
 '.',
 'Clean',
 'hands',
 'can',
 'stop',
 'germs',
 'from',
 'spreading',
 'from',
 'one',
 'person',
 'to',
 'another',
 'and',
 'throughout',
 'an',
 'entire',
 'community',
 '—',
 'from',
 'your',
 'home',
 'and',
 'workplace',
 'to',
 'childcare',
 'facilities',
 'and',
 'hospitals',
 '.',
 '\n',
 'Follow',
 'these',
 'five',
 'steps',
 'every',
 'time',
 '.',
 '\n',
 'Wet',
 'your',
 'hands',
 'with',
 'clean',
 ',',
 'running',
 'water',
 '(',
 'warm',
 'or',
 'cold',
 ')',
 ',',
 'turn',
 'off',
 'the',
 'tap',
 ',',
 'and',
 'apply',
 'soap',
 '.',
 '\n',
 'Lather',
 'your',
 'hands',
 'by',
 'rubbing',
 'them',
 'together',
 'with',
 'the',
 'soap',
 '.',
 'Lather',
 'the',
 'backs',
 'of',
 'your',
 'hands',
 ',',
 'between',
 'your',
 'fingers',
 ',',
 'and',
 'under',
 'your',
 'nails',
 '.

In [9]:
# One line per token: its text, its coarse part of speech (pos_) and its
# fine-grained tag (tag_), separated by single spaces.
for token in doc:
    print(f"{token.text} {token.pos_} {token.tag_}")

Washing VERB VBG
your DET PRP$
hands NOUN NNS
is AUX VBZ
easy ADJ JJ
, PUNCT ,
and CCONJ CC
it PRON PRP
’s VERB VBZ
one NUM CD
of ADP IN
the DET DT
most ADV RBS
effective ADJ JJ
ways NOUN NNS
to PART TO
prevent VERB VB
the DET DT
spread NOUN NN
of ADP IN
germs NOUN NNS
. PUNCT .
Clean ADJ JJ
hands NOUN NNS
can VERB MD
stop VERB VB
germs NOUN NNS
from ADP IN
spreading VERB VBG
from ADP IN
one NUM CD
person NOUN NN
to ADP IN
another DET DT
and CCONJ CC
throughout ADP IN
an DET DT
entire ADJ JJ
community NOUN NN
— PUNCT :
from ADP IN
your DET PRP$
home NOUN NN
and CCONJ CC
workplace NOUN NN
to PART TO
childcare VERB VB
facilities NOUN NNS
and CCONJ CC
hospitals NOUN NNS
. PUNCT .

 SPACE _SP
Follow VERB VB
these DET DT
five NUM CD
steps NOUN NNS
every DET DT
time NOUN NN
. PUNCT .

 SPACE _SP
Wet VERB VB
your DET PRP$
hands NOUN NNS
with ADP IN
clean ADJ JJ
, PUNCT ,
running VERB VBG
water NOUN NN
( PUNCT -LRB-
warm ADJ JJ
or CCONJ CC
cold ADJ JJ
) PUNCT -RRB-
, PUNCT ,
turn VERB VB
off A

In [10]:
# Parse the short example sentence into its own document (`doc2`) so it can be
# visualised in the next cell. This rebinds the `text` variable used earlier.
text = "Abuja is a beautiful city"
doc2 = nlp(text)

In [11]:
# Draw the dependency parse of `doc2` with displaCy (style="dep").
# jupyter=True explicitly requests inline notebook rendering.
displacy.render(doc2, style="dep", jupyter=True)