# Perform part-of-speech (POS) tagging on text

In [2]:
from nltk.tokenize import word_tokenize
# Sample sentence that every later cell tokenizes, tags and renders.
text = (
    "This is one simple example for natural language "
    "programming using POS tagging"
)

In [4]:
import spacy
from spacy import displacy
# Load the small English pipeline and run it over the sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# First rendering: dependency view ('dep' is displaCy's default style,
# spelled out here for clarity).
displacy.render(doc, style='dep')

# Second rendering: entity view of the same parsed document.
displacy.render(doc, style='ent')

In [8]:
import nltk
# Download the NLTK data packages used by the tagging cell below.
# NOTE(review): presumably these are the resources `pos_tag(..., tagset='universal')`
# needs; the required package names may differ between NLTK versions — confirm.
nltk.download('universal_tagset')
# Kept as the last statement so the notebook echoes the download's return value.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/anant/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anant/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag

# Split the sentence into Treebank-style tokens, then tag each token using
# the 'universal' tagset. `twt` is kept around because the span cell reuses it.
twt = TreebankWordTokenizer()
word_tokens = twt.tokenize(text)
tags = pos_tag(word_tokens, tagset='universal')
tags

[('This', 'DET'),
 ('is', 'VERB'),
 ('one', 'NUM'),
 ('simple', 'ADJ'),
 ('example', 'NOUN'),
 ('for', 'ADP'),
 ('natural', 'ADJ'),
 ('language', 'NOUN'),
 ('programming', 'VERB'),
 ('using', 'VERB'),
 ('POS', 'NOUN'),
 ('tagging', 'VERB')]

In [10]:
# Materialize the (start, end) character offsets of every token in `text`.
spans = list(twt.span_tokenize(text))
spans

[(0, 4),
 (5, 7),
 (8, 11),
 (12, 18),
 (19, 26),
 (27, 30),
 (31, 38),
 (39, 47),
 (48, 59),
 (60, 65),
 (66, 69),
 (70, 77)]

In [11]:
import random

# Build the two inputs for displaCy's manual 'ent' rendering:
#   colors - maps each POS label to a highlight colour
#   ents   - one {'start', 'end', 'label'} dict per token

# Highlight colours to draw from.
palette = ('salmon', 'lime', 'khaki', 'blue', 'orange', 'red', 'green')

# Unique POS labels, sorted so the order is the same on every run
# (a bare set of strings has no stable iteration order across runs).
pos = sorted({label for _, label in tags})

# Shuffle the palette once and hand colours out in turn. Picking a colour
# independently per label could give two different labels the same colour;
# this way colours only repeat when there are more labels than colours.
shuffled = random.sample(palette, k=len(palette))
colors = {label: shuffled[idx % len(shuffled)] for idx, label in enumerate(pos)}

# Pair each tagged token with its character span: the offsets come from
# `spans`, the label from `tags` (both are in token order).
ents = [
    {'start': start, 'end': end, 'label': label}
    for (_, label), (start, end) in zip(tags, spans)
]
ents

[{'start': 0, 'end': 4, 'label': 'DET'},
 {'start': 5, 'end': 7, 'label': 'VERB'},
 {'start': 8, 'end': 11, 'label': 'NUM'},
 {'start': 12, 'end': 18, 'label': 'ADJ'},
 {'start': 19, 'end': 26, 'label': 'NOUN'},
 {'start': 27, 'end': 30, 'label': 'ADP'},
 {'start': 31, 'end': 38, 'label': 'ADJ'},
 {'start': 39, 'end': 47, 'label': 'NOUN'},
 {'start': 48, 'end': 59, 'label': 'VERB'},
 {'start': 60, 'end': 65, 'label': 'VERB'},
 {'start': 66, 'end': 69, 'label': 'NOUN'},
 {'start': 70, 'end': 77, 'label': 'VERB'}]

In [16]:
# Hand-built input for displaCy's manual mode: the raw sentence plus the
# per-token spans. Note this rebinds `doc`, which earlier held the spaCy Doc.
doc = {
    'text': text,
    'ents': ents,
}

# Which labels to show and which colour each one gets.
options = {
    'ents': pos,
    'colors': colors,
}

displacy.render(docs=doc, style='ent', manual=True, options=options)