# Basic Part of Speech Tagging

Imagine that we have a token stream and we are interested in learning more about the structure of the sentence. For example, assume that we want to identify the various parts of speech in a sentence.

In [None]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
# Tokenizer instance used to split the sample sentence into tokens below.
treebank_tokenizer = TreebankWordTokenizer()

# Sample sentence for the rest of the notebook.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is left unchanged here because the spaCy cell further down reads it.
input = "Specifically, we reviewed the AN/ASQ‑235 Airborne Mine Neutralization System (AMNS), Airborne Laser Mine Detection System (ALMDS), and Coastal Battlefield Reconnaissance and Analysis (COBRA) Block I systems."
tokens = treebank_tokenizer.tokenize(input)
print(tokens)

# Tag the tokens; later cells index each entry as [0] (the token) and [1] (its tag).
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

Is this useful? Can we do anything with the actual tags?

In [None]:
# Show every entry of pos_tags on its own line, exactly as stored.
for tagged_pair in pos_tags:
    print(tagged_pair)

In [None]:
# Each entry is a (word, tag) pair; print the two parts separated by " -- ".
for word, tag in pos_tags:
    print(f"{word} -- {tag}")

In [None]:
# Collect the words whose tag is exactly "NNP".
# NOTE(review): despite the variable name, only the "NNP" tag is matched, so
# words carrying any other noun tag are left out -- confirm this is intended.
nouns = []
for word, tag in pos_tags:
    if tag == "NNP":
        nouns.append(word)
print(nouns)

# Attempting the same with spaCy

In [None]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is called on the sample text in the next cell.
# NOTE(review): presumably this model package has to be installed separately -- confirm in the setup instructions.
nlp = spacy.load('en_core_web_sm')

In [None]:
# The crucial difference from the NLTK cells: spaCy marks up the whole document
# in one call and then exposes the results as properties on each token.
# One of those token properties is the part of speech, `pos_`.
token_stream = nlp(input)
for token in token_stream:
    print(token, " -- ", token.pos_)
