# AdvNLP Lab B: using PoS taggers

## Session goal

The goal of this session is to help you familiarize yourself with PoS tagging. We'll be using NLTK, Stanza, and spaCy.



In [None]:
# Install Stanza into the notebook environment (shell command).
! pip install stanza
import stanza


from nltk.tag import PerceptronTagger
from nltk.tokenize import word_tokenize

# Fetch the NLTK packages named below ('averaged_perceptron_tagger' and 'punkt').
from nltk import download
download('averaged_perceptron_tagger')
download('punkt')

import spacy
# Download the 'en_core_web_sm' spaCy model that is loaded just below.
!python -m spacy download en_core_web_sm

# Sample input; NOTE(review): not referenced by any other cell visible in this notebook.
text='I really like this class. This lab is going to be fun.'
# Shared spaCy pipeline, used by run_spacy.
spacy_analyzer = spacy.load('en_core_web_sm')

# Download Stanza's English resources and build the shared pipeline used by run_stanza.
stanza.download('en')
stanza_pipeline = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma')


def run_stanza(text):
    """Tag ``text`` with the shared Stanza pipeline.

    Returns a flat list of ``(word text, xpos tag)`` tuples covering every
    word of every sentence in the processed document, in document order.
    """
    processed = stanza_pipeline(text)
    return [
        (item.text, item.xpos)
        for sentence in processed.sentences
        for item in sentence.words
    ]

def run_spacy(text):
    """Tag ``text`` with the shared spaCy pipeline.

    Returns a list of ``(token text, tag)`` tuples, one per spaCy token, where
    the tag is read from ``token.tag_``.

    The token is returned as a plain string (``token.text``) rather than as the
    spaCy ``Token`` object, so the result has the same shape as the lists
    returned by ``run_stanza`` and ``run_nltk``.
    """
    doc = spacy_analyzer(text)
    return [(token.text, token.tag_) for token in doc]

def run_nltk(text):
    """Tokenize ``text`` with ``word_tokenize`` and tag it with NLTK's PerceptronTagger.

    Returns whatever ``PerceptronTagger.tag`` returns for the token list
    (used by the rest of the notebook as a list of ``(token, tag)`` pairs).

    The tagger is created on first use and stored on the function object, so
    repeated calls reuse it instead of constructing a new tagger every time.
    If construction fails (e.g. a missing NLTK resource), nothing is cached and
    the next call tries again.
    """
    tagger = getattr(run_nltk, "_tagger", None)
    if tagger is None:
        tagger = PerceptronTagger()
        run_nltk._tagger = tagger
    return tagger.tag(word_tokenize(text))




Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

In [None]:
def visualize_pos_results (text):
    """Run all three taggers on ``text`` and print their tags for comparison.

    When Stanza, spaCy and NLTK produce the same number of (token, tag) pairs,
    a table is printed with one row per token (token text taken from the
    Stanza output) and one tag column per tagger. Otherwise the three raw pair
    lists are printed one after the other.

    Nothing is returned; all results go to standard output.
    """
    from_stanza = run_stanza(text)
    from_spacy = run_spacy(text)
    from_nltk = run_nltk(text)

    # Differing lengths: the outputs cannot be lined up row by row, so dump them raw.
    if not (len(from_stanza) == len(from_spacy) == len(from_nltk)):
        for label, pairs in (('Stanza', from_stanza),
                             ('NLTK', from_nltk),
                             ('Spacy', from_spacy)):
            print ('-'*30)
            print (label)
            print (pairs)
        return

    import pandas as pd

    table = pd.DataFrame(columns = ['tokens','Stanza', 'NLTK', 'Spacy'])
    table['tokens'] = [pair[0] for pair in from_stanza]
    table['Stanza'] = [pair[1] for pair in from_stanza]
    table['NLTK'] = [pair[1] for pair in from_nltk]
    table['Spacy'] = [pair[1] for pair in from_spacy]

    print (table)



In [None]:
from nltk.data import load
from nltk import download
# Additional NLTK packages; presumably required by word_tokenize / PerceptronTagger
# on recent NLTK releases — TODO confirm against the installed version.
download('punkt_tab')
download('tagsets')
download('averaged_perceptron_tagger_eng')
# NOTE(review): this loads a pickle file; only do so with resources from a trusted source.
tagdict = load('help/tagsets/upenn_tagset.pickle')
# Show the first element of the entry for the 'IN' tag (presumably its description).
tagdict['IN'][0]

In [None]:
# Two-sentence example; the names "Shire", "Baggins" and "Hobbiton" appear in it.
sentence = "Traffic congestion in the Shire is getting worse. After we landed at Baggins international airport, we got stuck on the ring road around Hobbiton."
visualize_pos_results(sentence)


In [None]:
# "back" is used as a verb in both sentences: as an imperative here ...
sentence_1='Back me up.'
visualize_pos_results(sentence_1)

# ... and as an infinitive after "to" here.
sentence_2='I asked them to back me up.'
visualize_pos_results(sentence_2)

**When** can have multiple PoS tags.

In [None]:
# Two uses of "when"; the trailing comments give the intended word class.
sentences=['When did you last go to Bern?',   # interrogative adverb
'Raise your hand when you\'re finished']  # conjunction

# visualize_pos_results prints its output and has no return statement,
# so dflist is None after each call.
for sentence in sentences:
    dflist = visualize_pos_results(sentence)

In [None]:
# Show the first element of the tagset entry for 'WRB'.
tagdict['WRB'][0]

In [None]:
# Show the first element of the tagset entry for 'PRP$'.
tagdict['PRP$'][0]

What's happening in the following example? Which PoS tagger does better?

In [None]:
# Sentences that repeat "man", "so", "much", "many" and "few" in different positions.
# The backslash at the end of the third string's first line continues the
# literal, so it is a single sentence with no embedded newline.
sentences=['An experienced man should always man the ship',
'Never has so much been owed to so many by so few',
           'A nation will not survive morally or economically \
when so few have so much and so many have so little']

# visualize_pos_results prints its output and has no return statement,
# so dflist is None after each call.
for sentence in sentences:
    dflist = visualize_pos_results(sentence)