# Context-Free Grammars & Part-Of-Speech Tags

This notebook explores part-of-speech (POS) tagging and context-free grammars (CFGs) in Python using NLTK.

In [1]:
import nltk
from nltk import word_tokenize

In [2]:
# Example inputs used by the tokenization and tagging cells.
# Sentence 1 is the Groucho Marx "elephant in my pajamas" joke.
sentence = (
    "While hunting in Africa, I shot an elephant in my pajamas. "
    "How he got into my pajamas, I don't know."
)
# Sentence 2 contains an underscore-joined name and an alphanumeric token.
sentence_2 = "usain_bolt broke the 100m record"

In [4]:
# Token regex: a run of ASCII letters, optionally followed by an apostrophe
# and lowercase letters, so a contraction such as "don't" is one token.
pattern = (
    "("
    "[a-zA-Z]+"      # word body
    "(?:'[a-z]+)?"   # optional contraction suffix, e.g. 't
    ")"
)

In [5]:
# Regex tokenization: the custom pattern keeps a contraction such as "don't"
# as a single token and drops punctuation; the tokens are then POS-tagged.
groucho_marx_tokens_raw = nltk.regexp_tokenize(text=sentence, pattern=pattern)
nltk.pos_tag(tokens=groucho_marx_tokens_raw)

[('While', 'IN'),
 ('hunting', 'VBG'),
 ('in', 'IN'),
 ('Africa', 'NNP'),
 ('I', 'PRP'),
 ('shot', 'VBP'),
 ('an', 'DT'),
 ('elephant', 'NN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN'),
 ('How', 'WRB'),
 ('he', 'PRP'),
 ('got', 'VBD'),
 ('into', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN'),
 ('I', 'PRP'),
 ("don't", 'VBP'),
 ('know', 'VB')]

In [6]:
# Sentence 1 with NLTK's default word tokenizer, followed by POS tagging.
# Here punctuation is kept as tokens and "don't" is split into "do" + "n't".
groucho_marx_tokens = word_tokenize(text=sentence)
nltk.pos_tag(tokens=groucho_marx_tokens)


[('While', 'IN'),
 ('hunting', 'VBG'),
 ('in', 'IN'),
 ('Africa', 'NNP'),
 (',', ','),
 ('I', 'PRP'),
 ('shot', 'VBP'),
 ('an', 'DT'),
 ('elephant', 'NN'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN'),
 ('.', '.'),
 ('How', 'WRB'),
 ('he', 'PRP'),
 ('got', 'VBD'),
 ('into', 'IN'),
 ('my', 'PRP$'),
 ('pajamas', 'NN'),
 (',', ','),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('know', 'VB'),
 ('.', '.')]

In [7]:
# Sentence 2 with NLTK's default word tokenizer, followed by POS tagging.
# In the recorded output the underscore-joined name "usain_bolt" stays one
# token and is tagged JJ (adjective).
tokenized_sent_2 = word_tokenize(text=sentence_2)
nltk.pos_tag(tokens=tokenized_sent_2)

[('usain_bolt', 'JJ'),
 ('broke', 'VBD'),
 ('the', 'DT'),
 ('100m', 'CD'),
 ('record', 'NN')]

# Viewing Parse Tree

This is a highly manual process: the grammar must be written by hand before a sentence can be parsed.

The grammar below is taken from Section 1.2, "Ubiquitous Ambiguity", of the NLTK book: https://www.nltk.org/book/ch08.html

In [12]:
# Define a toy context-free grammar whose terminals are exactly the words of
# "I shot an elephant in my pajamas".
# A prepositional phrase (PP) can attach either to a verb phrase
# (VP -> VP PP) or inside a noun phrase (NP -> Det N PP); having both rules
# is what lets the sentence be parsed in more than one way.

groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)

In [14]:
# Parse the sentence with a chart parser built from the grammar and print
# every parse tree it yields (one tree per structural reading).
sent = "I shot an elephant in my pajamas".split()
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))
