# Language Parsing

## Compiling and Matching
- .compile() turns your regex string into a regular expression object
- .match() will only find matches at the start of a string

In [1]:
#here we are compiling a regular expression and matching to a string
import re

# characters are defined
character_1 = "Dorothy"

# compile your regular expression here
regular_expression = re.compile("\w{7}")

# check for a match to character_1 here
result_1 = regular_expression.match(character_1)

# store and print the matched text here
match_1 = result_1.group(0)
print(match_1)

Dorothy


In [2]:
# Same thing can be done in less code (combining match and compile steps by using re's .match() method)
character_1 = "Dorothy"

result_1 = re.match("\w{7}", character_1)
print("match object:", result_1)
      
match_1 = result_1.group(0)
print("match:       ",match_1)

match object: <re.Match object; span=(0, 7), match='Dorothy'>
match:        Dorothy


In [3]:
# Notice how this code won't match for something shorter than seven characters:
character_2 = "Henry"

result_2 = re.match("\w{7}", character_2)
print("match object:",result_2)
      
#match_2 = result_2.group(0)
#print(match_2)

match object: None


## Searching and Finding
- .match() will only find matches at the start of a string
- .search() will look left to right through an entire piece of text and return a match object for the first match to the regular expression given
- .findall() will return a list of all non-overlapping matches of the regular expression in the string

In [4]:
# import L. Frank Baum's The Wonderful Wizard of Oz
oz_text = open("the_wizard_of_oz_text.txt",encoding='utf-8').read().lower()

# search oz_text for an occurrence of 'wizard' here
found_wizard = re.search("wizard", oz_text)
print(found_wizard.group(0))

wizard


In [5]:
# find all the occurrences of 'lion' in oz_text here
all_lions = re.findall("lion", oz_text)
print(all_lions)

# store and print the length of all_lions here
number_lions = len(all_lions)
print(number_lions)

['lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion', 'lion',

## Part of Speech Tagging

We start by sentence tokenizing the text, followed by word tokenizing each sentence

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize

# first we SENTENCE tokenize the text
sent_tokenized_oz = sent_tokenize(oz_text)

# cleaning each sentence
cleaned_sent_oz = []
for sentence in sent_tokenized_oz:
    cleaned_sent_oz.append(re.sub('\W+', ' ', sentence).lower())

# then we WORD tokenize each sentence
word_tokenized_oz = []
for sentence in cleaned_sent_oz:
    word_tokenized_oz.append(word_tokenize(sentence))

# checking sentences --> Now word_tokenized_oz is a list of sentences each of which is a list of words!
i = 95
while i < 98:
    print(f"Sentence {i}")
    print(word_tokenized_oz[i], "\n")
    i += 1

Sentence 95
['she', 'was', 'the', 'wicked', 'witch', 'of', 'the', 'east', 'as', 'i', 'said', 'answered', 'the', 'little', 'woman'] 

Sentence 96
['she', 'has', 'held', 'all', 'the', 'munchkins', 'in', 'bondage', 'for', 'many', 'years', 'making', 'them', 'slave', 'for', 'her', 'night', 'and', 'day'] 

Sentence 97
['now', 'they', 'are', 'all', 'set', 'free', 'and', 'are', 'grateful', 'to', 'you', 'for', 'the', 'favor', 'who', 'are', 'the', 'munchkins', 'inquired', 'dorothy'] 



In [7]:
# Part of Speech Tagging
import nltk
from nltk import pos_tag

pos_tagged_oz = pos_tag(word_tokenized_oz[95])
print(pos_tagged_oz)

pos_tagged_oz = []
for sentence in word_tokenized_oz:
    pos_tagged_oz.append(pos_tag(sentence))

print(pos_tagged_oz[95])

[('she', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('wicked', 'JJ'), ('witch', 'NN'), ('of', 'IN'), ('the', 'DT'), ('east', 'NN'), ('as', 'IN'), ('i', 'NN'), ('said', 'VBD'), ('answered', 'VBD'), ('the', 'DT'), ('little', 'JJ'), ('woman', 'NN')]
[('she', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('wicked', 'JJ'), ('witch', 'NN'), ('of', 'IN'), ('the', 'DT'), ('east', 'NN'), ('as', 'IN'), ('i', 'NN'), ('said', 'VBD'), ('answered', 'VBD'), ('the', 'DT'), ('little', 'JJ'), ('woman', 'NN')]


## Chunking
- grouping words by their part of speech
- With chunking in nltk, you can define a pattern of parts-of-speech tags using a modified notation of regular expressions:
  - `chunk_grammar = "AN: {<JJ><NN>}"` with `JJ` being an adjective and `NN` being a noun
- You can then find non-overlapping matches, or chunks of words, in the part-of-speech tagged sentences of a text.

In [8]:
from nltk import RegexpParser, Tree

# define adjective-noun chunk grammar here
chunk_grammar = "AN:{<JJ><NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk the pos-tagged sentence at index 95 in pos_tagged_oz here
scaredy_cat = chunk_parser.parse(pos_tagged_oz[95])
print(scaredy_cat)

# pretty_print the chunked sentence here
Tree.fromstring(str(scaredy_cat)).pretty_print()

(S
  she/PRP
  was/VBD
  the/DT
  (AN wicked/JJ witch/NN)
  of/IN
  the/DT
  east/NN
  as/IN
  i/NN
  said/VBD
  answered/VBD
  the/DT
  (AN little/JJ woman/NN))
                                                   S                                                                             
    _______________________________________________|__________________________________________________________________            
   |       |      |      |     |       |      |    |      |          |         |               AN                     AN         
   |       |      |      |     |       |      |    |      |          |         |         ______|_____           ______|_____      
she/PRP was/VBD the/DT of/IN the/DT east/NN as/IN i/NN said/VBD answered/VBD the/DT wicked/JJ     witch/NN little/JJ     woman/NN



## Noun Phrase Chunking
- NP chunking is linguistically helpful for determining meaning and bias in a piece of text
- A popular form of noun phrase begins with a determiner `DT`, which specifies the noun being referenced, followed by any number of adjectives `JJ`, which describe the noun, and ends with a noun `NN`.
  - `chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"`
- Chunks are NON-overlapping


In [9]:
from nltk import RegexpParser

# define noun-phrase chunk grammar here
chunk_grammar = "NP:{<DT>?<JJ>?<NN>}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# parsing for noun phrase
sentence_95 = chunk_parser.parse(pos_tagged_oz[95])
Tree.fromstring(str(sentence_95)).pretty_print()

                                                             S                                                               
    _________________________________________________________|_________________________________________________               
   |       |      |     |      |          |                  NP                     NP          NP             NP            
   |       |      |     |      |          |          ________|________         _____|_____      |      ________|________      
she/PRP was/VBD of/IN as/IN said/VBD answered/VBD the/DT wicked/JJ witch/NN the/DT     east/NN i/NN the/DT little/JJ woman/NN



In [10]:
# function to count most common np chunks
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences):

    # create a list to hold chunks
    chunks = list()

    # for-loop through each chunked sentence to extract noun phrase chunks
    for chunked_sentence in chunked_sentences:
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP'):
            chunks.append(tuple(subtree))

    # create a Counter object
    chunk_counter = Counter()

    # for-loop through the list of chunks
    for chunk in chunks:
        # increase counter of specific chunk by 1
        chunk_counter[chunk] += 1

    # return 30 most frequent chunks
    return chunk_counter.most_common(15)

In [11]:
# create a list to hold noun-phrase chunked sentences
np_chunked_oz = []

# create a for loop through each pos-tagged sentence in pos_tagged_oz here
for sentence in pos_tagged_oz:
  # chunk each sentence and append to np_chunked_oz here
  np_chunked_oz.append(chunk_parser.parse(sentence))

# store and print the most common np-chunks here
most_common_np_chunks = np_chunk_counter(np_chunked_oz)
for np in most_common_np_chunks:
    print(np)

((('i', 'NN'),), 293)
((('the', 'DT'), ('scarecrow', 'NN')), 212)
((('dorothy', 'NN'),), 157)
((('the', 'DT'), ('lion', 'NN')), 147)
((('the', 'DT'), ('tin', 'NN')), 122)
((('woodman', 'NN'),), 113)
((('gutenberg', 'NN'),), 74)
((('oz', 'NN'),), 69)
((('toto', 'NN'),), 67)
((('the', 'DT'), ('wicked', 'JJ'), ('witch', 'NN')), 57)
((('the', 'DT'), ('woodman', 'NN')), 57)
((('head', 'NN'),), 55)
((('the', 'DT'), ('emerald', 'JJ'), ('city', 'NN')), 50)
((('the', 'DT'), ('witch', 'NN')), 49)
((('the', 'DT'), ('girl', 'NN')), 46)


## Verb Phrase Chunking
 - A verb phrase is a phrase that contains a verb and its complements, objects, or modifiers
 - Verb phrases can take a variety of structures, and here you will consider two:
    1. The first structure begins with a verb `VB` of any tense, followed by a noun phrase, and ends with an optional adverb `RB` of any form.
        * `chunk_grammar = "VP: {<VB.*><DT>?<JJ>*<NN><RB.?>?}"`
        * This would match `(('said', 'VBD'), ('the', 'DT'), ('cowardly', 'JJ'), ('lion', 'NN'))`
   2. The second structure switches the order of the verb and the noun phrase, but also ends with an optional adverb.
        * `chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"`
        * This would match `(('the', 'DT'), ('cowardly', 'JJ'), ('lion', 'NN'), ('said', 'VBD'))`

In [12]:
# first defining function for counting verb phrases
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def vp_chunk_counter(chunked_sentences):

    # create a list to hold chunks
    chunks = list()

    # for-loop through each chunked sentence to extract verb phrase chunks
    for chunked_sentence in chunked_sentences:
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'VP'):
            chunks.append(tuple(subtree))
            
    # create a Counter object
    chunk_counter = Counter()

    # for-loop through the list of chunks
    for chunk in chunks:
        # increase counter of specific chunk by 1
        chunk_counter[chunk] += 1

    # return 30 most frequent chunks
    return chunk_counter.most_common(15)

In [13]:
from nltk import RegexpParser

###################
# FIRST STRUCTURE #
###################

# define verb phrase chunk grammar here
chunk_grammar = "VP:{<VB.?><DT>?<JJ>*<NN><RB.?>?}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# create a list to hold verb-phrase chunked sentences
vp_chunked_oz = list()

# create for loop through each pos-tagged sentence in pos_tagged_oz here
for sentence in pos_tagged_oz:
  # chunk each sentence and append to vp_chunked_oz here
  vp_chunked_oz.append(chunk_parser.parse(sentence))

# store and print the most common vp-chunks here
most_common_vp_chunks = vp_chunk_counter(vp_chunked_oz)
for chunk in most_common_vp_chunks:
    print(chunk)

((('said', 'VBD'), ('the', 'DT'), ('scarecrow', 'NN')), 33)
((('said', 'VBD'), ('the', 'DT'), ('tin', 'NN')), 19)
((('said', 'VBD'), ('the', 'DT'), ('lion', 'NN')), 15)
((('said', 'VBD'), ('dorothy', 'NN')), 11)
((('said', 'VBD'), ('the', 'DT'), ('girl', 'NN')), 10)
((('asked', 'VBD'), ('the', 'DT'), ('scarecrow', 'NN')), 10)
((('said', 'VBD'), ('the', 'DT'), ('cowardly', 'JJ'), ('lion', 'NN')), 8)
((('said', 'VBD'), ('oz', 'NN')), 8)
((('pass', 'VB'), ('the', 'DT'), ('night', 'NN')), 6)
((('asked', 'VBD'), ('the', 'DT'), ('girl', 'NN')), 6)
((('don', 'VBP'), ('t', 'NN')), 6)
((('set', 'VBN'), ('forth', 'NN')), 6)
((('asked', 'VBN'), ('dorothy', 'NN')), 5)
((('answered', 'VBD'), ('the', 'DT'), ('scarecrow', 'NN')), 5)
((('thought', 'VBD'), ('i', 'NN')), 5)


In [14]:
###################
# SECOND STRUCTURE #
###################

# define verb phrase chunk grammar here
chunk_grammar = "VP:{<DT>?<JJ>*<NN><VB.*><RB.?>?}"

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# create a list to hold verb-phrase chunked sentences
vp_chunked_oz = list()

# create for loop through each pos-tagged sentence in pos_tagged_oz here
for sentence in pos_tagged_oz:
  # chunk each sentence and append to vp_chunked_oz here
  vp_chunked_oz.append(chunk_parser.parse(sentence))

most_common_vp_chunks = vp_chunk_counter(vp_chunked_oz)
for chunk in most_common_vp_chunks:
    print(chunk)

((('i', 'NN'), ('am', 'VBP')), 31)
((('i', 'NN'), ('was', 'VBD')), 17)
((('dorothy', 'NN'), ('was', 'VBD')), 14)
((('i', 'NN'), ('had', 'VBD')), 8)
((('project', 'NN'), ('gutenberg', 'VBZ')), 8)
((('i', 'NN'), ('know', 'VBP')), 7)
((('dorothy', 'NN'), ('had', 'VBD')), 7)
((('oz', 'NN'), ('was', 'VBD')), 6)
((('i', 'NN'), ('want', 'VBP')), 6)
((('oz', 'NN'), ('had', 'VBD')), 6)
((('toto', 'NN'), ('did', 'VBD'), ('not', 'RB')), 5)
((('dorothy', 'NN'), ('looked', 'VBD')), 5)
((('i', 'NN'), ('have', 'VBP'), ('never', 'RB')), 5)
((('the', 'DT'), ('wicked', 'JJ'), ('witch', 'NN'), ('had', 'VBD')), 5)
((('i', 'NN'), ('have', 'VBP')), 5)


## Chunk Filtering
* You can alternateively define Noun Phrases (or whatever kind of phrase you want) by filtering out verbs and prepositions to leave only Noun Phrase contents
* `chunk_grammar = "NP: {<.*>+}}<VB.?|IN>+{"`
    * `{<.*>+}` matches every part of speech
    * `}<VB.?|IN>+{` excludes verbs or prepositions

In [15]:
from nltk import RegexpParser, Tree

# See how it's all one big chunk

# define chunk grammar to chunk an entire sentence together
grammar = "Chunk: {<.*>+}"

# create RegexpParser object
parser = RegexpParser(grammar)

# chunk the pos-tagged sentence at index 230 in pos_tagged_oz
my_chunks = parser.parse(pos_tagged_oz[95])
print(my_chunks)

(S
  (Chunk
    she/PRP
    was/VBD
    the/DT
    wicked/JJ
    witch/NN
    of/IN
    the/DT
    east/NN
    as/IN
    i/NN
    said/VBD
    answered/VBD
    the/DT
    little/JJ
    woman/NN))


In [16]:
# define noun phrase chunk grammar using chunk filtering here
chunk_grammar = """NP: {<.*>+}
                       }<VB.?|IN>+{"""

# create RegexpParser object here
chunk_parser = RegexpParser(chunk_grammar)

# chunk and filter the pos-tagged sentence at index 95 in pos_tagged_oz here
filtered_dancers = chunk_parser.parse(pos_tagged_oz[95])
print(filtered_dancers)


# pretty_print the chunked and filtered sentence here
Tree.fromstring(str(filtered_dancers)).pretty_print()

(S
  (NP she/PRP)
  was/VBD
  (NP the/DT wicked/JJ witch/NN)
  of/IN
  (NP the/DT east/NN)
  as/IN
  (NP i/NN)
  said/VBD
  answered/VBD
  (NP the/DT little/JJ woman/NN))
                                                             S                                                               
    _________________________________________________________|_________________________________________________               
   |      |     |      |          |          NP              NP                     NP          NP             NP            
   |      |     |      |          |          |       ________|________         _____|_____      |      ________|________      
was/VBD of/IN as/IN said/VBD answered/VBD she/PRP the/DT wicked/JJ witch/NN the/DT     east/NN i/NN the/DT little/JJ woman/NN

