<a href="https://colab.research.google.com/github/babupallam/Msc_AI_Module2_Natural_Language_Processing/blob/main/L05-Analyzing%20Sentence%20Structure/Note_5_Research_Observations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. **Parsing Sentences with Context-Free Grammar (CFG)**
  - Write a CFG that can parse simple sentences involving nouns, verbs, and prepositions. Parse the sentence "The cat sat on the mat" and print the resulting parse tree.


In [3]:
# Import necessary modules from the NLTK library.
import nltk
from nltk import CFG, ChartParser

# Define the context-free grammar (CFG) for a simple English-like language.
# The grammar specifies how sentences (S), noun phrases (NP), verb phrases (VP), and prepositional phrases (PP) are constructed.

grammar = CFG.fromstring("""
  S -> NP VP
  NP -> Det N | Det N PP
  VP -> V NP | V PP
  PP -> P NP
  Det -> 'The' | 'a' | 'the'
  N -> 'cat' | 'mat'
  V -> 'sat'
  P -> 'on'
""")

# Tokenize the sentence by splitting it into individual words, ensuring proper capitalization.
sentence = "The cat sat on the mat".split()

# Create a ChartParser instance using the defined CFG.
parser = ChartParser(grammar)

# Parse the sentence using the chart parser and print the resulting parse tree(s).
for tree in parser.parse(sentence):
    print(tree)  # Print each possible parse tree.
    tree.pretty_print()  # Pretty-print each parse tree.


(S
  (NP (Det The) (N cat))
  (VP (V sat) (PP (P on) (NP (Det the) (N mat)))))
             S                     
      _______|_______               
     |               VP            
     |        _______|___           
     |       |           PP        
     |       |    _______|___       
     NP      |   |           NP    
  ___|___    |   |        ___|___   
Det      N   V   P      Det      N 
 |       |   |   |       |       |  
The     cat sat  on     the     mat




# 2. **Ambiguity in Parsing**
  - Write a CFG that can parse the ambiguous sentence "I saw the man with the telescope".
  - Use NLTK to generate all possible parse trees and discuss the ambiguity.



In [4]:

grammar_ambiguous = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    PP -> P NP
    Det -> 'the'
    N -> 'man' | 'telescope'
    V -> 'saw'
    P -> 'with'
""")

# The ambiguous sentence is tokenized (split into words) and stored as a list of tokens.
ambiguous_sentence = "I saw the man with the telescope".split()

# Assuming 'grammar_ambiguous' is a CFG that is defined elsewhere, we use it to create a ChartParser.
parser_ambiguous = ChartParser(grammar_ambiguous)

# Parse the ambiguous sentence using the ChartParser and print the resulting parse tree(s).
for tree in parser_ambiguous.parse(ambiguous_sentence):
    print(tree)  # Each possible parse tree is printed, representing a different interpretation.
    tree.pretty_print()  # Pretty-print each parse tree for better readability.



(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det the) (N telescope)))))
     S                                    
  ___|___________                          
 |               VP                       
 |        _______|________                 
 |       VP               PP              
 |    ___|___         ____|___             
 |   |       NP      |        NP          
 |   |    ___|___    |     ___|______      
 NP  V  Det      N   P   Det         N    
 |   |   |       |   |    |          |     
 I  saw the     man with the     telescope

(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det the) (N telescope))))))
     S                                
  ___|_______                          
 |           VP                       
 |    _______|___                      
 |   |           NP                   
 |   |    _______|____                 
 |   |   |   |        PP              
 |   |   |   |    ____|___          

# 3. **Dependency Parsing**
  - Define a dependency grammar for the sentence "The dog chased the cat" and use NLTK's ProjectiveDependencyParser to parse the sentence.


In [5]:

dependency_grammar = nltk.DependencyGrammar.fromstring("""
  'chased' -> 'dog' | 'cat'
  'dog' -> 'The'
  'cat' -> 'the'
""")

# Import necessary modules from NLTK (we assume ProjectiveDependencyParser and dependency_grammar are already imported).
dep_parser = nltk.ProjectiveDependencyParser(dependency_grammar)

# The sentence to be parsed, split into individual words (tokens).
sentence_dep = "The dog chased the cat".split()

# Parse the sentence using the dependency parser.
for tree in dep_parser.parse(sentence_dep):
    print(tree)               # Print the tree in textual form.
    tree.pretty_print()        # Pretty-print the tree in a visual format (ascii art).


(chased (dog The) (cat the))
    chased    
  ____|_____   
dog        cat
 |          |  
The        the



# 4. **CFG Development and Parsing Longer Sentences**
  - Create a CFG that includes recursive rules for constructing longer sentences.
  - Parse the sentence "The small cat chased the big dog with a collar".


In [10]:

grammar_recursive = CFG.fromstring("""
  S -> NP VP
  NP -> Det Adj N | Det N PP
  VP -> V NP | VP PP
  PP -> P NP
  Det -> 'The' | 'a'|'the'
  Adj -> 'small' | 'big'
  N -> 'cat' | 'dog' | 'collar'
  V -> 'chased'
  P -> 'with'
""")

# Tokenize the longer sentence "The small cat chased the big dog with a collar" into individual words.
long_sentence = "The small cat chased the big dog with a collar".split()

# Assuming 'grammar_recursive' is a CFG that allows recursive structures (like nested noun phrases or prepositional phrases),
# create a ChartParser instance using this recursive grammar.
parser_recursive = ChartParser(grammar_recursive)

# Parse the sentence using the recursive chart parser.
# The parser will explore all possible parse trees that conform to the recursive grammar for the given sentence.
for tree in parser_recursive.parse(long_sentence):
    # Print each parsed tree in a structured textual format.
    print(tree)

    # Display the parse tree in an ASCII-style tree structure for better visual understanding of the parse.
    tree.pretty_print()



# 5. **Evaluating Grammar on Treebank Data**
  - Load the Penn Treebank sample from NLTK.
  - Extract a few sentences and compare how well a custom CFG matches the syntactic structure of the Penn Treebank.


In [12]:

from nltk.corpus import treebank
nltk.download('treebank')

treebank_sentence = treebank.parsed_sents('wsj_0001.mrg')[0]
print(treebank_sentence)



[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


# 6. **Exploring Ambiguity with Probabilistic Parsing**
  - Create a probabilistic CFG (PCFG) that assigns different probabilities to ambiguous constructions.
  - Parse the sentence "I saw the man with the telescope" and observe which parse tree has the highest probability.


In [16]:
grammar_pcfg = nltk.PCFG.fromstring("""
  S -> NP VP [1.0]
  NP -> Det N [0.4] | Det N PP [0.4] | 'I' [0.2]
  VP -> V NP [0.7] | VP PP [0.3]
  PP -> P NP [1.0]
  Det -> 'the' [1.0]
  N -> 'man' [0.5] | 'telescope' [0.5]
  V -> 'saw' [1.0]
  P -> 'with' [1.0]
""")

# Initialize a Viterbi parser using a probabilistic context-free grammar (PCFG).
viterbi_parser = nltk.ViterbiParser(grammar_pcfg)

# Parse the ambiguous sentence using the Viterbi parser.
# The Viterbi algorithm will determine the most probable parse tree for the sentence based on the grammar's probabilities.
for tree in viterbi_parser.parse(ambiguous_sentence):
    # Print the most probable parse tree (based on production rule probabilities).
    print(tree)
    tree.pretty_print()  # Pretty-print the parse tree for better readability.

(S
  (NP I)
  (VP
    (V saw)
    (NP
      (Det the)
      (N man)
      (PP (P with) (NP (Det the) (N telescope)))))) (p=0.0056)
     S                                
  ___|_______                          
 |           VP                       
 |    _______|___                      
 |   |           NP                   
 |   |    _______|____                 
 |   |   |   |        PP              
 |   |   |   |    ____|___             
 |   |   |   |   |        NP          
 |   |   |   |   |     ___|______      
 NP  V  Det  N   P   Det         N    
 |   |   |   |   |    |          |     
 I  saw the man with the     telescope



# 7. **Recursive Parsing of Nested Sentences**
  - Write a CFG that can handle recursive sentences such as "The cat that the dog chased sat on the mat".
  - Parse the sentence and display the resulting parse tree.


In [20]:

grammar_nested = CFG.fromstring("""
  S -> NP VP
  NP -> Det N | Det N RelClause
  VP -> V NP | V PP
  PP -> P NP
  RelClause -> RelPron VP
  Det -> 'The' | 'the'
  N -> 'cat' | 'dog' | 'mat'
  V -> 'chased' | 'sat'
  P -> 'on'
  RelPron -> 'that'
""")

# Tokenize the nested sentence "The cat that the dog chased sat on the mat" into individual words (tokens).
nested_sentence = "The cat that the dog chased sat on the mat".split()

# Assuming 'grammar_nested' is a CFG that supports nested clauses (e.g., relative clauses like "that the dog chased"),
# create a ChartParser instance using this nested grammar.
parser_nested = ChartParser(grammar_nested)

# Parse the sentence using the chart parser designed for nested structures.
# The parser will generate all valid parse trees that conform to the grammar for this complex sentence.
for tree in parser_nested.parse(nested_sentence):
    # Print the parsed tree in a structured textual format.
    print(tree)

    # Display the parse tree in an ASCII-style tree structure for clearer visualization of the nested syntax.
    tree.pretty_print()


# 8. **Parsing Coordinated Phrases**
  - Create a CFG to parse coordinated noun phrases such as "The cat and the dog sat on the mat".
  - Generate the parse tree and discuss the structure of coordination.


In [22]:

grammar_coordination = CFG.fromstring("""
  S -> NP VP
  NP -> NP Conj NP | Det N
  VP -> V PP | V
  PP -> P NP
  Det -> 'The' | 'the'
  N -> 'cat' | 'dog' | 'mat'
  V -> 'sat'
  P -> 'on'
  Conj -> 'and'
""")

coordinated_sentence = "The cat and the dog sat on the mat".split()
parser_coordination = ChartParser(grammar_coordination)
for tree in parser_coordination.parse(coordinated_sentence):
  print(tree)
  tree.pretty_print()


(S
  (NP (NP (Det The) (N cat)) (Conj and) (NP (Det the) (N dog)))
  (VP (V sat) (PP (P on) (NP (Det the) (N mat)))))
                          S                         
              ____________|___________               
             |                        VP            
             |                 _______|___           
             NP               |           PP        
      _______|________        |    _______|___       
     NP      |        NP      |   |           NP    
  ___|___    |     ___|___    |   |        ___|___   
Det      N  Conj Det      N   V   P      Det      N 
 |       |   |    |       |   |   |       |       |  
The     cat and  the     dog sat  on     the     mat



# 9. **Analyzing Prepositional Phrase Attachment**
  - Write a CFG that can handle prepositional phrase attachment ambiguity, such as "The boy saw the girl with a telescope".
  - Generate all possible parse trees and analyze the different attachments.


In [25]:
grammar_pp_attachment = CFG.fromstring("""
  S -> NP VP
  NP -> Det N | Det N PP | 'The' N
  VP -> V NP | VP PP
  PP -> P NP
  Det -> 'the' | 'a'
  N -> 'boy' | 'girl' | 'telescope'
  V -> 'saw'
  P -> 'with'
""")

#The '#' symbol was part of a comment. CFG.fromstring() does not recognize comments.
#Remove the comment or place it on a separate line that starts with '#'


pp_attachment_sentence = "The boy saw the girl with a telescope".split()
parser_pp_attachment = ChartParser(grammar_pp_attachment)
for tree in parser_pp_attachment.parse(pp_attachment_sentence):
  print(tree)
  tree.pretty_print()

(S
  (NP The (N boy))
  (VP
    (VP (V saw) (NP (Det the) (N girl)))
    (PP (P with) (NP (Det a) (N telescope)))))
                 S                                 
      ___________|_______                           
     |                   VP                        
     |            _______|_________                 
     |           VP                PP              
     |        ___|___          ____|___             
     NP      |       NP       |        NP          
  ___|___    |    ___|___     |     ___|______      
 |       N   V  Det      N    P   Det         N    
 |       |   |   |       |    |    |          |     
The     boy saw the     girl with  a      telescope

(S
  (NP The (N boy))
  (VP
    (V saw)
    (NP (Det the) (N girl) (PP (P with) (NP (Det a) (N telescope))))))
                 S                             
      ___________|___                           
     |               VP                        
     |        _______|____                      
 

# 10. **Building a Simple Chunk Parser**
  - Write a simple chunk grammar to identify noun phrases (NP) in a sentence.
  - Use the RegexpParser in NLTK to extract all noun phrases from the sentence "The quick brown fox jumps over the lazy dog".


In [30]:

from nltk import RegexpParser, pos_tag, word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [38]:
import nltk
from nltk import RegexpParser, pos_tag, word_tokenize

# Define a simple chunk grammar for noun phrases (NP). The pattern specifies:
# - An optional determiner (<DT>),
# - Followed by zero or more adjectives (<JJ>*),
# - And a mandatory noun (<NN>).
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# Create a chunk parser using the regular expression-based chunk grammar.
chunk_parser = RegexpParser(chunk_grammar)

# Define a sentence for chunking.
chunk_sentence = "The quick brown fox jumps over the lazy dog"

# Tokenize the sentence into words and assign part-of-speech (POS) tags.
# The pos_tag function tags each word with its part-of-speech (like <DT>, <JJ>, <NN>, etc.).
chunk_pos_tags = pos_tag(word_tokenize(chunk_sentence))

# Parse the POS-tagged sentence to identify chunks (based on the chunk grammar).
chunk_tree = chunk_parser.parse(chunk_pos_tags)

# Print the resulting chunk tree structure, which identifies the noun phrases (NP).
print(chunk_tree)

# Use pretty_print() to display the chunk tree in a structured format.
chunk_tree.pretty_print()


(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN))
                                     S                                 
     ________________________________|______________________            
    |        |              NP               NP             NP         
    |        |       _______|________        |       _______|______     
jumps/VBZ over/IN The/DT quick/JJ brown/NN fox/NN the/DT lazy/JJ dog/NN



# 11. **Creating a Left-Corner Parser**
  - Implement a simple left-corner parser using the provided CFG.
  - Parse the sentence "The cat chased the dog" using your left-corner parser.

(Note: Implementing a left-corner parser can be challenging. This exercise encourages exploration of advanced parsing techniques in NLTK.)

Placeholder for left-corner parser implementation and usage.



# 12. **Probabilistic Parsing with Hidden Markov Models (HMM)**
  - Train an HMM using a tagged corpus from NLTK (such as Brown).
  - Use the trained HMM to parse a sentence and analyze the tags.


In [43]:

from nltk.corpus import brown
from nltk.tag import hmm

nltk.download('brown')
nltk.download('punkt')



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [44]:

tags = brown.tagged_sents(categories='news')[:500]  # Training data (first 500 tagged sentences)

# Train the HMM Tagger using the HiddenMarkovModelTrainer.
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train(tags)

# Print out information about the trained tagger.
print("Trained HMM Tagger:")
print(hmm_tagger)

# Example sentence for tagging.
test_sentence = "The quick brown fox jumps over the lazy dog"

# Tokenize the sentence.
test_tokens = word_tokenize(test_sentence)

# Use the HMM tagger to tag the tokens.
tagged_sentence = hmm_tagger.tag(test_tokens)
print("\nTagged Sentence:")
print(tagged_sentence)

# Evaluate the HMM tagger on a sample from the Brown corpus.
# Use a different portion of the corpus (e.g., sentences 501-600).
test_data = brown.tagged_sents(categories='news')[500:600]

# Calculate accuracy of the tagger on the test dataset.
accuracy = hmm_tagger.evaluate(test_data)
print(f"\nAccuracy on Brown corpus (sentences 501-600): {accuracy:.2f}")

# Tagging multiple sentences for demonstration.
additional_sentences = [
    "She enjoys playing the piano.",
    "The weather today is exceptionally sunny and warm.",
    "Artificial Intelligence is transforming various industries.",
    "The new product launched yesterday."
]

print("\nAdditional Tagged Sentences:")
for sentence in additional_sentences:
    tokens = word_tokenize(sentence)
    tagged = hmm_tagger.tag(tokens)
    print(tagged)


Trained HMM Tagger:
<HiddenMarkovModelTagger 122 states and 2946 output symbols>


  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  accuracy = hmm_tagger.evaluate(test_data)



Tagged Sentence:
[('The', 'AT'), ('quick', 'AT'), ('brown', 'AT'), ('fox', 'AT'), ('jumps', 'AT'), ('over', 'AT'), ('the', 'AT'), ('lazy', 'AT'), ('dog', 'AT')]

Accuracy on Brown corpus (sentences 501-600): 0.17

Additional Tagged Sentences:
[('She', 'AT'), ('enjoys', 'AT'), ('playing', 'AT'), ('the', 'AT'), ('piano', 'AT'), ('.', 'AT')]
[('The', 'AT'), ('weather', 'AT'), ('today', 'AT'), ('is', 'AT'), ('exceptionally', 'AT'), ('sunny', 'AT'), ('and', 'AT'), ('warm', 'AT'), ('.', 'AT')]
[('Artificial', 'AT'), ('Intelligence', 'AT'), ('is', 'AT'), ('transforming', 'AT'), ('various', 'AT'), ('industries', 'AT'), ('.', 'AT')]
[('The', 'AT'), ('new', 'JJ'), ('product', 'NN'), ('launched', 'VBD'), ('yesterday', 'NR'), ('.', '.')]
