In [None]:
import nltk
import sys

# Terminal rules: each left-hand symbol expands directly to quoted words.
# NOTE(review): A, N and V presumably stand for adjective, noun and verb.
TERMINALS = """
A -> "small" | "white"
N -> "cats" | "trees"
V -> "climb" | "run"
"""

# Non-terminal rules: a sentence (S) is a noun phrase followed by a single V,
# and a noun phrase is an N optionally preceded by any number of A symbols
# (NP is right-recursive).  There is no rule for anything after the V, so an
# input such as "cats climb trees" has no parse under this grammar.
NONTERMINALS = """
S -> NP V
NP -> N | A NP
"""

# Build the grammar from both rule sets and create the chart parser that
# main() uses for every sentence.
grammar = nltk.CFG.fromstring(NONTERMINALS + TERMINALS)
parser = nltk.ChartParser(grammar)


def main():
    """
    Parse sentences and print their parse trees and noun phrase chunks.

    If exactly one command-line argument is given, it is treated as the
    path of a file containing the sentence; that sentence is processed
    once and the function returns.  Otherwise sentences are read
    interactively from standard input until the user enters "q".

    A sentence containing a word the grammar does not cover makes the
    parser raise ValueError; the message is printed and the function
    returns.  A sentence that is covered but has no parse only prints a
    notice, and interactive mode then asks for the next sentence.
    """
    # The input mode is fixed for the whole run, so decide it once.
    read_from_file = len(sys.argv) == 2

    while True:
        # If filename specified, read sentence from file
        if read_from_file:
            with open(sys.argv[1]) as f:
                s = f.read()

        # Otherwise, get sentence as input
        else:
            s = input("Sentence: ")

        # "q" ends the session.  Return rather than call quit(): quit() is
        # a helper injected by the site module for interactive shells and
        # is not guaranteed to exist in every interpreter configuration.
        if s == "q":
            return

        # Convert input into list of words
        s = preprocess(s)

        # Attempt to parse sentence
        try:
            trees = list(parser.parse(s))
        except ValueError as e:
            print(e)
            return

        if not trees:
            print("Could not parse sentence.")
        else:
            # Print each tree with noun phrase chunks
            for tree in trees:
                tree.pretty_print()

                print("Noun Phrase Chunks")
                for np in np_chunk(tree):
                    print(" ".join(np.flatten()))

        # A file holds a single sentence: stop after handling it instead of
        # re-reading and re-printing the same file forever.
        if read_from_file:
            return


def preprocess(sentence):
    """
    Convert `sentence` to a list of its words.

    The sentence is lowercased and split with nltk's word tokenizer.  Any
    token that does not contain at least one alphabetic character (for
    example "." or "28") is left out of the returned list; the remaining
    tokens keep their original order.
    """
    tokens = nltk.tokenize.word_tokenize(sentence.lower())

    # str.isalpha() recognises every alphabetic character, not only ASCII
    # a-z, so tokens written entirely in accented or non-Latin letters are
    # kept.  Building a new list also avoids repeated list.remove() scans.
    return [token for token in tokens if any(ch.isalpha() for ch in token)]


def np_chunk(tree):
    """
    Return a list of all noun phrase chunks in the sentence tree.
    A noun phrase chunk is defined as any subtree of the sentence
    whose label is "NP" that does not itself contain any other
    noun phrases as subtrees.

    Chunks are returned in the order `tree.subtrees()` yields them.  Two
    chunks with identical content at different positions in the sentence
    are both returned.
    """
    def is_np(node):
        return node.label() == 'NP'

    chunks = []
    for candidate in tree.subtrees(filter=is_np):
        # subtrees() yields the candidate itself as well as its descendants,
        # so the candidate is a chunk when no *other* NP node is found.
        # Checking the nodes themselves (rather than searching their text
        # representation for "NP") keeps labels or words that merely
        # contain those letters from being miscounted.
        has_nested_np = any(
            node is not candidate for node in candidate.subtrees(filter=is_np)
        )
        if not has_nested_np:
            # No equality-based de-duplication: each node is visited once,
            # and equal-looking chunks elsewhere in the sentence are still
            # distinct noun phrase chunks.
            chunks.append(candidate)

    return chunks



def np_chunk_(tree):
    """
    Alternative noun phrase chunk finder based on subtree height.

    Collects every subtree of `tree` whose height is exactly 3 and whose
    label is "NP", in the order `tree.subtrees()` yields them.  A subtree
    that compares equal to one already collected is skipped, so each
    distinct chunk appears once in the returned list.
    """
    def is_shallow_np(node):
        # Height is tested first; the label is only read for height-3 nodes.
        return node.height() == 3 and node.label() == 'NP'

    chunks = []
    for candidate in tree.subtrees(filter=is_shallow_np):
        # Skip anything equal to a chunk that was already collected.
        if candidate in chunks:
            continue
        chunks.append(candidate)

    return chunks


# Start the parser only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()




Sentence:  cats run


      S     
  ____|___   
 NP       | 
 |        |  
 N        V 
 |        |  
cats     run

Noun Phrase Chunks
cats


Sentence:  Cats climb trees


Could not parse sentence.


Sentence:  Small cats run


           S      
        ___|____   
       NP       | 
   ____|___     |  
  |        NP   | 
  |        |    |  
  A        N    V 
  |        |    |  
small     cats run

Noun Phrase Chunks
cats


Sentence:  Small white cats climb


             S            
         ____|_________    
        NP             |  
   _____|____          |   
  |          NP        |  
  |      ____|___      |   
  |     |        NP    |  
  |     |        |     |   
  A     A        N     V  
  |     |        |     |   
small white     cats climb

Noun Phrase Chunks
cats
