Types of parsing:

Shallow Parsing (or Chunking): It adds a bit more structure to a POS tagged sentence. The most common operation is grouping words into Noun Phrases (NP). You can also group stuff into VP (Verb Phrases) and PP (Prepositional Phrases).

Dependency Parsing: Probably the most popular type of parse. It implies finding the dependencies between the words and also their type.

# Shallow Parsing

In [7]:
from pattern.en import parsetree, Chunk
from nltk.tree import Tree

sentence = 'The brown fox is quick and he is jumping over the lazy dog'

tree = parsetree(sentence)
tree

[Sentence('The/DT/B-NP/O brown/JJ/I-NP/O fox/NN/I-NP/O is/VBZ/B-VP/O quick/JJ/B-ADJP/O and/CC/O/O he/PRP/B-NP/O is/VBZ/B-VP/O jumping/VBG/I-VP/O over/IN/B-PP/B-PNP the/DT/B-NP/I-PNP lazy/JJ/I-NP/I-PNP dog/NN/I-NP/I-PNP')]

The popular IOB
notation used in chunking, that represent Inside, Outside, and Beginning. The B- prefix
before a tag indicates it is the beginning of a chunk, and I- prefix indicates that it is inside
a chunk. The O tag indicates that the token does not belong to any chunk. The B- tag is
always used when there are subsequent tags following it of the same type without the
presence of O tags between them.

In [11]:
for sentence_tree in tree:
    print(sentence_tree.chunks)

[Chunk('The brown fox/NP'), Chunk('is/VP'), Chunk('quick/ADJP'), Chunk('he/NP'), Chunk('is jumping/VP'), Chunk('over/PP'), Chunk('the lazy dog/NP')]


In [12]:
for sentence_tree in tree:
    for chunk in sentence_tree.chunks:
        print (chunk.type, '->', [(word.string, word.type) for word in chunk.words])

NP -> [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN')]
VP -> [('is', 'VBZ')]
ADJP -> [('quick', 'JJ')]
NP -> [('he', 'PRP')]
VP -> [('is', 'VBZ'), ('jumping', 'VBG')]
PP -> [('over', 'IN')]
NP -> [('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


In [13]:
def create_sentence_tree(sentence, lemmatize=False):
    sentence_tree = parsetree(sentence, 
                              relations=True, 
                              lemmata=lemmatize)
    return sentence_tree[0]
    
def get_sentence_tree_constituents(sentence_tree):
    return sentence_tree.constituents()
    
def process_sentence_tree(sentence_tree):
    
    tree_constituents = get_sentence_tree_constituents(sentence_tree)
    processed_tree = [
                        (item.type,
                         [
                             (w.string, w.type)
                             for w in item.words
                         ]
                        )
                        if type(item) == Chunk
                        else ('-',
                              [
                                   (item.string, item.type)
                              ]
                             )
                             for item in tree_constituents
                    ]
    
    return processed_tree
    
def print_sentence_tree(sentence_tree):
    

    processed_tree = process_sentence_tree(sentence_tree)
    processed_tree = [
                        Tree( item[0],
                             [
                                 Tree(x[1], [x[0]])
                                 for x in item[1]
                             ]
                            )
                            for item in processed_tree
                     ]

    tree = Tree('S', processed_tree )
    print (tree)
    
def visualize_sentence_tree(sentence_tree):
    
    processed_tree = process_sentence_tree(sentence_tree)
    processed_tree = [
                        Tree( item[0],
                             [
                                 Tree(x[1], [x[0]])
                                 for x in item[1]
                             ]
                            )
                            for item in processed_tree
                     ]
    tree = Tree('S', processed_tree )
    tree.draw()

In [14]:
t = create_sentence_tree(sentence)
t

Sentence('The/DT/B-NP/O/NP-SBJ-1 brown/JJ/I-NP/O/NP-SBJ-1 fox/NN/I-NP/O/NP-SBJ-1 is/VBZ/B-VP/O/VP-1 quick/JJ/B-ADJP/O/O and/CC/O/O/O he/PRP/B-NP/O/NP-SBJ-2 is/VBZ/B-VP/O/VP-2 jumping/VBG/I-VP/O/VP-2 over/IN/B-PP/B-PNP/O the/DT/B-NP/I-PNP/O lazy/JJ/I-NP/I-PNP/O dog/NN/I-NP/I-PNP/O')

In [15]:
pt = process_sentence_tree(t)
pt

[('NP', [('The', 'DT'), ('brown', 'JJ'), ('fox', 'NN')]),
 ('VP', [('is', 'VBZ')]),
 ('ADJP', [('quick', 'JJ')]),
 ('-', [('and', 'CC')]),
 ('NP', [('he', 'PRP')]),
 ('VP', [('is', 'VBZ'), ('jumping', 'VBG')]),
 ('PP', [('over', 'IN')]),
 ('NP', [('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')])]

In [18]:
print_sentence_tree(t)

(S
  (NP (DT The) (JJ brown) (NN fox))
  (VP (VBZ is))
  (ADJP (JJ quick))
  (- (CC and))
  (NP (PRP he))
  (VP (VBZ is) (VBG jumping))
  (PP (IN over))
  (NP (DT the) (JJ lazy) (NN dog)))


In [19]:
visualize_sentence_tree(t)

The lowest level indicates
the values of the actual tokens; the next level indicates the POS tags for each token;
and the next higher level indicates the chunk phrasal tags.

# Dependency-based Parsing 

In [27]:
sentence = 'The brown fox is quick and he is jumping over the lazy dog'

from spacy.lang.en import English
parser = English()
parsed_sent = parser(str(sentence))

dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
for token in parsed_sent:
    print (dependency_pattern.format(word=token.orth_, 
                                  w_type=token.dep_,
                                  left=[t.orth_ 
                                            for t 
                                            in token.lefts],
                                  right=[t.orth_ 
                                             for t 
                                             in token.rights]))
                                             



[]<---The[]--->[]
--------
[]<---brown[]--->[]
--------
[]<---fox[]--->[]
--------
[]<---is[]--->[]
--------
[]<---quick[]--->[]
--------
[]<---and[]--->[]
--------
[]<---he[]--->[]
--------
[]<---is[]--->[]
--------
[]<---jumping[]--->[]
--------
[]<---over[]--->[]
--------
[]<---the[]--->[]
--------
[]<---lazy[]--->[]
--------
[]<---dog[]--->[]
--------


In [29]:
# set java path
import os
java_path = r'C:/Program Files/Java/jdk-10.0.2/bin/java.exe'
os.environ['JAVAHOME'] = java_path
                                             
from nltk.parse.stanford import StanfordDependencyParser
sdp = StanfordDependencyParser(path_to_jar='C:/Users/ankit/Finpy/stanford-parser-full-2015-04-20/stanford-parser.jar',
                               path_to_models_jar='C:/Users/ankit/Finpy/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')    
result = list(sdp.raw_parse(sentence))  

result[0]

Please use nltk.parse.corenlp.CoreNLPDependencyParser instead.
  


Exception: Cannot find the dot binary from Graphviz package

<DependencyGraph with 14 nodes>

In [30]:
[item for item in result[0].triples()]

dep_tree = [parse.tree() for parse in result][0]
dep_tree

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('quick', [Tree('fox', ['The', 'brown']), 'is', 'and', Tree('jumping', ['he', 'is', Tree('dog', ['over', 'the', 'lazy'])])])

In [31]:
dep_tree.draw()