This is just setting up the imports and the basic sentence/paragraph that is going to be used
throughout the demonstration.

In [5]:
# Set up spaCy
import nltk
from nltk import Tree
from nltk.corpus import verbnet as vn
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from spacy.en import English
from __future__ import unicode_literals
from spacy.symbols import nsubj,VERB
parser = English()


# Test Data
sentence = "The quick brown fox jumps over the lazy dog."
                 
 
# Remember that this has to be in unicode in order to be used correctly


# all you have to do to parse text is this:
#note: the first time you run spaCy in a file it takes a little while to load up its modules
document = parser(sentence)

# Let's look at the tokens
# All you have to do is iterate through the parsedData
# Each token is an object with lots of different properties
# A property with an underscore at the end returns the string representation
# while a property without the underscore returns an index (int) into spaCy's vocabulary
# The probability estimate is based on counts from a 3 billion word
# corpus, smoothed using the Simple Good-Turing method.
for i, token in enumerate(document):
    print("original:", token.orth, token.orth_)
    print("lowercased:", token.lower, token.lower_)
    print("lemma:", token.lemma, token.lemma_)
    print("shape:", token.shape, token.shape_)
    print("prefix:", token.prefix, token.prefix_)
    print("suffix:", token.suffix, token.suffix_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
    if i > 1:
        break
        

(u'original:', 566, u'The')
(u'lowercased:', 501, u'the')
(u'lemma:', 501, u'the')
(u'shape:', 567, u'Xxx')
(u'prefix:', 568, u'T')
(u'suffix:', 566, u'The')
(u'log probability:', -5.774222373962402)
(u'Brown cluster id:', 30)
----------------------------------------
(u'original:', 2703, u'quick')
(u'lowercased:', 2703, u'quick')
(u'lemma:', 2703, u'quick')
(u'shape:', 515, u'xxxx')
(u'prefix:', 994, u'q')
(u'suffix:', 1806, u'ick')
(u'log probability:', -9.907200813293457)
(u'Brown cluster id:', 295)
----------------------------------------
(u'original:', 4883, u'brown')
(u'lowercased:', 4883, u'brown')
(u'lemma:', 4883, u'brown')
(u'shape:', 515, u'xxxx')
(u'prefix:', 537, u'b')
(u'suffix:', 766, u'own')
(u'log probability:', -10.879706382751465)
(u'Brown cluster id:', 215)
----------------------------------------


In [2]:
# shown as: original token, dependency tag, head word, left dependents, right dependents
for token in document:
    print(token.orth_, token.dep_, token.head.orth_, [t.orth_ for t in token.lefts], [t.orth_ for t in token.rights])

(u'The', u'det', u'jumps', [], [])
(u'quick', u'amod', u'jumps', [], [])
(u'brown', u'amod', u'jumps', [], [])
(u'fox', u'nsubj', u'jumps', [], [])
(u'jumps', u'ROOT', u'jumps', [u'The', u'quick', u'brown', u'fox'], [u'over', u'.'])
(u'over', u'prep', u'jumps', [], [u'dog'])
(u'the', u'det', u'dog', [], [])
(u'lazy', u'amod', u'dog', [], [])
(u'dog', u'pobj', u'over', [u'the', u'lazy'], [])
(u'.', u'punct', u'jumps', [], [])


In [3]:

for token in document:
    print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)")

print("-------------- entities only ---------------")
ents = list(document.ents)
for entity in ents:
    print(entity.label, entity.label_, ' '.join(t.orth_ for t in entity))

(u'The', u'(not an entity)')
(u'quick', u'(not an entity)')
(u'brown', u'(not an entity)')
(u'fox', u'(not an entity)')
(u'jumps', u'(not an entity)')
(u'over', u'(not an entity)')
(u'the', u'(not an entity)')
(u'lazy', u'(not an entity)')
(u'dog', u'(not an entity)')
(u'.', u'(not an entity)')
-------------- entities only ---------------


This is for Determining the Verb Context portion of the Jupyter notebook
The easiest way to determine what hte context of a verb is, is by first finding 
what the subject of the sentence is. A verb alone doesn't tell you the context
but its relationship to the subject often gives a better idea. So first we must
define what the subject of the sentence is, then we can look for patterns and 
the relationship between the verb and the subject as well as the other words in
the sentence. This is a tree that shows a simple heirarchy between the words:

In [6]:
def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


[to_nltk_tree(sent.root).pretty_print() for sent in document.sents]

                jumps                  
  ________________|____________         
 |    |     |     |    |      over     
 |    |     |     |    |       |        
 |    |     |     |    |      dog      
 |    |     |     |    |    ___|____    
The quick brown  fox   .  the      lazy



[None]

From the tree we can somewhat see the relationship between all the words and how they
are interconnected. This isn't perfect though because we still don't necessarily know
the parts of speech associated with each of the words that have been mapped out. So 
now we are going to look at the relationship by using a few libraries that have built
in methods to parse the words, then develop their dependecies. First we will look at
the parts of speech.

In [8]:
print("WORD: POS\n--------")
for w in document:
    print"%s: %s" % (w,w.pos_)

WORD: POS
--------
The: DET
quick: ADJ
brown: ADJ
fox: NOUN
jumps: VERB
over: ADP
the: DET
lazy: ADJ
dog: NOUN
.: PUNCT


Now we can see the representation of the parts of speech that are within the sentence. This can give us a clue as to how to build common patterns within the sentence, so that we can better understand what common sentence structures are and the context given 'X' pattern. Now we can use spaCy to look into the relationship between the verbs and the words around it.

In [9]:
print("WORD: DEP\n--------")
sentence = next(document.sents) 
for word in sentence:
    print "%s: %s" % (word,word.dep_)

WORD: DEP
--------
The: det
quick: amod
brown: amod
fox: nsubj
jumps: ROOT
over: prep
the: det
lazy: amod
dog: pobj
.: punct


Since we have both the parts of speech as well as the dependencies, we can now look specifically at the interactions between the verbs and the words around them. We can pull out the words that are designated as verbs, and make an n-gram out of them. From this we can look for synonyms that would fit well in that position of the sentence.

In [12]:
stop_words = set(stopwords.words('english'))
filtered_sent = [w for w in document if not w in stop_words]
filtered_sent = []

# pulling out the stop words from the
# tokenized sentence. Adding them to
# list of filtered words.
for w in document:
    if w not in stop_words:
        filtered_sent.append(w)
        
# Replace 'take' with verb in sentence.        
word = vn.classids('take')
print(word)

# Splitting the classes from the word
name, theme = zip(*(s.split("-") for s in word))
dictionary = dict(zip(name, theme))

print
print(dictionary)

[u'bring-11.3', u'characterize-29.2', u'convert-26.6.2', u'cost-54.2', u'fit-54.3', u'performance-26.7-2', u'steal-10.5']

{u'convert': u'26.6.2', u'fit': u'54.3', u'characterize': u'29.2', u'bring': u'11.3', u'cost': u'54.2', u'performance': u'26.7', u'steal': u'10.5'}


We now have the synonyms and their respective identifier with eachother. This makes it easier to reference and loop though to find the themes within the verbs.