In [2]:

# Import spacy and English models
import spacy

nlp = spacy.load('en')

In [3]:
# Process the sentences 
print("=> This is the original sentence:")
print("Hello, world. Natural Language Processing in 10 lines of code.\n")

doc = nlp(u"Hello, world. Natural Language Processing in 10 lines of code.")


# Get first token of the processed document
token = doc[0]
print("\n=> This is the first token:")
print(token)

# Print sentences (one sentence per line)
print("\n=> Printing sentences (one sentence per line)")
for sent in doc.sents:
    print(sent)


=> This is the original sentence:
Hello, world. Natural Language Processing in 10 lines of code.


=> This is the first token:
Hello

=> Printing sentences (one sentence per line)
Hello, world.
Natural Language Processing in 10 lines of code.


In [4]:
print("\n*********** Part of Speech tagging ***********\n")
# Part of Speech taggin*

# For each token, print corresponding part of speech tag
for token in doc:
    print('{} - {}'.format(token, token.pos_))


*********** Part of Speech tagging ***********

Hello - INTJ
, - PUNCT
world - NOUN
. - PUNCT
Natural - PROPN
Language - PROPN
Processing - PROPN
in - ADP
10 - NUM
lines - NOUN
of - ADP
code - NOUN
. - PUNCT


In [5]:
print("\n*********** Named Entities Example ***********\n")

doc_2 = nlp(u"I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    print('{} - {}'.format(ent, ent.label_))


*********** Named Entities Example ***********

Paris - GPE
Jack - PERSON


In [6]:
print("*** \n Print noun chunks for: I went to Paris where I met my old friend Jack from uni.\n****")
# Print noun chunks for doc_2
print([chunk for chunk in doc_2.noun_chunks])

*** 
 Print noun chunks for: I went to Paris where I met my old friend Jack from uni.
****
[I, Paris, I, my old friend, uni]


In [13]:
# Unigram Probabilities based on a large corpus from spacy

print("\n***\n For every token in doc_2, print log-probability of the word, \nestimated from counts from a large corpus\n****\n")
# The probability estimate is based on counts from a 3 billion word
# corpus, smoothed using the Simple Good-Turing method.
for token in doc_2:
    print(token, '=>', token.prob)


***
 For every token in doc_2, print log-probability of the word, 
estimated from counts from a large corpus
****

(I, '=>', -4.064180850982666)
(went, '=>', -8.474893569946289)
(to, '=>', -3.83851957321167)
(Paris, '=>', -11.6917724609375)
(where, '=>', -7.183883190155029)
(I, '=>', -4.064180850982666)
(met, '=>', -9.784490585327148)
(my, '=>', -5.918124675750732)
(old, '=>', -7.7954816818237305)
(friend, '=>', -8.825821876525879)
(Jack, '=>', -11.20296573638916)
(from, '=>', -6.028810501098633)
(uni, '=>', -19.579313278198242)
(., '=>', -3.0729479789733887)


In [12]:
print("\n **** Word embedding based Similarity  ****  \n")

print("Example 1: The King ordered the man to remove the woman and kill his queen")

doc3 = nlp(u"The King ordered the man to remove the woman and kill his queen.")
king = doc3[1]
man = doc3[4]
woman = doc3[8]
queen = doc3[-1]
print("Similarity between king and man:",king.similarity(man))

print("Similarity between king and woman:",king.similarity(woman))

print("Similarity between king and queen:",king.similarity(queen))


print("\n \nExample 2: The King ordered the man to remove the woman and kill his queen.")

# For a given document, calculate similarity between 'apples' and 'oranges' and 'boots' and 'hippos'
doc4 = nlp(u"Apples and oranges are similar. Boots and hippos aren't.")
apples = doc4[0]
oranges = doc4[2]
boots = doc4[6]
hippos = doc4[8]
print("Apples and oranges similarity score:", apples.similarity(oranges))
print("Boots and hippos similarity score:", boots.similarity(hippos))



 **** Word embedding based Similarity  ****  

Example 1: The King ordered the man to remove the woman and kill his queen
('Similarity between king and man:', 0.40884606921875327)
('Similarity between king and woman:', 0.26556595924212256)
('Similarity between king and queen:', 0.23910408661703836)

 
Example 2: The King ordered the man to remove the woman and kill his queen.
('Apples and oranges similarity score:', 0.0)
('Boots and hippos similarity score:', 0.0)


In [11]:
print("\n\n We can also do: Similarity between sentence and a word.\n")
# Print similarity between sentence and word 'fruit'
apples_sent, boots_sent = doc4.sents
fruit = doc4.vocab[u'fruit']
print("Similarity between the apple's sentence and the word ")
print(apples_sent.similarity(fruit))
print(boots_sent.similarity(fruit))



 We can also do: Similarity between sentence and a word.

Similarity between the apple's sentence and the word 
0.569403188405
0.323890854232
