In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
model_path = '/home/ayush/spaCy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0'

nlp = spacy.load(model_path)

In [2]:
S = "Scientists know many things about the Sun. They know how old it is. The Sun is more than 4½ billion years old. They also know the Sun’s size. The Sun may seem small, but that is because it is so far away. It is about 93 million miles (150 million kilometers) away from the Earth. The Sun is so large that the diameter of the Sun is  109 times the Earth’s diameter. The Sun also weighs as much as 333,000 Earths. The Sun is the center of our Solar System. Besides the Sun, the Solar System is made up of the planets,  moons, asteroid belt, comets, meteors, and other objects."

In [3]:
words = nltk.pos_tag(nltk.word_tokenize(S))

In [4]:
words_stop = [w for w in words if w not in nltk.corpus.stopwords.words()]
print len(words_stop)

129


In [5]:
S2 = "Vijay Bhatkar is the chancellor of Nalanda University. He lives in Rohini Delhi."

In [10]:
"""
1. Tokenization
2. Pos Tagging
3. NER (Named Entity Recognition) : 
    a. Split the sentences.
    b. For the sentence use the ne_chunk using its tags
    c. return the pos_tags, named_entities
"""
def preprocess(S):
    sentences = S.split('.')
    pos_tags = []
    named_entity = []
    for sentence in sentences:
        pos_tags.append(nltk.pos_tag(nltk.word_tokenize(sentence)))
        parse_tree = nltk.ne_chunk(nltk.tag.pos_tag(sentence.split()), binary=True)
        for tree in parse_tree.subtrees():
            if tree.label()=='NE':
                entity = ""
                for t in tree:
                    entity+=t[0]+" "
                named_entity.append(entity[:-1])
        
    return (pos_tags, named_entity)

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return nltk.Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


def dependency_tree(sentence):
    doc = nlp(sentence)

    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
    [to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

In [7]:
(pos_tags, named_entities) = preprocess(S2)
print named_entities

['Vijay Bhatkar', 'Nalanda University', 'Rohini Delhi']


In [8]:
temp_s = S.split('. ')
print len(temp_s)
print temp_s
for i in range(len(temp_s)):
    temp_s[i] = unicode(temp_s[i], 'utf-8')
print temp_s

10
['Scientists know many things about the Sun', 'They know how old it is', 'The Sun is more than 4\xc2\xbd billion years old', 'They also know the Sun\xe2\x80\x99s size', 'The Sun may seem small, but that is because it is so far away', 'It is about 93 million miles (150 million kilometers) away from the Earth', 'The Sun is so large that the diameter of the Sun is  109 times the Earth\xe2\x80\x99s diameter', 'The Sun also weighs as much as 333,000 Earths', 'The Sun is the center of our Solar System', 'Besides the Sun, the Solar System is made up of the planets,  moons, asteroid belt, comets, meteors, and other objects.']
[u'Scientists know many things about the Sun', u'They know how old it is', u'The Sun is more than 4\xbd billion years old', u'They also know the Sun\u2019s size', u'The Sun may seem small, but that is because it is so far away', u'It is about 93 million miles (150 million kilometers) away from the Earth', u'The Sun is so large that the diameter of the Sun is  109 times

In [11]:
for s in temp_s:
    dependency_tree(s)

(u'Sun', 38, 41, u'ORG')
           know             
     _______|_____           
    |           things      
    |        _____|______    
    |       |          about
    |       |            |   
    |       |           Sun 
    |       |            |   
Scientists many         the 

     know        
  ____|____       
 |         is    
 |     ____|___   
 |    |       old
 |    |        |  
They  it      how

(u'Sun', 4, 7, u'ORG')
(u'more than 4\xbd billion', 11, 31, u'MONEY')
(u'years', 32, 37, u'DATE')
          is            
  ________|_____         
 |             old      
 |              |        
 |            years     
 |              |        
Sun          billion    
 |    __________|_____   
The more       than   4½

(u'Sun', 19, 22, u'ORG')
     know             
  ____|________        
 |    |       size    
 |    |     ___|____   
 |    |    |       Sun
 |    |    |        |  
They also the       ’s

(u'Sun', 4, 7, u'ORG')
                  seem                

In [45]:
print 5

5
