In [4]:
import nltk
import numpy as np
import pandas as pd
import spacy
import regex
model_path = '/home/ayush/spaCy/models/en_core_web_sm-2.0.0/en_core_web_sm/en_core_web_sm-2.0.0'

nlp = spacy.load(model_path)

In [5]:
S = "The Scientists know many things about the Sun. They know how old it is. The Sun is more than 4½ billion years old. They also know the Sun’s size. The Sun may seem small, but that is because it is so far away. It is about 93 million miles (150 million kilometers) away from the Earth. The Sun is so large that the diameter of the Sun is  109 times the Earth’s diameter. The Sun also weighs as much as 333,000 Earths. The Sun is the center of our Solar System. Besides the Sun, the Solar System is made up of the planets,  moons, asteroid belt, comets, meteors, and other objects."

In [6]:
words = nltk.pos_tag(nltk.word_tokenize(S))

In [7]:
words_stop = [w for w in words if w not in nltk.corpus.stopwords.words()]
print len(words_stop)

130


In [8]:
S2 = "Vijay Bhatkar is the chancellor of Nalanda University. He lives in Rohini Delhi."

In [9]:
"""
1. Tokenization
2. Pos Tagging
3. NER (Named Entity Recognition) : 
    a. Split the sentences.
    b. For the sentence use the ne_chunk using its tags
    c. return the pos_tags, named_entities
"""
def preprocess(S):
    sentences = S.split('.')
    pos_tags = []
    named_entity = []
    for sentence in sentences:
        pos_tags.append(nltk.pos_tag(nltk.word_tokenize(sentence)))
        parse_tree = nltk.ne_chunk(nltk.tag.pos_tag(sentence.split()), binary=True)
        for tree in parse_tree.subtrees():
            if tree.label()=='NE':
                entity = ""
                for t in tree:
                    entity+=t[0]+" "
                named_entity.append(entity[:-1])
        
    return (pos_tags, named_entity)

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return nltk.Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


def dependency_tree(sentence):
    doc = nlp(sentence)

    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
    [to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

In [10]:
# for s in temp_s:
#     dependency_tree(s)

In [11]:
from stanfordcorenlp import StanfordCoreNLP
from stanfordcorenlp import StanfordCoreNLP as stnlp
nlp = stnlp(r'/home/ayush/stanford-corenlp-full-2018-01-31')

In [170]:
def get3tags(sentence):
    #Sentence is of form NP - VP - NP, NP - VP(NP)
    #PARSE TREE GENERATION
    sentence = nlp.parse(sentence)
    sentence = str(sentence)

    ar = [a.strip() for a in sentence.split('\n')]
    sentence = ''.join(ar)
    mapping = getMapping(sentence)
       
    (subject, idxSubjEnd) = getSubject(sentence, mapping)
    remainString = sentence[idxSubjEnd+1:]
    idxVerb = idxSubjEnd+remainString.index('VP')
    idxVerbEnd = mapping[idxVerb]+1

    verbString = sentence[idxVerb:idxVerbEnd]
    remainString = sentence[idxVerbEnd+1:]

    #VERB contains VP or VP(NP)
    verb =  sentence[idxVerb:idxVerbEnd]

    try:
        #VP - NP
        idxObj = idxVerbEnd+remainString.index('NP')
        idxObjEnd = mapping[idxObj]
        obj = sentence[idxObj:idxObjEnd]
    except:
        try:
            #VP(NP)
            idxObj = idxVerb+verbString.index('NP')-1
            idxObjEnd = mapping[idxObj]
            verb = sentence[idxVerb:idxObj]
            obj = sentence[idxObj:idxObjEnd]
        except:
            #No Object
            obj = ""    
        
    return (subject, verb, obj)

def getSubject(sentence, mapping):
    subject = ""
    idxSubjEnd = 0
    try:
        idxSubj = sentence.index('NP')-1
        idxSubjEnd = mapping[idxSubj]+1
        subject = sentence[idxSubj:idxSubjEnd]

        remainString = sentence[mapping[idxSubj]+1:]

        #SUBJECT
        try:
            idxVerb = idxSubjEnd+remainString.index('VP')-1
            subject = subject+sentence[idxSubjEnd+1:idxVerb]
            idxSubjEnd = idxVerb
        except:
            pass
    except:
        pass
    return (subject, idxSubjEnd)

def getMapping(sentence):
    open_bracket = []
    mapping = {}
    for i,j in enumerate(sentence):
        if(j=='('):
            open_bracket.append(i)
        elif (j==')'):
            mapping[open_bracket[-1]] = i
            del open_bracket[-1]
    return mapping

In [172]:
sen = "Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."
sen = sen.split('.')
del sen[-1]
sen

['Architecturally, the school has a Catholic character',
 " Atop the Main Building's gold dome is a golden statue of the Virgin Mary",
 ' Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes"',
 ' Next to the Main Building is the Basilica of the Sacred Heart',
 ' Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection',
 ' It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858',
 ' At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary']

In [174]:
for st in sen:
    print st
    # Removes brackets and its contents
    st =  regex.subf(r"\((?:[^()]++|(?R))*+\)", "", string)
    (sub, verb, obj) = get3tags(st)
    print "SUBJECT : ", sub
    print "VERB : ", verb
    print "OBJECT : ", obj
    print "--------------------------------------------------------------------------------------------------------------"

Architecturally, the school has a Catholic character
SUBJECT :  (NP(NP (DT the) (NN end))(PP (IN of)(NP (DT the) (JJ main) (NN drive))))(, ,)
VERB :  (VP (VBZ is))
OBJECT :  (NP(NP (DT a) (JJ simple) (, ,) (JJ modern) (NN stone) (NN statue))(PP (IN of)(NP (NNP Mary)))
--------------------------------------------------------------------------------------------------------------
 Atop the Main Building's gold dome is a golden statue of the Virgin Mary
SUBJECT :  (NP(NP (DT the) (NN end))(PP (IN of)(NP (DT the) (JJ main) (NN drive))))(, ,)
VERB :  (VP (VBZ is))
OBJECT :  (NP(NP (DT a) (JJ simple) (, ,) (JJ modern) (NN stone) (NN statue))(PP (IN of)(NP (NNP Mary)))
--------------------------------------------------------------------------------------------------------------
 Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes"
SUBJECT :  (NP(NP (DT the) (NN end))(PP (IN of)(NP (DT the) (JJ main) (NN dr