In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
import regex
# spaCy's small English pipeline, loaded once for the whole notebook.
nlpSpacy = spacy.load('en_core_web_sm')
from stanfordcorenlp import StanfordCoreNLP as stnlp
# Stanford CoreNLP wrapper; `nlp.parse(text)` is what the cells below use to
# obtain bracketed constituency parses.
# NOTE(review): the absolute, user-specific path makes the notebook
# non-portable -- consider a config constant or an environment variable.
nlp = stnlp(r'/home/ayush/stanford-corenlp-full-2018-01-31')

In [108]:
########################   NER Using NLTK  ###############################
def preprocess(sentence):
    """Return the named-entity strings NLTK finds in `sentence`.

    The sentence is split on whitespace, POS-tagged and chunked with
    `nltk.ne_chunk(..., binary=True)`. Every subtree labelled 'NE' contributes
    one string: the first item of each of its children, joined by single
    spaces.

    Args:
        sentence: text to analyse (a single string).

    Returns:
        list of entity strings, in the order `subtrees()` yields them.
    """
    tagged_tokens = nltk.tag.pos_tag(sentence.split())
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)
    return [
        " ".join(leaf[0] for leaf in subtree)
        for subtree in chunked.subtrees()
        if subtree.label() == 'NE'
    ]


#######################    All the Proper Nouns   ########################
def getAllNNP(sen):
    """Collect proper-noun phrases (NNP leaves) from the parses of `sen`.

    Each sentence is parsed with `nlp.parse`. Only the lines of the bracketed
    tree that start with "(NP" (after stripping) are inspected. Within such a
    line, every "(NNP word)" leaf is extracted, and leaves separated by exactly
    one space, i.e. "(NNP A) (NNP B)", are merged into one phrase "A B".
    Finally, any phrase that is a substring of a different collected phrase is
    dropped (so "Mary" disappears when "Virgin Mary" was also found).

    Args:
        sen: iterable of sentence strings.

    Returns:
        set of proper-noun phrase strings.

    Notes:
        * Only the exact tag "NNP" is matched; "(NNPS ...)" leaves are ignored.
        * The final filter is a plain substring test, not a whole-word test.
        * A line whose NNP leaf has no closing bracket is abandoned from that
          leaf onwards instead of raising IndexError.
    """
    # Raw string: "\(" is a regex escape, not a Python string escape.
    nnp_open = r'\(NNP '

    entities = []
    for s in sen:
        for raw_line in nlp.parse(s).split("\n"):
            p = str(raw_line).strip()
            if not p.startswith("(NP"):
                continue
            prevEnd = -1
            for m in regex.finditer(nnp_open, p):
                idx = m.start()
                start = m.end()  # first character of the word itself
                # The word is at least one character long, so the closing
                # bracket is searched for from start + 1.
                end = p.find(')', start + 1)
                if end == -1:
                    break  # malformed/truncated line: nothing more to read
                if prevEnd + 2 == idx:
                    # ") (" directly follows the previous leaf: same phrase.
                    entities[-1] = entities[-1] + " " + p[start:end]
                else:
                    entities.append(p[start:end])
                prevEnd = end

    # Keep only phrases not contained in some other, different phrase.
    return set(
        ent for ent in entities
        if not any(ent != other and ent in other for other in entities)
    )

In [17]:
#########################   Bracket Mapping of a Parse Tree   ################################
def getMapping(sentence):
    """Pair up the round brackets of `sentence`.

    Args:
        sentence: any string; only '(' and ')' characters are considered.

    Returns:
        dict mapping the index of each '(' to the index of the ')' that
        closes it. Unclosed '(' are simply absent from the result.

    Raises:
        IndexError: if a ')' appears with no '(' left to close.
    """
    pairs = {}
    pending = []  # indices of '(' still waiting for their ')'
    for pos, ch in enumerate(sentence):
        if ch == '(':
            pending.append(pos)
        elif ch == ')':
            pairs[pending.pop()] = pos
    return pairs

In [18]:
##########################   SUBJECT ---  VERB --- OBJECT   ###################################
def SVO(sentence):
    """Parse `sentence` and return its nested NP/VP structure.

    The text returned by `nlp.parse` is converted with `str`, every line is
    stripped and the lines are concatenated without a separator, giving a
    single-line bracketed tree that is handed to `getTags`.

    Args:
        sentence: raw sentence text.

    Returns:
        Whatever `getTags` returns for the flattened parse.
    """
    tree_text = str(nlp.parse(sentence))
    flattened = ''.join(line.strip() for line in tree_text.split('\n'))
    return getTags(flattened)

def getTags(sentence):
    """Dispatch to `getNP` or `getVP`, whichever phrase opens first.

    Args:
        sentence: single-line bracketed parse string.

    Returns:
        The result of `getNP(sentence, 0, mapping)` when "(NP" occurs before
        "(VP" or no "(VP" exists; the result of `getVP(sentence, 0, mapping)`
        when "(VP" comes first or no "(NP" exists; None when the string
        contains neither marker.
    """
    mapping = getMapping(sentence)
    np_at = sentence.find('(NP')
    vp_at = sentence.find('(VP')

    if np_at == -1 and vp_at == -1:
        return None
    if vp_at == -1 or (np_at != -1 and np_at < vp_at):
        return getNP(sentence, 0, mapping)
    return getVP(sentence, 0, mapping)
    
def getNP(sentence, idxStart, mapping):
    """Extract the first noun phrase of `sentence` plus related verb phrases.

    Args:
        sentence: bracketed parse text (possibly a slice of a longer string)
            that contains "(NP".
        idxStart: offset that is added to positions in `sentence` before
            they are looked up in `mapping`.
        mapping: dict from the index of a '(' to the index of its ')'.

    Returns:
        A list. Its first item is the NP text (without its closing bracket),
        cut off where a nested verb phrase begins if there is one. It is
        followed by the `getVP` result for a VP nested inside the NP (if any)
        and then the `getVP` result for a VP found after the NP (if any).

    Raises:
        ValueError: if `sentence` has no "(NP".
        KeyError: if the NP's opening bracket is missing from `mapping`.
    """
    np_open = sentence.index('(NP')
    np_close = mapping[idxStart + np_open] - idxStart
    subject = sentence[np_open:np_close]
    pieces = []

    # NP(VP): a verb phrase nested inside the noun phrase.
    if '(VP' in subject:
        inner_verbs = getVP(subject, np_open + idxStart, mapping)
        # Descend through first elements until the leading string is reached.
        head = inner_verbs
        while type(head) is not str:
            head = head[0]
        cut = subject.find(head)
        if cut != -1:
            subject = subject[:cut]
        pieces = [subject, inner_verbs]

    # The subject is recorded exactly once, nested VP or not.
    if not pieces:
        pieces.append(subject)

    # NP-VP: a verb phrase that follows the noun phrase.
    tail = sentence[np_close + 1:]
    if '(VP' in tail:
        pieces.append(getVP(tail, np_close + 1 + idxStart, mapping))
    return pieces
        
def getVP(sentence, idxStart, mapping):
    """Extract the first verb phrase of `sentence` plus related noun phrases.

    Args:
        sentence: bracketed parse text (possibly a slice of a longer string)
            that contains "(VP".
        idxStart: offset that is added to positions in `sentence` before
            they are looked up in `mapping`.
        mapping: dict from the index of a '(' to the index of its ')'.

    Returns:
        A list. Its first item is the VP text (without its closing bracket),
        cut off where a nested noun phrase begins if there is one. It is
        followed by the `getNP` result for an NP nested inside the VP (if any)
        and then the `getNP` result for an NP found after the VP (if any).

    Raises:
        ValueError: if `sentence` has no "(VP".
        KeyError: if the VP's opening bracket is missing from `mapping`.
    """
    vp_open = sentence.index('(VP')
    vp_close = mapping[idxStart + vp_open] - idxStart
    verb = sentence[vp_open:vp_close]
    pieces = []

    # VP(NP): a noun phrase nested inside the verb phrase.
    if '(NP' in verb:
        inner_nouns = getNP(verb, vp_open + idxStart, mapping)
        # Descend through first elements until the leading string is reached.
        head = inner_nouns
        while type(head) is not str:
            head = head[0]
        cut = verb.find(head)
        if cut != -1:
            verb = verb[:cut]
        pieces = [verb, inner_nouns]

    # The verb is recorded exactly once, nested NP or not.
    if not pieces:
        pieces.append(verb)

    # VP-NP: a noun phrase that follows the verb phrase.
    tail = sentence[vp_close + 1:]
    if '(NP' in tail:
        pieces.append(getNP(tail, vp_close + 1 + idxStart, mapping))
    return pieces

In [43]:
# Sample passage used to exercise the extractors defined above.
sen = "Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."

# Naive sentence split on '.'. The passage ends with '.', so the last piece
# is an empty string and is discarded.
sen = sen.split('.')
del sen[-1]


def _strip_parentheticals(text):
    """Remove every "(...)" aside, plus one preceding space, from `text`.

    Innermost groups are removed first and the substitution is repeated until
    nothing changes, so nested asides vanish completely while the text between
    two separate asides is kept. (A single greedy `\(.*\)` would delete
    everything from the first "(" to the last ")".)
    """
    pattern = r' ?\([^()]*\)'
    while True:
        stripped = regex.sub(pattern, '', text)
        if stripped == text:
            return text
        text = stripped


stop_words = ['a', 'an', 'the']
replace = ["\'s", "\'"]
for i in range(len(sen)):
    sen[i] = _strip_parentheticals(sen[i])
    # Drop possessive "'s" first, then any remaining apostrophes.
    for r in replace:
        sen[i] = sen[i].replace(r, "")
    # Stop-word removal is case-sensitive: a capitalised "The" is kept.
    sen[i] = " ".join([s for s in sen[i].split(" ") if s not in stop_words])
    # Collapse runs of whitespace and trim both ends.
    sen[i] = " ".join(sen[i].split())

# NLTK-based named entities over all cleaned sentences, de-duplicated.
entities = []
for s in sen:
    entities.extend(preprocess(s))
entities = set(entities)
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(entities)

set(['Catholic', 'Basilica', 'Atop Main', 'Christ', 'Saint Bernadette Soubirous', 'Virgin', 'Sacred Heart', 'Main Building'])


In [109]:
# Run the CoreNLP-based proper-noun extraction on the cleaned sentence list.
getAllNNP(sen)

{'Christ',
 'France',
 'Grotto',
 'Heart',
 'Lourdes',
 'Main Building',
 'Marian',
 'Saint Bernadette Soubirous',
 'Virgin Mary'}

In [105]:
# Show the raw bracketed parse CoreNLP returns for a minimal sentence.
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(nlp.parse("Sister has dog"))

(ROOT
  (S
    (NP (NN Sister))
    (VP (VBZ has)
      (NP (NN dog)))))
