In [1]:
import nltk
import numpy as np
import pandas as pd
import spacy
import regex
nlpSpacy = spacy.load('en_core_web_sm')
from stanfordcorenlp import StanfordCoreNLP as stnlp
nlp = stnlp(r'/home/ayush/stanford-corenlp-full-2018-01-31')
from neuralcoref import Coref

In [2]:
########################   NER Using SPACY  ###############################
def preprocess(sentence):
    named_entity = []
    
    parse_tree = nltk.ne_chunk(nltk.tag.pos_tag(sentence.split()), binary=True)
    for tree in parse_tree.subtrees():
        if tree.label()=='NE':
            entity = ""
            for t in tree:
                entity+=t[0]+" "
            named_entity.append(entity[:-1])
        
    return named_entity


#######################    All the Proper Nouns   ########################
def getAllNNP(sen):
    entities = []
    for s in sen:
        parse = nlp.parse(s)
        parse = [str(p).strip() for p in parse.split("\n")]
        for p in parse:
            if p.startswith("(NP"):
                idxes = [m.start() for m in regex.finditer('\(NNP ', p)]
                if len(idxes)>0:
                    prevEnd = -1
                    for idx in idxes:
                        start = idx+len('\(NNP ')-1
                        end = start+1
                        while p[end]!=')':
                            end+=1
                        if prevEnd+2==idx:
                            entities[-1] = entities[-1]+" "+p[start:end]
                        else:
                            entities.append(p[start:end])
                        prevEnd = end
    
    new_entities = []
    for ent1 in entities:
        count = 0
        for ent2 in entities:
            if ent1!=ent2:
                count+=1 if ent1 in ent2 else 0
        if count==0:
            new_entities.append(ent1)
    return set(new_entities)

In [3]:
#########################   Bracket Mapping of a Parse Tree   ################################
def getMapping(sentence):
    open_bracket = []
    mapping = {}
    for i,j in enumerate(sentence):
        if(j=='('):
            open_bracket.append(i)
        elif (j==')'):
            mapping[open_bracket[-1]] = i
            del open_bracket[-1]
    return mapping

In [4]:
##########################   SUBJECT ---  VERB --- OBJECT   ###################################
def SVO(sentence):
    sentence = nlp.parse(sentence)
    sentence = str(sentence)
    ar = [a.strip() for a in sentence.split('\n')]
    sentence = ''.join(ar)
    return getTags(sentence)

def getTags(sentence):
    mapping = getMapping(sentence)
    if '(NP' in sentence:
        if '(VP' in sentence:
            if sentence.index('(NP')<sentence.index('(VP'):
                return getNP(sentence, 0, mapping)
            else:
                return getVP(sentence, 0, mapping)
        else:
            return getNP(sentence, 0, mapping)
    elif '(VP' in sentence:
        return getVP(sentence, 0, mapping)
    
def getNP(sentence, idxStart, mapping):
    res = []
    idxNP = sentence.index('(NP')
    idxNPend = mapping[idxStart + idxNP] - idxStart
    subj = sentence[idxNP:idxNPend]
    npAdded = False
    
    if '(VP' in sentence[idxNP:idxNPend]:
        #NP(VP)
        remain = sentence[idxNP:idxNPend]
        verblist = getVP(remain, idxNP+idxStart, mapping)
        if type(verblist) == type('str'):
            temp = verblist
        else:
            temp = verblist[0]
        while (type(temp)!=type('str')):
            temp = temp[0]
        if temp in subj:
            idxx = subj.index(temp)
            subj = subj[:idxx]
        res.append(subj)
        res.append(verblist)
        npAdded = True    
    
    remain = sentence[idxNPend+1:]
    if '(VP' in remain:
        #NP-VP
        verblist = getVP(remain, idxNPend+1+idxStart, mapping)
        if not npAdded:
            res.append(subj)
        res.append(verblist)
    else:
        #NP
        if not subj in res:
            res.append(subj) 
    return res
        
def getVP(sentence, idxStart, mapping):
    res = []
    idxVP = sentence.index('(VP')
    idxVPend = mapping[idxStart + idxVP] - idxStart
    verb = sentence[idxVP:idxVPend]
    
    verbAdded = False
    if '(NP' in sentence[idxVP:idxVPend]:
        remain = sentence[idxVP:idxVPend]
        nplist = getNP(remain, idxVP+idxStart, mapping)
        if type(nplist) == type('str'):
            temp = nplist
        else:
            temp = nplist[0]
        while (type(temp)!=type('str')):
            temp = temp[0]
        if temp in verb:
            idxx = verb.index(temp)
            verb = verb[:idxx]
        res.append(verb)
        res.append(nplist)
        verbAdded = True
    
    remain = sentence[idxVPend+1:]
    if '(NP' in remain:
        nplist = getNP(remain, idxVPend+1+idxStart, mapping)
        if not verbAdded:
            res.append(verb)
        res.append(nplist)
    else:
        if not verb in res:
            res.append(verb)
    return res

In [5]:
#####################################  COREFERENCE RESOLUTION  ##############################
def coref(sen):
    coref = Coref()
    cluster=coref.continuous_coref(utterances=unicode(sen, 'utf-8'))
#     print coref.get_resolved_utterances()
    return coref.get_resolved_utterances()[0].encode('ascii', 'ignore')

In [6]:
def kuchbhi(offset,s,mapping):
    if(s==''):return ''
    if('(' not in s and ')' not in s):
        words = s.split(' ')
        if(len(words)>1):return words[1]
        else: return ''
    firstOpenBracket = s.index('(')
    correspondingClosingBracket = mapping[firstOpenBracket+offset]-offset
    return kuchbhi(offset+firstOpenBracket+1,s[firstOpenBracket+1:correspondingClosingBracket],mapping)+' '+ \
           kuchbhi(correspondingClosingBracket+offset+1,s[correspondingClosingBracket+1:],mapping)
    
def converter(s):
    openCount,closingCount = 0,0
    for i in s:
        if(i=='('):openCount+=1
        elif(i==')'):closingCount+=1
    if(openCount>closingCount):
        s = s+')'*(openCount-closingCount)
    elif(closingCount>openCount):
        s = '('*(closingCount-openCount)+s
    mapping = getMapping(s)
    s = kuchbhi(0,s,mapping)
    return ' '.join(s.split())

In [7]:
####################   GRAPH GENERATION  ########################

class EntityNode:
    def __init__(self, text):
        self.entity = text

class NPNode:
    
    def __init__(self, text, entities):
        self.sentence = text
        self.before = None
        self.entity = None
        self.after = None
        self.adjacent = []
        
        haystack = text.split(".")[0].split(" ")
        needles = entities
        idxes = [(i.entity, haystack.index(i.entity)) for i in needles if i.entity in haystack]
        idxes = sorted(idxes, key=lambda x:x[1])
        if len(idxes)>0 and len(haystack)>0:
            self.before = " ".join(haystack[:idxes[0][1]])
            if len(self.before)==0:
                self.before = None
            
            self.entity = idxes[0][0]
            s = " ".join(haystack[idxes[0][1]+1:])

            afterNode = NPNode(s, entities)
            if not afterNode.isEmpty():
                self.after = afterNode
            elif len(s)>0:
                self.after = s
                
    def isEmpty(self):
        return self.before==None and self.entity==None and self.after==None
    
class VPNode:
    
    def __init__(self, text):
        self.verb = text
        self.adjacent = []

class Graph:
        
    def __init__(self, entity):
        self.NP = []
        self.VP = []
        self.Edges = []
        self.entities = []
        for ent in entity:
            self.entities.append(EntityNode(ent))
    
    def addEdgeNPVP(self, npsentence, verb):
        npNode = [np for np in self.NP if np.sentence==npsentence]
        if npNode is not None and len(npNode)>0:
            npNode = npNode[0]
        else:
            npNode = NPNode(npsentence, self.entities)
            self.NP.append(npNode)
        
#         print verbNode
        verbNode = VPNode(verb)
        self.VP.append(verbNode)
        npNode.adjacent.append(verbNode)
        self.Edges.append((npNode, verbNode))
        self.Edges.append((verbNode, npNode))
#         print verbNode
        return verbNode
    
    def addEdgeVPNP(self, npsentence, verbNode, verb):
        npNode = [np for np in self.NP if np.sentence==npsentence]
        if npNode is not None and len(npNode)>0:
            npNode = npNode[0]
        else:
            npNode = NPNode(npsentence, self.entities)
            self.NP.append(npNode)
        
        if verbNode is None:
            verbNode = VPNode(verb)
            self.VP.append(verbNode)
        verbNode.adjacent.append(npNode)
        self.Edges.append((npNode, verbNode))
        self.Edges.append((verbNode, npNode))
        
    def addEdgeVPVP(self, verb1, verbNode1, verb2):
        if verbNode1 is None:
            verbNode1 = VPNode(verb1)
            self.VP.append(verbNode1)
            
        verbNode2 = VPNode(verb2)
        self.VP.append(verbNode2)
        
        verbNode1.adjacent.append(verbNode2)
        self.Edges.append((verbNode1, verbNode2))
        self.Edges.append((verbNode2, verbNode1))
        return verbNode2
        
    
    def addEdgeNPNP(self, npsent1, npsent2):
        npNode1 = [np for np in self.NP if np.sentence==npsent1]
        if npNode1 is not None and len(npNode1)>0:
            npNode1 = npNode1[0]
        else:
            npNode1 = NPNode(npsent1, self.entities)
            self.NP.append(npNode1)
            
        npNode2 = [np for np in self.NP if np.sentence==npsent2]
        if npNode2 is not None and len(npNode2)>0:
            npNode2 = npNode2[0]
        else:
            npNode2 = NPNode(npsent2, self.entities)
            self.NP.append(npNode2)
            
        npNode1.adjacent.append(npNode2)
        self.Edges.append((npNode1, npNode2))
        self.Edges.append((npNode2, npNode1))
        
    def addNodes(self, relation, vpNode):
        if len(relation)<2:
            return

        for i in range(1, len(relation)):
            prev = relation[i-1]
            if type(prev)==list:
                prev = prev[0]
            curr = relation[i]
            print prev, "---->", curr[0]
            if '(NP' in prev:
                if '(NP' in curr[0]:
                    #NP-NP
                    ##INSTEAD OF prev and curr[0] send converter(prev) and converter(curr[0])
                    self.addEdgeNPNP(converter(prev), converter(curr[0]))
                else:
                    #NP-VP
                    vpNode = self.addEdgeNPVP(converter(prev), converter(curr[0]))
#                     print vpNode.verb
            else:
                if '(NP' in curr[0]:
                    #VP-NP
                    self.addEdgeVPNP(converter(curr[0]), vpNode, converter(prev))
                else:
                    #VP-VP
                    vpNode = self.addEdgeVPVP(converter(prev), vpNode, converter(curr[0]))

            self.addNodes(curr, vpNode)

In [8]:
def getGraph(sen):
    
    replace = ["\'s", "\'", "\""]
    sen = regex.sub(r' \(.*\)', '', sen)
    sen = regex.sub(r'\(.*\)', '', sen)

    for r in replace:
        sen = sen.replace(r, "")
    sen = coref(sen)
    
    sen = sen.split('.')
    del sen[-1]
    
    ner = list(getAllNNP(sen))
    for i in range(len(sen)):
        for n in ner:
            sen[i] = sen[i].replace(n.lower(), n)
            sen[i] = sen[i].strip()
    print sen


    for i in range(len(sen)):
        sen[i] = regex.sub(r' \(.*\)', '', sen[i])
        sen[i] = regex.sub(r'\(.*\)', '', sen[i])
        for r in replace:
            sen[i] = sen[i].replace(r, "")
        sen[i] = " ".join(sen[i].split())
    entities = []
    for s in sen:
        entities.extend(preprocess(s))
    entities = set(entities)
   
    svo = []
    for s in sen:
        svo.append(SVO(s))
        
        
    graph = Graph(ner)
    for rel in svo:
        graph.addNodes(rel, None)
        
    return graph, ner

In [83]:
sen = "Dashrath was the king of Ayodhya. He had three wives namely Kausalya, Kaikeyi and Sumitra. Bharat and Shatrugan were the sons of Kaikeyi. Ram was the son of Kausalya. Laxman was the son of Sumitra. Ram was married to Sita. She was the daughter of Janak. Ravan was the ruler of Lanka. He kidnapped Sita. Ram killed him."
graph, ner = getGraph(sen)

Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/static_word
Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/tuned_word
['Dashrath was the king of Ayodhya', 'Dashrath had three wives namely Kausalya, Kaikeyi and Sumitra', 'Bharat and Shatrugan were the sons of Kaikeyi', 'Ram was the son of Kausalya', 'Laxman was the son of Sumitra', 'Ram was married to Sita', 'Sita was the daughter of Janak', 'Ravan was the ruler of Lanka', 'Ravan kidnapped Sita', 'Ram killed Ravan']
(NP (NNP Dashrath) ----> (VP (VBD was)
(VP (VBD was) ----> (NP(NP (DT the) (NN king))(PP (IN of)(NP (NNP Ayodhya)))
(NP (NNP Dashrath) ----> (VP (VBD had)
(VP (VBD had) ----> (NP(NP (CD three) (NNS wives))(ADVP (RB namely))(SBAR(S(NP (NNP Kausalya) (, ,) (NNP Kaikeyi)(CC and)(NNP Sumitra))))
(NP (NNP Bharat)(CC and)(NNP Shatrugan) ----> (VP (VBD were)
(VP (VBD were) ----> (NP(NP (DT the) (NNS sons))(PP (IN of)(NP (NNP Kaikeyi)))
(NP (NNP Ram) ----> (VP (V

In [84]:
for np in graph.NP:
    print "NP Sentence : " , np.sentence
    
    print "NP adjacent : ", 
    for vp in np.adjacent:
        try:
            print vp.verb , ", " ,
        except:
            print vp.sentence, ", " ,
    print
    print "-------------------------------------------------------------------------"

NP Sentence :  Dashrath
NP adjacent :  was ,  had , 
-------------------------------------------------------------------------
NP Sentence :  the king of Ayodhya
NP adjacent : 
-------------------------------------------------------------------------
NP Sentence :  three wives namely Kausalya , Kaikeyi and Sumitra
NP adjacent : 
-------------------------------------------------------------------------
NP Sentence :  Bharat and Shatrugan
NP adjacent :  were , 
-------------------------------------------------------------------------
NP Sentence :  the sons of Kaikeyi
NP adjacent : 
-------------------------------------------------------------------------
NP Sentence :  Ram
NP adjacent :  was ,  was married to ,  killed , 
-------------------------------------------------------------------------
NP Sentence :  the son of Kausalya
NP adjacent : 
-------------------------------------------------------------------------
NP Sentence :  Laxman
NP adjacent :  was , 
---------------------------

In [85]:
for vp in graph.VP:
    print "VP Sentence : ", vp.verb
    print "VP Adjacent : ", 
    for np in vp.adjacent:
        try:
            print np.sentence, 
        except:
            print np.verb, 
    print 
    print "-"*100

VP Sentence :  was
VP Adjacent :  the king of Ayodhya
----------------------------------------------------------------------------------------------------
VP Sentence :  had
VP Adjacent :  three wives namely Kausalya , Kaikeyi and Sumitra
----------------------------------------------------------------------------------------------------
VP Sentence :  were
VP Adjacent :  the sons of Kaikeyi
----------------------------------------------------------------------------------------------------
VP Sentence :  was
VP Adjacent :  the son of Kausalya
----------------------------------------------------------------------------------------------------
VP Sentence :  was
VP Adjacent :  the son of Sumitra
----------------------------------------------------------------------------------------------------
VP Sentence :  was married to
VP Adjacent :  Sita
----------------------------------------------------------------------------------------------------
VP Sentence :  was
VP Adjacent :  the daught

In [86]:
ques = "Ravan was killed by whom."
print nlp.parse(ques)
graphQues = getGraph(ques)

(ROOT
  (S
    (NP (NNP Ravan))
    (VP (VBD was)
      (VP (VBN killed)
        (PP (IN by)
          (NP (WP whom)))))
    (. .)))
Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/static_word
Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/tuned_word
['Ravan was killed by whom']
(NP (NNP Ravan) ----> (VP (VBD was)(VP (VBN killed)(PP (IN by)
(VP (VBD was)(VP (VBN killed)(PP (IN by) ----> (NP (WP whom)


In [87]:
print SVO(ques)

['(NP (NNP Ravan)', ['(VP (VBD was)(VP (VBN killed)(PP (IN by)', ['(NP (WP whom)']]]


In [88]:
ent = 'Ram'
for np in graph.NP:
    print np.sentence

Dashrath
the king of Ayodhya
three wives namely Kausalya , Kaikeyi and Sumitra
Bharat and Shatrugan
the sons of Kaikeyi
Ram
the son of Kausalya
Laxman
the son of Sumitra
Sita
the daughter of Janak
Ravan
the ruler of Lanka


In [89]:
entToNode = {}
for np in graph.NP:
    if np.entity is not None:
        entToNode[np.entity] = np

In [90]:
def stopWordRemoval(ques):
    stop_words = nltk.corpus.stopwords.words('english')
    whWords = ['Who', 'Whom', 'What', 'Where', 'Which', 'How', 'Why', 'When']
    for wh in whWords:
        try:
            idx = stop_words.index(wh)
            del stop_words[idx]
        except:
            pass
        
        try:
            idx = stop_words.index(wh.lower())
            del stop_words[idx]
        except:
            pass
    print ques
    return [q for q in ques.split() if q not in stop_words]    

In [91]:
ques = 'Who was the king of Ayodhya.'
print nlp.parse(ques)

(ROOT
  (SBARQ
    (WHNP (WP Who))
    (SQ (VBD was)
      (NP
        (NP (DT the) (NN king))
        (PP (IN of)
          (NP (NNP Ayodhya)))))
    (. .)))


In [92]:
def findAns(ques):
    graphQues, nerQues = getGraph(ques)
    quesEntity = nerQues[0]
    entNode = entToNode[quesEntity]
    ques = stopWordRemoval(ques[:-1])
    origQues = ' '.join(ques)
    sentence = nlp.parse(origQues)
    sentence = str(sentence)
    ar = [a.strip() for a in sentence.split('\n')]
    sentence = ''.join(ar)
    wpIdx = sentence.find('(WP')
    mapping = getMapping(sentence)
    wp = converter(sentence[wpIdx:mapping[wpIdx]])
    j = ques.index(wp)
    entityNode = entToNode[quesEntity]
    
    
    i = ques.index(quesEntity)
    deli = 1
    if(j<i):deli = -1
    while (i-j)!=1 and (i-j)!=-1: 
        print entityNode.sentence
        input()
        if(ques[i+deli] in entityNode.sentence):
            i+=deli
        else:
            for edge in graph.Edges:
                print entityNode.sentence
                if entityNode==edge[1]:
                    node = edge[0]
                    if(node in graph.NP):
                        if(ques[i+deli] in node.sentence):
                            i+=deli
                            entityNode = node
                            break
                    else:
                        if(ques[i+deli] in node.verb):
                            i+=deli
                            entityNode = node
                            break
    if(entityNode in graph.NP):
        for edge in graph.Edges:
            if(entityNode==edge[1] and (edge[0] not in graph.NP)):
                entityNode = edge[0]
                break
    
    for edge in graph.Edges:
        if(entityNode==edge[1] and (edge[0] in graph.NP)):
            entityNode = edge[0]
            break
    return entityNode.entity

In [93]:
print findAns(ques)

Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/static_word
Loading embeddings from /usr/local/lib/python2.7/dist-packages/neuralcoref/weights/tuned_word
['Who was the king of Ayodhya']
Who was the king of Ayodhya
the king of Ayodhya
1
Dashrath
