In [None]:
# Install NLTK into the notebook environment (IPython shell escape).
!pip install nltk
import nltk
# Fetch the NLTK resources this notebook asks for: the 'punkt' tokenizer data,
# the stop-word lists and WordNet.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stop words, consulted by removeStopWords() further down.
stop_words = set(stopwords.words('english'))
# 'not' is taken out of the stop-word set so that it survives filtering:
# subjectMatch() and restMatch() look for it to detect a one-sided negation.
stop_words -= {'not'}
from nltk.corpus import wordnet as wn

# Install spaCy the same way.
!pip install spacy
import spacy
from spacy import displacy
# Shared spaCy pipeline used by textToComponents() and qToSentense().
# NOTE(review): nothing in this cell downloads "en_core_web_sm" -- confirm the
# model is already available in the target environment.
nlp = spacy.load("en_core_web_sm")

import io
import sys
import re

In [267]:
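# If the question and the text match exactly, the answer is true
# If either side has an extra "not", the answer is false
# If the word in the source text is a hypernym or synonym, the answer is true
# Otherwise the answer is false

# "or" questions: split them into two separate questions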

In [280]:
def textToComponents(text):
  """Split a declarative sentence into its context, subject and remainder.

  The sentence is parsed with the module-level spaCy pipeline ``nlp``.

  Args:
    text: The declarative sentence to analyse.

  Returns:
    A tuple ``(prior, subject, rest)`` of lists of token texts, where
    ``prior`` holds the non-punctuation, non-auxiliary tokens seen before the
    subject was found (minus any token that belongs to the subject),
    ``subject`` is the subtree of the first nsubj/nsubjpass token, and
    ``rest`` is everything after the first auxiliary -- or, when the sentence
    has no auxiliary, everything from the first token that is in neither
    ``prior`` nor ``subject`` -- up to but excluding the closing punctuation.
    Returns None when no subject can be found.
  """
  doc = nlp(text)

  # Exclusive end of the sentence body. Only a closing punctuation (or
  # whitespace) token is dropped, so a sentence written without a final
  # period no longer loses its last word.
  end = len(doc)
  if end and doc[-1].pos_ in {'PUNCT', 'SPACE'}:
    end -= 1

  subject_found, aux_found = False, False
  subject, rest = None, None
  prior = []

  for i, token in enumerate(doc):
    if not aux_found and (token.dep_ in {'aux', 'auxpass'} or token.pos_ == 'AUX'):
      aux_found = True
      rest = list(doc[i+1:end])
    elif not subject_found and token.dep_ in {'nsubj', 'nsubjpass'}:
      subject_found = True
      subject = list(token.subtree)
      # Tokens collected so far that turn out to belong to the subject
      # (e.g. its determiner) are not context.
      prior = [t for t in prior if t not in subject]
    elif not subject_found and token.pos_ != 'PUNCT':
      prior.append(token)

    # Return as soon as both pieces are known; checking here rather than at
    # the top of the next iteration also covers the case where the second
    # piece is the very last token.
    if subject_found and aux_found:
      return [t.text for t in prior], [t.text for t in subject], [t.text for t in rest]

  # No auxiliary verb: the rest starts at the first token that is neither
  # context nor subject.
  if subject_found:
    for i, token in enumerate(doc):
      if token not in prior and token not in subject:
        rest = doc[i:end]
        return [t.text for t in prior], [t.text for t in subject], [t.text for t in rest]

  # No subject (or nothing besides context and subject): callers must be
  # prepared for None.
  return None

def qToSentense(q, docT):
  """Split a yes/no question into context, subject and remainder.

  The question is parsed with the module-level spaCy pipeline ``nlp``. When
  the parse yields no usable subject, the subject of the already processed
  declarative sentence (``docT[1]``) is searched for literally in ``q``.

  Args:
    q: The question text.
    docT: The ``(prior, subject, rest)`` tuple produced by
      textToComponents() for the declarative sentence, or None.

  Returns:
    A tuple ``(prior, subject, rest)`` of lists of token texts, or
    ``(None, None, None)`` when no subject can be determined.
  """
  doc = nlp(q)

  # Exclusive end of the question body: drop only a closing punctuation (or
  # whitespace) token, so a question typed without "?" keeps its last word.
  end = len(doc)
  if end and doc[-1].pos_ in {'PUNCT', 'SPACE'}:
    end -= 1

  aux_found = False
  prior_sentence = []
  aux = None

  for i, token in enumerate(doc):
    if not aux_found and (token.dep_ in {'aux', 'auxpass'} or token.pos_ == 'AUX'):
      aux_found = True
      aux = token
    # The first subject token either returns or leaves the loop, so no
    # separate "subject found" flag is needed.
    if token.dep_ in {'nsubj', 'nsubjpass'}:
      subject = list(token.subtree)
      prior_sentence = [w for w in prior_sentence if w not in subject]

      # Index of the first token after the subject subtree.
      skipTo = i + len(subject) - subject.index(token)
      if skipTo >= end:
        break # parsing is wrong: nothing would be left after the subject
      rest_sentence = doc[skipTo:end]
      return [t.text for t in prior_sentence], [t.text for t in subject], [t.text for t in rest_sentence]
    # Context is whatever precedes the first auxiliary.
    if not aux_found and token.pos_ != 'PUNCT':
      prior_sentence.append(token)

  # The case where nothing usable is found: fall back to the subject of the
  # declarative sentence, if there is one.
  if docT is None or docT[1] is None:
    return None, None, None
  possibleSubj = " ".join(docT[1])
  # Escape the phrase so that characters such as "(", "+" or "?" coming from
  # the text are matched literally instead of being read as regex syntax.
  matching_subj = re.search(r'\b{phrase}\b'.format(phrase=re.escape(possibleSubj)), q, re.IGNORECASE)
  if matching_subj is None:
    return None, None, None
  fixedSubj = nltk.word_tokenize(matching_subj.group(0))

  for i, token in enumerate(doc):
    if token not in prior_sentence and token != aux and token.pos_ != 'PUNCT' and token.text not in fixedSubj:
      return [t.text for t in prior_sentence], fixedSubj, [t.text for t in doc[i:end]]
  return None, None, None

# Demo pair: a question and the declarative sentence it is checked against.
sent = "Was A third technique Hooper employed the off-centre framing of characters?"
text = "A third technique Hooper employed was the off-centre framing of characters."

# Alternative pair in which the question changes one word ("King" vs "Queen"):
# text = "The weaving mill scene was filmed at the Queen Street Mill in Burnley."
# sent = "Was the weaving mill scene filmed at the King Street Mill in Burnley?"

# Show the (prior, subject, rest) split of the sentence, then of the question.
print(textToComponents(text))
print()
print(qToSentense(sent, textToComponents(text)))



([], ['A', 'third', 'technique', 'Hooper', 'employed'], ['the', 'off', '-', 'centre', 'framing', 'of', 'characters'])

([], ['A', 'third', 'technique', 'Hooper', 'employed'], ['the', 'off', '-', 'centre', 'framing', 'of', 'characters'])


In [277]:
def isHypernym(largeW, smallW):
  """Tell whether largeW is a WordNet hypernym of smallW.

  Both words are echoed to stdout first. The result is True when some synset
  of largeW appears among the transitive hypernyms of some synset of smallW,
  and False otherwise (including when either word has no synsets).
  """
  print(largeW, smallW)
  general_synsets = wn.synsets(largeW)
  specific_synsets = wn.synsets(smallW)
  if not (general_synsets and specific_synsets):
    return False

  # Gather every ancestor reachable through hypernym links from any sense of
  # the more specific word.
  ancestors = set()
  for synset in specific_synsets:
    ancestors.update(synset.closure(lambda s:s.hypernyms()))

  return any(candidate in ancestors for candidate in general_synsets)

def isSynonym(w1, w2):
  """Tell whether the two words share at least one WordNet synset.

  Both words are echoed to stdout before the lookup.
  """
  print(w1, w2)
  synsets_of_first = set(wn.synsets(w1))
  synsets_of_second = set(wn.synsets(w2))
  # Sharing a synset is the same as the two sets not being disjoint.
  return not synsets_of_first.isdisjoint(synsets_of_second)

def removeStopWords(wordL):
  """Lower-case every word and drop those in the module-level stop_words set."""
  kept = []
  for word in wordL:
    lowered = word.lower()
    if lowered not in stop_words:
      kept.append(lowered)
  return kept

def subjectMatch(subjQ, subjT):
  """Check that the question's subject occurs within the text's subject.

  Args:
    subjQ: Token texts of the question's subject.
    subjT: Token texts of the text's subject.

  Returns:
    False when exactly one side contains "not"; otherwise True when the
    stop-word-filtered, lower-cased question subject appears as a whole
    phrase inside the filtered text subject.
  """
  subjQ, subjT = removeStopWords(subjQ), removeStopWords(subjT)
  # A negation present on only one side flips the meaning.
  if ("not" in subjQ) != ("not" in subjT):
    return False
  subjQ, subjT = " ".join(subjQ), " ".join(subjT)
  # Escape the phrase: it comes from free text, so characters such as "(",
  # "+" or "." must be matched literally rather than read as regex syntax
  # (unescaped they can raise re.error or match the wrong thing).
  return re.search(r'\b{phrase}\b'.format(phrase=re.escape(subjQ)), subjT) is not None

def contextMatch(contextQ, contextT):
  """Check that the question's context occurs within the text's context.

  Args:
    contextQ: Token texts of the question's context.
    contextT: Token texts of the text's context.

  Returns:
    True when the question has no context left after stop-word filtering, or
    when the filtered, lower-cased question context appears as a whole phrase
    inside the filtered text context.
  """
  contextQ, contextT = removeStopWords(contextQ), removeStopWords(contextT)
  contextQ, contextT = " ".join(contextQ), " ".join(contextT)
  # Escape the phrase: it comes from free text, so regex metacharacters in it
  # must be matched literally instead of raising re.error or mis-matching.
  return not contextQ or re.search(r'\b{phrase}\b'.format(phrase=re.escape(contextQ)), contextT) is not None

def restMatch(restQ, restT):
  """Check that every remaining question word is supported by the text.

  After stop-word filtering, a one-sided "not" makes the match fail. Each
  question word must then either appear in the text words, or be a WordNet
  hypernym or synonym of one of them; a progress line is printed before the
  WordNet lookups for a word.
  """
  restQ, restT = removeStopWords(restQ), removeStopWords(restT)
  if ("not" in restQ) != ("not" in restT):
    return False

  for question_word in restQ:
    if question_word in restT:
      continue
    print("\t\tTESTING HYPER...", question_word)
    # any() stops at the first related text word, like the original break.
    related = any(
      isHypernym(question_word, text_word) or isSynonym(question_word, text_word)
      for text_word in restT
    )
    if not related:
      return False
  return True

# Demo pair in which the context word ("Yesterday") opens the sentence but
# closes the question.
text = "Yesterday, George VI is captured hunched on the side of a couch at the edge of the frame."
sent = "Is George VI captured hunched on the side of a couch at the edge of the frame yesterday?"
docT = textToComponents(text)
docQ = qToSentense(sent, docT)
# Show both (prior, subject, rest) splits, each followed by a blank line.
print(textToComponents(text), end = "\n\n")
print(qToSentense(sent, docT), end = "\n\n")
# The text's context is appended to its rest so that a context word that moved
# into the question's rest can still be matched by restMatch().
print(subjectMatch(docQ[1], docT[1]) and contextMatch(docQ[0], docT[0]) and restMatch(docQ[2], docT[2] + docT[0]))


(['Yesterday'], ['George', 'VI'], ['captured', 'hunched', 'on', 'the', 'side', 'of', 'a', 'couch', 'at', 'the', 'edge', 'of', 'the', 'frame'])

([], ['George', 'VI'], ['captured', 'hunched', 'on', 'the', 'side', 'of', 'a', 'couch', 'at', 'the', 'edge', 'of', 'the', 'frame', 'yesterday'])

True


In [276]:
def findSimilar(q):
  """Placeholder lookup: ignores q and always returns the same fixed sentence."""
  # Should be handled somewhere in the answering code directly.
  placeholder_sentence = "Seidler and Hooper were convinced of his suitability for the role."
  return placeholder_sentence

def answerBinary(q):
  """Answer a yes/no question against the sentence returned by findSimilar().

  Args:
    q: The question text.

  Returns:
    True when the question's subject, context and remainder all match the
    sentence's; False otherwise, including when either the sentence or the
    question cannot be split into components.
  """
  sentence = findSimilar(q)
  docT = textToComponents(sentence)
  # textToComponents() yields None when it finds no subject; indexing that
  # below would raise TypeError, so treat it as "cannot confirm".
  if docT is None:
    return False
  docQ = qToSentense(q, docT)
  if docQ[1] and docQ[2]:
    # The sentence's context is appended to its rest so that context words
    # that moved into the question's rest can still be matched.
    return subjectMatch(docQ[1], docT[1]) and contextMatch(docQ[0], docT[0]) and restMatch(docQ[2], docT[2] + docT[0])
  return False

# Demo: a question whose wording mirrors the fixed sentence findSimilar() returns.
q = "Were Seidler and Hooper convinced of his suitability for the role?"
print(answerBinary(q))

True


In [265]:
# Given a binary Q with keyword "or", assume only one word is altered,
# check which question (after splitting) is true
def splitOr(q):
  """Split an "A or B" question into one question per alternative.

  Args:
    q: The question text; it must contain the token "or" with one word on
      each side.

  Returns:
    A tuple ``(q1, q2, keyword1, keyword2)`` where ``q1`` keeps the word
    before "or", ``q2`` keeps the word after it, and the keywords are those
    two words. Both questions are re-joined with single spaces and end in "?".

  Raises:
    Exception: If the question has no "or" token, or "or" has no word on one
      of its sides.
  """
  words = nltk.word_tokenize(q)
  if "or" not in words: # must be present to call this function
    raise Exception("Question should contain 'or' to make this function valid")

  # Drop the closing token only when it really is punctuation; dropping it
  # unconditionally loses the second alternative of a question typed
  # without "?".
  if not any(ch.isalnum() for ch in words[-1]):
    body = words[:-1]
  else:
    body = words

  index = body.index("or")
  # "or" at either edge has no alternative to pick; without this check the
  # keyword lookups below would wrap around or run past the end.
  if index == 0 or index + 1 >= len(body):
    raise Exception("Question should have a word on both sides of 'or'")

  q1 = body[:index] + body[index+2:]
  q2 = body[:index-1] + body[index+1:]
  return " ".join(q1)+"?", " ".join(q2)+"?", body[index-1], body[index+1]

def answerOr(q):
  """Answer an "A or B" question with the chosen alternative.

  The question is split with splitOr(); if the first variant is judged true
  by answerBinary() the first alternative is returned, otherwise the second.
  The answer has its first letter upper-cased and a period appended.
  """
  first_question, _, first_option, second_option = splitOr(q)
  chosen = first_option if answerBinary(first_question) else second_option
  # Only the first character is upper-cased; the rest is kept as written.
  return chosen[0].upper() + chosen[1:] + "."

# Demo: the cell's last expression, so the notebook displays the returned answer.
answerOr("Is Tom good at painting or boxing?")

Tom is good at painting.


'Boxing.'

**Current status**: given the matched sentence, return True or False based on it (questions that start with do/does/did are probably not supported).

This is to be combined with the wh-answer part to find the most relevant sentence.

**What else can be improved**: apposition