In [1]:
# all necessary packages
# bs4 provides the HTML parser and nltk the sentence tokenizer used further down
!pip install bs4
!pip install nltk
# spaCy is pinned to 2.1.0 and neuralcoref is reinstalled from source (--no-binary)
# NOTE(review): presumably so neuralcoref is compiled against the installed spaCy — confirm
!pip install -U spacy==2.1.0
!python -m spacy download en
!pip uninstall -y neuralcoref 
!pip install neuralcoref --no-binary neuralcoref

# sentence embeddings and the question-answering model
# NOTE(review): these installs are unpinned, so a fresh run may resolve different versions
!pip install sentence_transformers
!pip install transformers

In [2]:
import nltk
import spacy
import neuralcoref

# fetch the punkt data through nltk's downloader
nltk.download('punkt')

# English spaCy pipeline with the neuralcoref component attached; add_to_pipe
# stays the last expression of the cell so its return value is still displayed
nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(nlp)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<spacy.lang.en.English at 0x7fe659110090>

In [5]:
from bs4 import BeautifulSoup as Soup

file_name = "a1.htm"

# Read the page as raw bytes: BeautifulSoup then works out the document's
# encoding itself, instead of relying on the platform-default encoding that a
# text-mode open() would use (which can garble non-ASCII characters).
with open(file_name, 'rb') as fp:
    soup = Soup(fp, "html.parser")

# given the header to be found and the size (h2 or h3) of the header,
# return the passage below the section, with coreference resolved
def get_text(title, size):
  """Return the coreference-resolved text of the section under a header.

  Args:
    title: exact text of the header to look up in the module-level `soup`.
    size: tag name of the header ('h2' or 'h3').

  Returns:
    The text of the 'p' and 'blockquote' siblings that follow the header, up
    to the next 'h2'/'h3' sibling (or the end of the sibling chain), joined
    with spaces, stripped of newlines and passed through the module-level
    `nlp` pipeline for coreference resolution. An empty string is returned
    when no matching header exists.
  """
  header = soup.find(size, text=title)
  if header is None:
    # nothing to collect: the header is absent from the document
    return ""

  paragraphs = []
  node = header.next_sibling
  # next_sibling is None after the last sibling, so the walk must also stop
  # there; otherwise the final section of a parent raises AttributeError
  while node is not None and node.name not in {'h2', 'h3'}:
    if node.name in ('p', 'blockquote') and node.text != "\n":
      paragraphs.append(node.text)
    node = node.next_sibling

  passage = " ".join(paragraphs).replace('\n', '')
  return nlp(passage)._.coref_resolved

# get all subtitles, get rid of trivial ones, with the text under it
titles2 = soup.find_all('h2')
titles3 = soup.find_all('h3')
# some headers that are not informative
non_set = {'see also', 'notes', 'references', 'external links', 'citations'}

filtered_titles = {} # a dict mapping from headers to text in the section

# get all the texts under h2 or h3 and save to the dictionary; both header
# levels go through the same loop (h2 first, then h3) instead of two copies
for size, titles in (('h2', titles2), ('h3', titles3)):
  for title in titles:
    title_standard = title.text.lower()
    if title_standard in non_set:
      continue
    # the key is the lemmatized header, so it can later be matched against
    # the lemmatized question
    lemmatized = " ".join(token.lemma_ for token in nlp(title_standard))
    filtered_titles[lemmatized] = get_text(title.text, size)

# print(filtered_titles)


In [20]:
# the question to answer; later cells read this module-level variable
question = "Who composed the music for the film?"

In [21]:
import re

# find the sections most likely corresponding to each question, if one exists
def get_passages(question):
  """Return the section texts whose header occurs in the question.

  The question is reduced to the lemmas of its non-stop-word,
  non-punctuation tokens (printed for inspection), and every key of the
  module-level `filtered_titles` is searched for as a whole-word match in
  that lemma string.

  Args:
    question: the question as a plain string.

  Returns:
    A list with the text of every matching section, in the iteration order
    of `filtered_titles`; empty when no header matches.
  """
  q_words = " ".join(
      w.lemma_ for w in nlp(question) if not w.is_stop and not w.is_punct
  )
  print(q_words)

  passages = []
  for header, text in filtered_titles.items():
    if not header:
      # an empty header would match at every word boundary of the question
      continue
    # headers are literal text, not patterns: escape them so characters such
    # as '(' or '+' cannot raise re.error or change the meaning of the match
    if re.search(r'\b{w}\b'.format(w=re.escape(header)), q_words):
      passages.append(text)

  return passages

# try it on the sample question; as the cell's last expression the matched passages are displayed
get_passages(question)

compose music film


['The film\'s music was largely composed by Ludovic Bource, but includes works by other composers such as Alberto Ginastera\'s "Estancia". The soundtrack was recorded in Belgium by the Brussels Philharmonic and was conducted by Ernst Van Tiel; the Brussels Jazz Orchestra also cooperated. The soundtrack took place during six days in April 2011 at Flagey\'s Studio 4 in Brussels. The film\'s climactic scene is set to Bernard Herrmann\'s "Scène d\'amour" from Bernard Herrmann\'s "Scène score to Alfred Hitchcock\'s film Vertigo. In Vertigo, that composition similarly accompanies an extended scene without dialogue. Only one song (sung, with lyrics) is used in The soundtrack, "Pennies from Heaven", sung by Rose "Chi-Chi" Murphy (uncredited). Only one song (sung, with lyrics) was written in 1936 although The film is set between 1927 and 1932. The soundtrack was released on 21 October 2011 through Sony Classical Records.']

In [22]:
# setup for finding most similar sentence

from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# sentence encoder, loaded once here and bound to the name `model`
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [23]:
# sections whose header matches a keyword of the question
passages = get_passages(question) # in the case that keyword exists in header
# NOTE: only the header-matched passages are searched here; the fallback of
# searching the rest of the article when no header matches is not implemented

# the question goes first, so its embedding is at index 0, followed by every
# sentence of the matched passages
sentences = [question]
for txt in passages:
  sentences.extend(sent_tokenize(txt))

# embed the question and all candidate sentences in one call
embeddings = model.encode(sentences)

compose music film


In [24]:
import heapq

# from all candidate sentences, find the ones closer to the query
msize = 8
savings = [] # keep the top sentences
for idx in range(1):
  print(sentences[idx], "\n")

  # min-heap of (similarity, sentence), capped at msize entries: once it
  # overflows, the least similar entry is popped again
  pq = []
  similarities = cosine_similarity([embeddings[idx]], embeddings)[0]
  # position 0 is the question itself, so candidates start at 1
  for i, score in enumerate(similarities[1:], start=1):
    heapq.heappush(pq, (score, sentences[i]))
    if len(pq) > msize:
      heapq.heappop(pq)

  # keep a snapshot (in heap order) before the heap is drained for printing
  savings.append(list(pq))
  while pq:
    print(heapq.heappop(pq))

Who composed the music for the film? 

(0.34589365, 'In Vertigo, that composition similarly accompanies an extended scene without dialogue.')
(0.362118, "The soundtrack took place during six days in April 2011 at Flagey's Studio 4 in Brussels.")
(0.3753062, 'Only one song (sung, with lyrics) was written in 1936 although The film is set between 1927 and 1932.')
(0.39511776, 'Only one song (sung, with lyrics) is used in The soundtrack, "Pennies from Heaven", sung by Rose "Chi-Chi" Murphy (uncredited).')
(0.46096987, 'The soundtrack was recorded in Belgium by the Brussels Philharmonic and was conducted by Ernst Van Tiel; the Brussels Jazz Orchestra also cooperated.')
(0.4777918, 'The soundtrack was released on 21 October 2011 through Sony Classical Records.')
(0.60142684, 'The film\'s climactic scene is set to Bernard Herrmann\'s "Scène d\'amour" from Bernard Herrmann\'s "Scène score to Alfred Hitchcock\'s film Vertigo.')
(0.6697978, 'The film\'s music was largely composed by Ludovic Bour

In [25]:
from transformers import AutoTokenizer, BertForQuestionAnswering, pipeline

modelname = 'deepset/bert-base-cased-squad2'

# tokenizer and model are loaded from the same checkpoint name and wrapped
# in a question-answering pipeline
tokenizer = AutoTokenizer.from_pretrained(modelname)
model_b = BertForQuestionAnswering.from_pretrained(modelname)
nlp_b = pipeline('question-answering', model=model_b, tokenizer=tokenizer)

In [26]:
# candidate sentences saved for the question (index 0 because only 1 question
# is raised); every saved entry is used, so no separate size limit is repeated
# here that could drift from the one applied when the entries were collected
candidates = [sentence for _, sentence in savings[0]]

# run the QA model on each candidate and keep the highest-scoring answer;
# best_ans stays "" when there are no candidates or no score above 0
highest_score = 0
best_ans = ""
for txt in candidates:
  prediction = nlp_b({
      "question": question,
      "context": txt
  })
  print(prediction)
  if prediction['score'] > highest_score:
    highest_score = prediction['score']
    best_ans = prediction['answer']

print("Final Answer:")
print(highest_score, best_ans)

{'score': 2.8897177983822075e-08, 'start': 3, 'end': 10, 'answer': 'Vertigo'}
{'score': 0.007454487029463053, 'start': 59, 'end': 67, 'answer': "Flagey's"}
{'score': 0.00043741127592511475, 'start': 49, 'end': 53, 'answer': '1936'}
{'score': 0.002851171186193824, 'start': 55, 'end': 78, 'answer': 'Sony Classical Records.'}
{'score': 0.47813281416893005, 'start': 37, 'end': 53, 'answer': 'Bernard Herrmann'}
{'score': 0.9547514915466309, 'start': 89, 'end': 103, 'answer': 'Ernst Van Tiel'}
{'score': 0.0008846460259519517, 'start': 107, 'end': 113, 'answer': 'Murphy'}
{'score': 0.9939894676208496, 'start': 41, 'end': 55, 'answer': 'Ludovic Bource'}
Final Answer:
0.9939894676208496 Ludovic Bource
