# BERT Model

In [1]:
# Imports
import torch
import nltk
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer

In [2]:
# Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
# Query and Context
from queries import get_text_cli
from get_documents import search

In [4]:
# Encoding
# encoding = tokenizer.encode_plus(text=question, text_pair=context[2])
# inputs = encoding['input_ids']
# sentence_embedding = encoding['token_type_ids']

# inputs = tokenizer(
#     question,
#     context[2],
#     max_length=100,
#     truncation="only_second",
#     stride=50,
#     return_overflowing_tokens=True,
#     return_offsets_mapping=True
# )

# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [5]:
# Utility Functions
def query_and_context():
    term = get_text_cli('Enter a search term')
    context = search(term)
    query = get_text_cli("Enter your question")
    return {
        'query': query, 
        'context_id': context[0], 
        'context_title': context[1], 
        'context': context[2]
    }

def segment_text(text):
    tokens = nltk.word_tokenize(text)
    segments = []
    while tokens:
        segments.append(' '.join(tokens[:512]))
        del tokens[:512]
    
    return segments

In [6]:
# Model Inference
def run_model(query, text):
    # Initialising tokeniser
    inputs = tokenizer(
        query,
        text,
        max_length=100,
        truncation="only_second",
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Running model
    output = model(
        input_ids=torch.tensor([inputs['input_ids'][0]]), 
        token_type_ids=torch.tensor(inputs['token_type_ids'][0])
    )
    
    # Putting answer together
    start_i = torch.argmax(output['start_logits'])
    end_i = torch.argmax(output['end_logits'])
    (start_i, end_i)
    
    answer = ' '.join(tokens[start_i:end_i+1])
    corrected_answer = ''
    for word in answer.split():
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    
    return corrected_answer

In [7]:
# segs = segment_text(context[2])
# answer_body = ''
# for seg in segs:
#     answer_body += '| ' + run_model(question, seg)

# print(answer_body)

# Workshop

In [8]:
from tf_idf import tokenize
word_dict = query_and_context()
word_dict

Enter a search term: Lionel Messi
Enter your question: Is Lionel Messi married?


{'query': 'is lionel messi married?',
 'context_id': 'Q615',
 'context_title': 'Lionel Messi',

In [9]:
def sent_rank(query, context, n):
    query_set = set(tokenize(query))
    sentences = {sent: tokenize(sent) for sent in nltk.sent_tokenize(context)}
    sent_scores = { sent: 0 for sent in sentences}
    
    for sent in sentences:
        common_words = query_set.intersection(set(sentences[sent]))
        sent_scores[sent] += len(common_words)
    
    ranked_scores = sorted(
        sent_scores.items(),
        key = lambda x: x[1],
    )
    
    return ranked_scores

def build_input_text(ranked_sents):
    input_text = ''
    
    while True:
        new_sent = ranked_sents.pop()[0]
        if len(nltk.word_tokenize(input_text + ' ' + new_sent)) <= 512:
            input_text += ' ' + new_sent
        else:
            break
    
    return input_text

In [10]:
ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
ranked_sents

[('Until leaving the club in 2021, he had spent his entire professional career with Barcelona, where he won a club-record 35 trophies, including 10 La Liga titles, seven Copa del Rey titles and four UEFA Champions Leagues.',
  0),
 ('With his country, he won the 2021 Copa América and the 2022 FIFA World Cup.',
  0),
 ('He has also the most international goals by a South American male (98).',
  0),
 ("During the 2011–12 season, he set the La Liga and European records for most goals scored in a single season, while establishing himself as Barcelona's all-time top scorer.",
  0),
 ('Out of contract, he signed for Paris Saint-Germain in August 2021.', 0),
 ('At youth level, he won the 2005 FIFA World Youth Championship, finishing the tournament with both the Golden Ball and Golden Shoe, and an Olympic gold medal at the 2008 Summer Olympics.',
  0),
 ("As the squad's captain from August 2011, he led Argentina to three consecutive finals: the 2014 FIFA World Cup, for which he won the Golden 

In [11]:
input_text = build_input_text(ranked_sents)
input_text

' On 21 May, Messi was included in Lionel Scaloni\'s final 23-man Argentina squad for the 2019 Copa América. President of Argentina Mauricio Macri urged Messi not to quit, stating, "We are lucky, it is one of life\'s pleasures, it is a gift from God to have the best player in the world in a footballing country like ours... Lionel Messi is the greatest thing we have in Argentina and we must take care of him." On 4 September, Jorge Messi, Lionel\'s father and agent, released a statement in response to La Liga claiming the release clause "is not valid when the termination of the contract is by the player\'s unilateral decision from the end of the 2019–20 season", as stated in Messi\'s contract with Barcelona; moments later, La Liga issued a response reiterating their statement published on 30 August. Argentine professional footballer\nEponyms and public art\nFilms\nFamily\nLionel Andrés Messi[note 1] (Spanish pronunciation:\xa0[ljoˈnel anˈdɾes ˈmesi] (listen); born 24 June 1987), also kno

In [12]:
run_model(word_dict['query'], input_text)

''

In [13]:
def info_extraction_procedure():
    word_dict = query_and_context()
    ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
    print(ranked_sents)
    input_text = build_input_text(ranked_sents)
    model_output = run_model(word_dict['query'], input_text)
    return word_dict['query'], model_output

In [14]:
question, answer = info_extraction_procedure()
print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

Enter a search term: Tom Cruise
Enter your question: Where did Tom Cruise grow up?
[("One of the world's highest-paid actors, he has received various accolades, including an Honorary Palme d'Or and three Golden Globe Awards, in addition to nominations for four Academy Awards.", 0), ('His films have grossed over $4 billion in North America and over $11.1 billion worldwide, making him one of the highest-grossing box-office stars of all time.', 0), ('Critical acclaim came with his roles in the dramas The Color of Money (1986), Rain Man (1988), and Born on the Fourth of July (1989).', 0), ('For his portrayal of Ron Kovic in the latter, he won a Golden Globe Award and received a nomination for the Academy Award for Best Actor.', 0), ('As a leading Hollywood star in the 1990s, he starred in several commercially successful films, including the drama A Few Good Men (1992), the thriller The Firm (1993), the horror film Interview with the Vampire (1994), and the romance Jerry Maguire (1996).', 0

Question: "where did tom cruise grow up?"
Answer: " 1980s"
