# BERT Model

In [1]:
# Imports
import torch
import nltk
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

In [2]:
# Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
# Query and Context
from queries import get_text_cli
from get_documents import search

In [4]:
# Encoding
# encoding = tokenizer.encode_plus(text=question, text_pair=context[2])
# inputs = encoding['input_ids']
# sentence_embedding = encoding['token_type_ids']

# inputs = tokenizer(
#     question,
#     context[2],
#     max_length=100,
#     truncation="only_second",
#     stride=50,
#     return_overflowing_tokens=True,
#     return_offsets_mapping=True
# )

# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [5]:
# Utility Functions
def query_and_context():
    term = get_text_cli('Enter a search term')
    context = search(term)
    query = get_text_cli("Enter your question")
    return {
        'query': query, 
        'context_id': context[0], 
        'context_title': context[1], 
        'context': context[2]
    }

def segment_text(text):
    tokens = nltk.word_tokenize(text)
    segments = []
    while tokens:
        segments.append(' '.join(tokens[:512]))
        del tokens[:512]
    
    return segments

In [6]:
# Model Inference
def run_model(query, text):
    # Initialising tokeniser
    inputs = tokenizer(
        query,
        text,
        max_length=100,
        truncation="only_second",
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Running model
    output = model(
        input_ids=torch.tensor([inputs['input_ids'][0]]), 
        token_type_ids=torch.tensor(inputs['token_type_ids'][0])
    )
    
    # Putting answer together
    start_i = torch.argmax(output['start_logits'])
    end_i = torch.argmax(output['end_logits'])
    (start_i, end_i)
    
    answer = ' '.join(tokens[start_i:end_i+1])
    corrected_answer = ''
    for word in answer.split():
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    
    return corrected_answer

In [7]:
# segs = segment_text(context[2])
# answer_body = ''
# for seg in segs:
#     answer_body += '| ' + run_model(question, seg)

# print(answer_body)

# Workshop

In [8]:
from tf_idf import tokenize
# word_dict = query_and_context()
# word_dict

In [27]:
def sent_rank(query, context, n):
#     query_set = set(tokenize(query))
#     sentences = {sent: tokenize(sent) for sent in nltk.sent_tokenize(context)}
#     sent_scores = { sent: 0 for sent in sentences}
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    sent_scores = {
        sent: text_similarity(query, sent, model)
        for sent in nltk.sent_tokenize(context)
    }
#     for sent in sentences:
#         common_words = query_set.intersection(set(sentences[sent]))
#         sent_scores[sent] += len(common_words)
    
    ranked_scores = sorted(
        sent_scores.items(),
        key = lambda x: x[1],
    )
    
    return ranked_scores

def build_input_text(ranked_sents):
    input_text = ''
    
    while True:
        new_sent = ranked_sents.pop()[0]
        if len(nltk.word_tokenize(input_text + ' ' + new_sent)) <= 512:
            input_text += ' ' + new_sent
        else:
            break
    
    return input_text

def text_similarity(text_1, text_2, model):
    embedding_1= model.encode(text_1, convert_to_tensor=True)
    embedding_2 = model.encode(text_2, convert_to_tensor=True)
    
    return float(util.pytorch_cos_sim(embedding_1, embedding_2))

In [28]:
# ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
# ranked_sents

In [29]:
# input_text = build_input_text(ranked_sents)
# input_text

In [30]:
# run_model(word_dict['query'], input_text)

In [33]:
def info_extraction_procedure():
    word_dict = query_and_context()
    ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
    print(ranked_sents)
    input_text = build_input_text(ranked_sents)
    print(input_text)
    model_output = run_model(word_dict['query'], input_text)
    return word_dict['query'], model_output

In [35]:
question, answer = info_extraction_procedure()
print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

Enter a search term: Tom Cruise
Enter your question: Where did Tom Cruise grow up?
[('YouTube has declined to remove it again, due to the popularity of the video, and subsequent changes to copyright policy of the website.', -0.014301898889243603), ('Lambs was released on November 9, 2007,[citation needed] opening to unimpressive box office revenue and critical reception.', -0.00684901000931859), ('New York law requires all divorce documents remain sealed, so the exact terms of the settlement are not publicly available.', 0.03953713923692703), ('From The Lancet, "He may be right that psychotropic drugs are overused, sometimes misused; and that lifestyle changes (and exercise for depression) can be helpful.', 0.045323532074689865), ('After YouTube investigated this claim, they found that the video did not breach copyright law, as it is covered by the fair use clause.', 0.049348801374435425), ('This drew criticism from medical professionals and firefighters.', 0.04937497898936272), ("That

TypeError: forward() got an unexpected keyword argument 'input_ids'

In [26]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
val = text_similarity('My name is Thanos', 'My name is Thanos', model)
val

tensor([[1.0000]])

In [21]:
float(val)

0.17249435186386108