# BERT Model

In [1]:
# Imports
import torch
import nltk
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer

In [2]:
# Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
# Query and Context
from queries import get_text_cli
from get_documents import search

term = get_text_cli('Enter a search term')
context = search(term)
context

Enter a search term: Lionel Messi


('Q615',
 'Lionel Messi',

In [4]:
question = get_text_cli("Enter your question")

Enter your question: When were Messi and his father both found guilty of tax fraud?


In [5]:
# Encoding
# encoding = tokenizer.encode_plus(text=question, text_pair=context[2])
# inputs = encoding['input_ids']
# sentence_embedding = encoding['token_type_ids']

# inputs = tokenizer(
#     question,
#     context[2],
#     max_length=100,
#     truncation="only_second",
#     stride=50,
#     return_overflowing_tokens=True,
#     return_offsets_mapping=True
# )

# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [6]:
def segment_text(text):
    tokens = nltk.word_tokenize(text)
    segments = []
    while tokens:
        segments.append(' '.join(tokens[:512]))
        del tokens[:512]
    
    return segments

In [7]:
def run_model(query, text):
    # Initialising tokeniser
    inputs = tokenizer(
        query,
        text,
        max_length=100,
        truncation="only_second",
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Running model
    output = model(
        input_ids=torch.tensor([inputs['input_ids'][0]]), 
        token_type_ids=torch.tensor(inputs['token_type_ids'][0])
    )
    
    # Putting answer together
    start_i = torch.argmax(output['start_logits'])
    end_i = torch.argmax(output['end_logits'])
    (start_i, end_i)
    
    answer = ' '.join(tokens[start_i:end_i+1])
    corrected_answer = ''
    for word in answer.split():
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    
    return corrected_answer

In [8]:
segs = segment_text(context[2])
answer_body = ''
for seg in segs:
    answer_body += '| ' + run_model(question, seg)

print(answer_body)

|  24 june 1987|  2005|  four|  barcelona|  16 november 2003|  2005|  17 may|  2007 – 08| |  [SEP]| | | | | |  2015|  14 august|  2016 – 17 season with 54 goals , while his 37 goals in la liga saw him claim both the pichichi and european golden boot awards for the fourth time in his career . messi opened the 2017 – 18|  7 april|  13 january 2019|  [SEP]| |  4 september| |  1 july|  2 january 2022| |  17 august 2005|  2007|  [SEP]|  [SEP]|  [SEP]|  [SEP]|  [SEP]|  2018| |  2 july|  1993|  december , messi made his record 26th world cup finals appearance at lusail stadium . messi scored argentina ' s opening goal with a penalty , becoming in the process the first player since the last - 16 round was introduced in 1986|  [SEP]|  [SEP]|  2021|  2019| |  [SEP]|  2012| |  november 2016


# Workshop

In [16]:
len(context[2])

127661

In [17]:
nltk.sent_tokenize(context[2])

['Argentine professional footballer\nEponyms and public art\nFilms\nFamily\nLionel Andrés Messi[note 1] (Spanish pronunciation:\xa0[ljoˈnel anˈdɾes ˈmesi] (listen); born 24 June 1987), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for  Ligue 1 club Paris Saint-Germain and captains the Argentina national team.',
 "Widely regarded as one of the greatest players of all time, Messi has won a record seven Ballon d'Or awards,[note 2] a record six European Golden Shoes, and in 2020 was named to the Ballon d'Or Dream Team.",
 'Until leaving the club in 2021, he had spent his entire professional career with Barcelona, where he won a club-record 35 trophies, including 10 La Liga titles, seven Copa del Rey titles and four UEFA Champions Leagues.',
 'With his country, he won the 2021 Copa América and the 2022 FIFA World Cup.',
 'A prolific goalscorer and creative playmaker, Messi holds the records for most goals in La Liga (474), most hat-tricks in La Liga

In [22]:
from tf_idf import tokenize
sents = {sent: tokenize(sent) for sent in nltk.sent_tokenize(context[2])}
sents

{'Argentine professional footballer\nEponyms and public art\nFilms\nFamily\nLionel Andrés Messi[note 1] (Spanish pronunciation:\xa0[ljoˈnel anˈdɾes ˈmesi] (listen); born 24 June 1987), also known as Leo Messi, is an Argentine professional footballer who plays as a forward for  Ligue 1 club Paris Saint-Germain and captains the Argentina national team.': ['argentine',
  'professional',
  'footballer',
  'eponyms',
  'public',
  'art',
  'films',
  'family',
  'lionel',
  'andrés',
  'messi',
  'note',
  '1',
  'spanish',
  'pronunciation',
  'ljoˈnel',
  'anˈdɾes',
  'ˈmesi',
  'listen',
  'born',
  '24',
  'june',
  '1987',
  'also',
  'known',
  'leo',
  'messi',
  'argentine',
  'professional',
  'footballer',
  'plays',
  'forward',
  'ligue',
  '1',
  'club',
  'paris',
  'saint-germain',
  'captains',
  'argentina',
  'national',
  'team'],
 "Widely regarded as one of the greatest players of all time, Messi has won a record seven Ballon d'Or awards,[note 2] a record six European Go

In [38]:
def sent_rank(query_set, sentences, n):
    sent_scores = { sent: 0 for sent in sentences}
    
    for sent in sentences:
        common_words = query.intersection(set(sentences[sent]))
        sent_scores[sent] += len(common_words)
    
    ranked_scores = sorted(
        sent_scores.items(),
        key = lambda x: x[1],
        reverse=True
    )
    
    return ranked_scores

In [39]:
sent_rank(set(tokenize(question)), sents, 0)

{'y', 'l', 'r', '?', 'o', 't', 'x', 'a', 'g', 'f', 'b', 's', 'm', 'w', 'h', 'i', 'u', 'e', 'd', ' ', 'n'}


[('After being named player of the tournament in four international pre-season competitions with the Juveniles B, he played only one official match with the team before being promoted to the Juveniles A, where he scored 18 goals in 11 league games.',
  1),
 ("His performance, creating two chances and a shot on goal, impressed the technical staff, and he subsequently began training daily with the club's reserve side, Barcelona B, as well as weekly with the first team.",
  1),
 ('A month later, on 6 March, he made his debut for Barcelona B in the Segunda División B, and his buyout clause automatically increased to €80\xa0million.',
  1),
 ('He played five games with the B team that season but did not score.', 1),
 ('Towards the end of the season, he returned to both youth teams, helping the Juveniles B win the league.',
  1),
 ('During the 2004–05 season, Messi was a guaranteed starter for the B team, playing 17 games throughout the campaign and scoring on six occasions.',
  1),
 ('Argen

In [30]:
set(tokenize(question))

{'father', 'found', 'fraud', 'guilty', 'messi', 'tax'}

In [None]:
model_input = ''