# BERT Model

In [1]:
# Imports
import os

import torch
import nltk
from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

In [2]:
# Model
# model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
# Query and Context
from queries import get_text_cli
from get_documents import search

In [4]:
# Encoding
# encoding = tokenizer.encode_plus(text=question, text_pair=context[2])
# inputs = encoding['input_ids']
# sentence_embedding = encoding['token_type_ids']

# inputs = tokenizer(
#     question,
#     context[2],
#     max_length=100,
#     truncation="only_second",
#     stride=50,
#     return_overflowing_tokens=True,
#     return_offsets_mapping=True
# )

# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [5]:
# Utility Functions
def query_and_context():
    """Ask the user for a search term and a question and fetch a document.

    The search term is collected first and passed to ``search``; the question
    is only requested after the search has returned.

    Returns:
        dict: ``query`` (the question as returned by ``get_text_cli``) plus
        ``context_id``, ``context_title`` and ``context``, taken from items
        0, 1 and 2 of the ``search`` result.
        NOTE(review): the meaning of those three items is inferred from the
        key names used here -- confirm against ``get_documents.search``.
    """
    search_term = get_text_cli('Enter a search term')
    document = search(search_term)
    question = get_text_cli("Enter your question")

    doc_id, doc_title, doc_body = document[0], document[1], document[2]
    return dict(
        query=question,
        context_id=doc_id,
        context_title=doc_title,
        context=doc_body,
    )

def segment_text(text, segment_size=512):
    """Split ``text`` into consecutive chunks of at most ``segment_size`` words.

    The text is tokenised with ``nltk.word_tokenize`` and each chunk is the
    tokens re-joined with single spaces, so the original whitespace is not
    preserved.

    Args:
        text: The text to split.
        segment_size: Maximum number of NLTK word tokens per chunk. Defaults
            to 512, the value that used to be hard-coded.
            NOTE(review): this counts NLTK words, not BERT wordpieces, so a
            chunk may still exceed a model's token limit -- confirm the
            intended unit.

    Returns:
        list[str]: The chunks in document order; empty when ``text`` yields
        no tokens.

    Raises:
        ValueError: If ``segment_size`` is smaller than 1.
    """
    if segment_size < 1:
        raise ValueError('segment_size must be a positive integer')

    tokens = nltk.word_tokenize(text)
    # Stepped slicing avoids repeatedly deleting from the front of the list.
    return [
        ' '.join(tokens[start:start + segment_size])
        for start in range(0, len(tokens), segment_size)
    ]

In [6]:
# Model Inference
_QA_MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'
_QA_RESOURCES = {}


def _load_qa_resources(model_name=_QA_MODEL_NAME):
    """Return the (model, tokenizer) pair for ``model_name``, loading it once per kernel."""
    if model_name not in _QA_RESOURCES:
        _QA_RESOURCES[model_name] = (
            BertForQuestionAnswering.from_pretrained(model_name),
            AutoTokenizer.from_pretrained(model_name),
        )
    return _QA_RESOURCES[model_name]


def _join_wordpieces(pieces):
    """Glue wordpiece tokens into text; ``##`` continuations attach to the previous piece."""
    text = ''
    for piece in pieces:
        if piece.startswith('##'):
            text += piece[2:]
        else:
            text += ' ' + piece
    return text


def run_model(query, text, max_length=100, stride=50):
    """Extract an answer to ``query`` from ``text`` with the SQuAD-tuned BERT model.

    The (query, text) pair is tokenised into overlapping windows of at most
    ``max_length`` tokens. Every window is scored and the answer span comes
    from the window whose best start and end logits sum highest. Previously
    only the first window was examined, so any text beyond it was ignored.

    Args:
        query: The question.
        text: The context to search for the answer.
        max_length: Maximum tokens per window, question included.
        stride: Number of context tokens shared by consecutive windows.

    Returns:
        str: The answer rebuilt from wordpiece tokens. As before, a non-empty
        answer starts with a single space. An empty string is returned when
        no window yields a span whose end is at or after its start.
    """
    # Loading the checkpoint is expensive, so it is cached across calls.
    model, tokenizer = _load_qa_resources()

    inputs = tokenizer(
        query,
        text,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
    )

    best_score = None
    best_pieces = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for input_ids, token_type_ids in zip(inputs['input_ids'], inputs['token_type_ids']):
            output = model(
                # Both tensors carry an explicit batch dimension of 1.
                input_ids=torch.tensor([input_ids]),
                token_type_ids=torch.tensor([token_type_ids]),
            )
            start_logits = output['start_logits'][0]
            end_logits = output['end_logits'][0]

            start_i = int(torch.argmax(start_logits))
            end_i = int(torch.argmax(end_logits))
            if end_i < start_i:
                # Inverted span: this window offers no usable answer.
                continue

            score = float(start_logits[start_i] + end_logits[end_i])
            if best_score is None or score > best_score:
                best_score = score
                window_tokens = tokenizer.convert_ids_to_tokens(input_ids)
                best_pieces = window_tokens[start_i:end_i + 1]

    return _join_wordpieces(best_pieces)

In [7]:
# segs = segment_text(context[2])
# answer_body = ''
# for seg in segs:
#     answer_body += '| ' + run_model(question, seg)

# print(answer_body)

## Workshop

In [8]:
from tf_idf import tokenize
# word_dict = query_and_context()
# word_dict

In [9]:
def sent_rank(query, context, n=0, model=None):
    """Score every sentence of ``context`` by cosine similarity to ``query``.

    The query is embedded once and all sentences are embedded in a single
    batched ``encode`` call, instead of re-encoding the query for every
    sentence.

    Args:
        query: The question to compare against.
        context: Text that is split into sentences with ``nltk.sent_tokenize``.
        n: Unused; kept so existing callers that pass it keep working.
        model: Optional, already loaded sentence-embedding model exposing
            ``encode``. When omitted, ``all-MiniLM-L6-v2`` is loaded.

    Returns:
        list[tuple[str, float]]: ``(sentence, score)`` pairs sorted by
        ascending score, so the most similar sentence is last. Repeated
        sentences appear once. Empty when ``context`` has no sentences.
    """
    sentences = nltk.sent_tokenize(context)
    if not sentences:
        # Nothing to rank; skip loading the model.
        return []

    if model is None:
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    query_embedding = model.encode(query, convert_to_tensor=True)
    sentence_embeddings = model.encode(sentences, convert_to_tensor=True)

    # One query against N sentences gives a 1 x N matrix; take its only row.
    scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0].tolist()

    sent_scores = dict(zip(sentences, scores))
    return sorted(sent_scores.items(), key=lambda item: item[1])

def build_input_text(ranked_sents, max_output_length=512):
    """Concatenate the best-ranked sentences until a word budget is reached.

    Sentences are taken from the end of ``ranked_sents`` (highest score
    first) and appended while the accumulated text stays within
    ``max_output_length`` NLTK word tokens. Building stops at the first
    sentence that would exceed the budget, or when the sentences run out.
    The earlier version raised ``IndexError`` in the second case.

    Args:
        ranked_sents: Sequence of ``(sentence, score)`` entries sorted by
            ascending score. It is no longer modified.
        max_output_length: Maximum number of tokens, as counted by
            ``nltk.word_tokenize``, in the result.

    Returns:
        str: The selected sentences, each preceded by a single space, or an
        empty string if not even the top sentence fits.
    """
    input_text = ''

    for entry in reversed(ranked_sents):
        candidate = input_text + ' ' + entry[0]
        if len(nltk.word_tokenize(candidate)) > max_output_length:
            break
        input_text = candidate

    return input_text

def text_similarity(text_1, text_2, model):
    """Return the cosine similarity of two texts as a Python float.

    Both texts are embedded with ``model.encode`` (``text_1`` first) and the
    embeddings are compared with ``util.pytorch_cos_sim``.
    """
    first_vec, second_vec = (
        model.encode(passage, convert_to_tensor=True)
        for passage in (text_1, text_2)
    )
    similarity = util.pytorch_cos_sim(first_vec, second_vec)
    return float(similarity)

In [10]:
# ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
# ranked_sents

In [11]:
# input_text = build_input_text(ranked_sents)
# input_text

In [12]:
# run_model(word_dict['query'], input_text)

In [13]:
def info_extraction_procedure():
    """Run the whole question-answering pipeline once.

    Steps: collect the query and context, rank the context's sentences
    against the query, build the model input from that ranking, then run the
    QA model. The ranking and the assembled input are printed along the way.

    Returns:
        tuple: ``(query, answer)``, where ``answer`` is whatever
        ``run_model`` returns.
    """
    session = query_and_context()
    user_query = session['query']

    sentence_ranking = sent_rank(user_query, session['context'], 0)
    print(sentence_ranking)

    qa_context = build_input_text(sentence_ranking)
    print(qa_context)

    return user_query, run_model(user_query, qa_context)

In [14]:
# Run the full pipeline once and show the question next to the extracted answer.
# The pipeline asks for a search term and a question (see the recorded output).
question, answer = info_extraction_procedure()
print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

Enter a search term: Tom Cruise
Enter your question: Where did Tom grow up?
[('Lambs was released on November 9, 2007,[citation needed] opening to unimpressive box office revenue and critical reception.', -0.03133797645568848), ('YouTube has declined to remove it again, due to the popularity of the video, and subsequent changes to copyright policy of the website.', -0.02971884235739708), ('From The Lancet, "He may be right that psychotropic drugs are overused, sometimes misused; and that lifestyle changes (and exercise for depression) can be helpful.', 0.000715944916009903), ('After YouTube investigated this claim, they found that the video did not breach copyright law, as it is covered by the fair use clause.', 0.014659376814961433), ('The film was released to widespread critical praise, with many reviewers deeming it superior to its predecessor.', 0.018153276294469833), ('Production began in 2007 of Valkyrie, a thriller based on the July 20, 1944, assassination attempt against Adolf 

Question: "where did tom grow up?"
Answer: " near poverty"


In [15]:
# Sanity check: similarity of a sentence with itself should be 1.
# The recorded value is marginally above 1 -- presumably floating-point rounding.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
val = text_similarity('My name is Thanos', 'My name is Thanos', model)
val

1.0000001192092896

In [16]:
# text_similarity already wraps its result in float(), so this conversion
# leaves the value unchanged (the recorded output matches the previous cell).
float(val)

1.0000001192092896

In [17]:
# Second interactive run of the same pipeline with a different search term and
# question; rebinds the notebook-level `question` and `answer`.
question, answer = info_extraction_procedure()
print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

Enter a search term: Bill Gates
Enter your question: How old is Bill Gates
[('The criticism came due to the possibility of this preventing poorer nations from obtaining adequate vaccines.', -0.11219897866249084), ('Among others, it supports a wide range of public health projects, granting aid to fight transmissible diseases such AIDS, tuberculosis and malaria, as well as widespread vaccine programs to eradicate polio.', -0.0936368927359581), ('The bill should cut the global greenhouse gas emissions in a level similar to "eliminating the annual planet-warming pollution of France and Germany combined" and may help to limit the warming of the planet to 1.5 degrees - the target of the Paris Agreement.', -0.07491849362850189), ('Before we really began to understand disease and the weather and things like that, we sought false explanations for them.', -0.04471825808286667), ('I mean, you know, we ask you to wear pants, and no American says, or very few Americans say, that that\'s, like, some

Question: "how old is bill gates"
Answer: " 1955"


In [18]:
# Call the QA model directly on a hand-written context, bypassing document
# search and sentence ranking.
run_model('Who founded Stark Industries?', 'Tony Stark (son of Henry Stark, the creator of Stark Industries) is a billionaire, genius, playboy, philanthropist.')

' henry stark'

# OpenAI

In [19]:
import requests as req
import openai

In [20]:
# Collect a fresh query and context document for the OpenAI experiment.
# Asks for a search term and a question (see the recorded output).
text_info = query_and_context()

Enter a search term: Barack Obama
Enter your question: How wealthy is Barack Obama?


In [21]:
# Rank the context's sentences against the query, then build the prompt context.
# The 3500 budget is counted in NLTK word tokens, not API tokens: the recorded
# request below used 3831 prompt tokens.
ranked_sents = sent_rank(text_info['query'], text_info['context'])
input_text = build_input_text(ranked_sents, 3500)

In [22]:
base_url = "https://api.openai.com/v1/completions"
# Credentials must never be hard-coded in a notebook. Read the key from the
# environment; a missing OPENAI_API_KEY fails loudly with a KeyError.
# The key that used to be committed in this cell must be treated as leaked
# and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# max_tokens is left at the API default; the recorded response stopped with
# finish_reason "length" after 16 completion tokens, so the answer is cut short.
openai.Completion.create(
    model="text-davinci-003",
    prompt=f"Context: {input_text} Query: {text_info['query']}\n\nUsing the context, answer the query.",
    temperature=0,
#     max_tokens=7,
)
# res = req.get(
#     base_url, 
#     headers={
#         'Authorization': f'Bearer {api_key}',
# #         'Content-Type': 'application/json'
#     }, 
#     data={
#         "model": "text-davinci-003", 
#         "prompt": "Say this is a test", 
#         "temperature": 0, 
#         "max_tokens": 7,
#     }
# )

<OpenAIObject text_completion id=cmpl-6hZm5reNv4zq0VALz7nbeZbseSuc4 at 0x7fb4d21b1680> JSON: {
  "choices": [
    {
      "finish_reason": "length",
      "index": 0,
      "logprobs": null,
      "text": "\n\nBarack Obama is estimated to be worth as much as $10 million"
    }
  ],
  "created": 1675843641,
  "id": "cmpl-6hZm5reNv4zq0VALz7nbeZbseSuc4",
  "model": "text-davinci-003",
  "object": "text_completion",
  "usage": {
    "completion_tokens": 16,
    "prompt_tokens": 3831,
    "total_tokens": 3847
  }
}

In [24]:
# Similarity between the query and the whole context document treated as one
# text; rebinds the notebook-level `model` and `val`.
# NOTE(review): a long document may be truncated by the encoder, in which case
# the score would reflect only its beginning -- confirm.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
val = text_similarity(text_info['query'], text_info['context'], model)
val

0.5411624908447266