# BERT Model

In [42]:
# Imports
import torch
import nltk
from transformers import BertForQuestionAnswering, AutoModelForQuestionAnswering, BertTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

In [2]:
# Model
# model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
# tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
# Query and Context
from queries import get_text_cli
from get_documents import search

In [4]:
# Utility Functions
def query_and_context():
    term = get_text_cli('Enter a search term')
    context = search(term)
    query = get_text_cli("Enter your question")
    return {
        'query': query, 
        'context_id': context[0], 
        'context_title': context[1], 
        'context': context[2]
    }

def segment_text(text):
    tokens = nltk.word_tokenize(text)
    segments = []
    while tokens:
        segments.append(' '.join(tokens[:512]))
        del tokens[:512]
    
    return segments

In [5]:
# Model Inference
def run_model(query, text):
    # Initialising model
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    
    # Initialising tokeniser
    tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    
    inputs = tokenizer(
        query,
        text,
        max_length=100,
        truncation="only_second",
        stride=50,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Running model
    output = model(
        input_ids=torch.tensor([inputs['input_ids'][0]]), 
        token_type_ids=torch.tensor(inputs['token_type_ids'][0])
    )
    
    # Putting answer together
    start_i = torch.argmax(output['start_logits'])
    end_i = torch.argmax(output['end_logits'])
    (start_i, end_i)
    
    answer = ' '.join(tokens[start_i:end_i+1])
    corrected_answer = ''
    for word in answer.split():
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word
    
    return corrected_answer

## Workshop

In [6]:
from tf_idf import tokenize
# word_dict = query_and_context()
# word_dict

In [7]:
def sent_rank(query, context, n=0):
#     query_set = set(tokenize(query))
#     sentences = {sent: tokenize(sent) for sent in nltk.sent_tokenize(context)}
#     sent_scores = { sent: 0 for sent in sentences}
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    sent_scores = {
        sent: text_similarity(query, sent, model)
        for sent in nltk.sent_tokenize(context)
    }
#     for sent in sentences:
#         common_words = query_set.intersection(set(sentences[sent]))
#         sent_scores[sent] += len(common_words)
    
    ranked_scores = sorted(
        sent_scores.items(),
        key = lambda x: x[1],
    )
    
    return ranked_scores

def build_input_text(ranked_sents, max_length=512):
    input_text = ''
    
    while True:
        new_sent = ranked_sents.pop()[0]
        if len(nltk.word_tokenize(f'{input_text} {new_sent}')) <= max_length:
            input_text += f' {new_sent}'
        else:
            break
    
    return input_text

def text_similarity(text_1, text_2, model):
    embedding_1= model.encode(text_1, convert_to_tensor=True)
    embedding_2 = model.encode(text_2, convert_to_tensor=True)
    
    return float(util.pytorch_cos_sim(embedding_1, embedding_2))

In [8]:
def info_extraction_procedure():
    word_dict = query_and_context()
    ranked_sents = sent_rank(word_dict['query'], word_dict['context'], 0)
    print(ranked_sents)
    input_text = build_input_text(ranked_sents)
    print(input_text)
    model_output = run_model(word_dict['query'], input_text)
    return word_dict['query'], model_output

In [9]:
# question, answer = info_extraction_procedure()
# print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

In [10]:
# question, answer = info_extraction_procedure()
# print(f'Question: "{question}"', f'Answer: "{answer}"', sep="\n")

# OpenAI

In [11]:
import requests as req
import openai

In [12]:
# text_info = query_and_context()

In [14]:
# ranked_sents = sent_rank(text_info['query'], text_info['context'])
# input_text = build_input_text(ranked_sents, 3500)

In [15]:
# base_url = "https://api.openai.com/v1/completions"
# openai.api_key = "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"

# res = openai.Completion.create(
#     model="text-davinci-003", 
#     prompt=f"Context: {input_text} Query: {text_info['query']}\n\nUsing the context, answer the query.", 
#     temperature=0,
# )
# res = req.get(
#     base_url, 
#     headers={
#         'Authorization': f'Bearer {api_key}',
# #         'Content-Type': 'application/json'
#     }, 
#     data={
#         "model": "text-davinci-003", 
#         "prompt": "Say this is a test", 
#         "temperature": 0, 
#         "max_tokens": 7,
#     }
# )

In [16]:
# res.choices[0].text

# DocSearcher Class

In [22]:
# Third-Party Imports
import nltk
import torch

from transformers import BertForQuestionAnswering, BertTokenizer, AutoTokenizer
from sentence_transformers import SentenceTransformer, util

from dotenv import load_dotenv
load_dotenv()

# Standard Library Imports
import os
from string import punctuation
from math import log1p, inf

In [35]:
class DocSearcher():
    def __init__(self):
        self._corpus = dict()
        self._file_matches = 1
        self._sentence_matches = 1
    
    def view_corpus(self):
        return self._corpus

    def load_files(self, dirname):
        main_path = os.path.join(os.path.dirname('__file__'), dirname)

        for file in os.listdir(main_path):
            with open(os.path.join(main_path, file), 'r') as f:
                self._corpus[file] = f.read()
    
    def search(self, query, s_method='tf-idf', e_method='tf-idf', fnames=None):
        if not fnames: fnames = self._corpus.keys()

        if s_method == 'tf-idf':
            joint_context, ranked_sents = self._context_and_sents_idf(query, fnames)
        elif s_method == 'cosine_sim':
            joint_context, ranked_sents = self._context_and_sents_cosine(query, fnames)
        
#         print(joint_context, ranked_sents, sep="\n\n")
        
#         print(ranked_sents[:self._sentence_matches])
        
        if e_method == 'conjoin':
            output_text = self._build_output_text(ranked_sents, inf)
            answer = ' '.join(nltk.sent_tokenize(output_text)[:self._sentence_matches])
        elif e_method == 'bert':
            output_text = self._build_output_text(ranked_sents, 512)
            answer = self._run_model_bert(query, output_text)
        elif e_method == 'openai':
            output_text = self._build_output_text(ranked_sents, 2500)
            answer = self._run_model_openai(query, output_text)
        
        print('\n\nAnd the output is...\n\n', output_text)
        return answer
    
    def _build_output_text(self, ranked_sents, max_length=512):
        output_text = ''

        for sent in ranked_sents:
            new_sent = sent[0]
            if len(nltk.word_tokenize(f'{output_text} {new_sent}')) <= max_length:
                output_text += f' {new_sent}'
            else:
                break

        return output_text
    
    def _run_model_bert(self, query, text):
        # Initialising model
        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        # Initialising tokeniser
        tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        inputs = tokenizer(
            query,
            text,
            max_length=100,
            truncation="only_second",
            stride=50,
            return_overflowing_tokens=True,
            return_offsets_mapping=True
        )

        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

        # Running model
        output = model(
            input_ids=torch.tensor([inputs['input_ids'][0]]), 
            token_type_ids=torch.tensor(inputs['token_type_ids'][0])
        )

        # Putting answer together
        start_i = torch.argmax(output['start_logits'])
        end_i = torch.argmax(output['end_logits'])
        (start_i, end_i)

        answer = ' '.join(tokens[start_i:end_i+1])
        corrected_answer = ''
        for word in answer.split():
            #If it's a subword token
            if word[0:2] == '##':
                corrected_answer += word[2:]
            else:
                corrected_answer += ' ' + word

        return corrected_answer
    
    def _run_model_openai(self, query, text):
        openai.api_key = os.getenv("OPENAI_API_KEY")

        res = openai.Completion.create(
            model="text-davinci-003", 
            prompt=f"Context: {query} Query: {text}\n\nUsing only the context given, answer the query.", 
            temperature=0,
            max_tokens=500,
        )
        
        return res.choices[0].text

    def _context_and_sents_idf(self, query, fnames):
        idfs = self._compute_idfs(fnames)
        top_files = self._top_files_idf(query, idfs)

        joint_context = "\n".join(self._corpus[name] for name in top_files)

        ranked_sents = self._sent_rank_idf(query, joint_context, idfs)

        return joint_context, ranked_sents
    
    def _context_and_sents_cosine(self, query, fnames):
        top_files = self._top_files_cosine(query, fnames)
        joint_context = "\n".join(self._corpus[name] for name in top_files)

        ranked_sents = self._sent_rank_cosine(query, joint_context)

        return joint_context, ranked_sents

    def _cosine_similarity(self, text_1, text_2, model):
        embedding_1= model.encode(text_1, convert_to_tensor=True)
        embedding_2 = model.encode(text_2, convert_to_tensor=True)
    
        return float(util.pytorch_cos_sim(embedding_1, embedding_2))
    
    def _compute_idfs(self, fnames):
        file_idfs = dict()
        unique_words = set()
        num_docs = len(fnames)

        for name in fnames:
            for sent in nltk.sent_tokenize(self._corpus[name]):
                unique_words = set().union(unique_words, set(self._word_tokenize(sent)))
                
        for word in unique_words:
            num_apps = sum(1 for name in fnames if word in self._corpus[name])
            if num_apps > 0:
                file_idfs[word] = log1p(num_docs / num_apps)
        
        return file_idfs

    def _top_files_idf(self, query, idfs):
        tf_idfs = { fname: 0 for fname in self._corpus }

        query = self._word_tokenize(query)

        for w in query:
            for fname in self._corpus:
                tf_idfs[fname] += self._corpus[fname].count(w) * idfs.get(w, 0)
        
        ranked_files = sorted(
            tf_idfs.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return [file[0] for file in ranked_files][:self._file_matches]
    
    def _top_files_cosine(self, query, fnames):
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        ranked_files = sorted([
            (name, self._cosine_similarity(query, self._corpus[name], model))
            for name in fnames
        ], key=lambda x: x[1], reverse=True)

        return [file[0] for file in ranked_files][:self._file_matches]
    
    def _word_tokenize(self, words):
        banned = list(punctuation) + nltk.corpus.stopwords.words("english")

        return [
            w.lower() for w in nltk.word_tokenize(words)
            if w.lower() not in banned
        ]
    
    def _sent_rank_idf(self, query, context, idfs):
        query_set = set(self._word_tokenize(query))
        sent_scores = { sent: [0,0] for sent in nltk.sent_tokenize(context)}

        for sent in sent_scores:
            sent_set = set(self._word_tokenize(sent))
            common_words = query_set.intersection(sent_set)
            sent_scores[sent][0] += sum(idfs.get(w, 0) for w in common_words)
            sent_scores[sent][1] += len(common_words)
        
        ranked_sents = sorted(
            sent_scores.items(),
            key=lambda x: (x[1][0], x[1][1]),
            reverse=True
        )

        return [(sent, score[0]) for sent, score in ranked_sents]

    def _sent_rank_cosine(self, query, context):
        model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        sent_scores = {
            sent: self._cosine_similarity(query, sent, model)
            for sent in nltk.sent_tokenize(context)
        }
    
        ranked_sents = sorted(
            sent_scores.items(),
            key = lambda x: x[1],
            reverse=True
        )
    
        return ranked_sents

In [36]:
crawler = DocSearcher()
crawler.load_files('corpus')

In [37]:
crawler.search('What is the biggest animal?', s_method='cosine_sim', e_method='conjoin')



And the output is...

  It is the largest cat native to the Americas and the third largest in the world, exceeded in size only by the tiger and the lion. With a body length of up to 1.85 m (6 ft 1 in) and a weight of up to 158 kg (348 lb), it is the largest cat species in the Americas and the third largest in the world. It has powerful jaws with the third-highest bite force of all felids, after the tiger and the lion. An evaluation of JCUs from Mexico to Argentina revealed that they overlap with high-quality habitats of about 1,500 mammals to varying degrees. In South America, the jaguar is larger than the cougar and tends to take larger prey, usually over 22 kg (49 lb). The cougar's prey usually weighs between 2 and 22 kg (4 and 49 lb), which is thought to be the reason for its smaller size. An analysis of 53 studies documenting the diet of the jaguar revealed that its prey ranges in weight from 1 to 130 kg (2.2 to 286.6 lb); it prefers prey weighing 45–85 kg (99–187 lb), with capyb

' It is the largest cat native to the Americas and the third largest in the world, exceeded in size only by the tiger and the lion.'

In [34]:
' '.join(nltk.sent_tokenize('Hello world. This is my story.')[:1])

'Hello world.'

# BERT Playground

In [38]:
def bert_two_point_O():
    # Load model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    
    long_text = query_and_context()['context']
    text_len = len(long_text)
    
    # Chunking
    max_length = 512
    
    input_ids = []
    attention_masks = []
    
    for i in range(0, text_len, max_length):
        chunk = long_text[i:i+max_length]
        encoded_dict = tokenizer.encode_plus(
            chunk,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    # Stack the chunks of input IDs and attention masks
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    # Put the model in evaluation mode
    model.eval()

    # Predict the output
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
        start_logits, end_logits = outputs[:2]
    
    return start_logits, end_logits

In [39]:
bert_two_point_O()

Enter a search term: Tom Cruise
Enter your question: Where did Tom Cruise grow up?


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(tensor([[ -5.5947,  -1.5933,  -3.9772,  ..., -10.4726,  -5.2151,  -5.5965],
         [ -1.2706,  -3.5167,  -5.7223,  ..., -10.2249,  -5.4948,  -1.2709],
         [ -3.8793,  -6.5119,  -7.1051,  ..., -10.4425,  -4.6956,  -3.8807],
         ...,
         [ -5.1307,  -5.4244,  -5.3474,  ..., -10.6289,  -5.3788,  -5.1339],
         [ -3.4033,  -4.9488,  -3.3663,  ..., -10.3960,  -4.9494,  -3.4045],
         [ -5.9335,  -4.4929,  -3.1796,  ..., -10.0086,  -3.7533,  -3.9135]]),
 tensor([[-2.7626, -3.7929, -4.0177,  ..., -9.4337, -2.6506, -2.7634],
         [-0.4309, -2.9114, -6.1873,  ..., -9.5618, -4.1262, -0.4308],
         [-2.3362, -4.5627, -3.4652,  ..., -9.6096, -3.9826, -2.3367],
         ...,
         [-2.9828, -6.8091, -6.2254,  ..., -9.5989, -3.5705, -2.9866],
         [-2.2831, -6.0053, -6.4287,  ..., -9.8670, -3.5833, -2.2835],
         [-2.3178, -5.3763, -3.7816,  ..., -9.5795, -3.0757, -3.1935]]))

In [62]:
def bert_three_point_O():
    # Load the pre-trained model
    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

    # Define a text passage and question to be answered
    q_and_c = query_and_context()
    question = q_and_c['query']
    text = q_and_c['context']

    # Split the text into smaller segments
    segment_length = 512 - 2 - len(tokenizer.tokenize(question))
    segments = [text[i:i + segment_length] for i in range(0, len(text), segment_length)]

    # Tokenize the question
    question_tokens = tokenizer.tokenize(question)

    # Initialize start and end scores for each segment
    start_scores = []
    end_scores = []

    # Use BERT to get the start and end scores for each segment
    for i, segment in enumerate(segments):
        input_ids = tokenizer.encode(question, segment)
        input_ids = torch.tensor([input_ids]).long()
        with torch.no_grad():
            outputs = model(input_ids)
            start_scores.append(outputs[0][0].squeeze()[:segment_length + 2])
            end_scores.append(outputs[1][0].squeeze()[:segment_length + 2])
    
    # Pad the start and end scores so they have the same shape
    max_len = max([s.shape[0] for s in start_scores])
    start_scores = [torch.cat([s, torch.zeros(max_len - s.shape[0])]) for s in start_scores]
    end_scores = [torch.cat([s, torch.zeros(max_len - s.shape[0])]) for s in end_scores]

    # Combine the start and end scores for all segments
    start_scores = torch.stack(start_scores).mean(dim=0)
    end_scores = torch.stack(end_scores).mean(dim=0)

    # Get the indices of the start and end of the answer
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the tokenized answer
    answer_tokens = input_ids[0][answer_start:answer_end + 1]

    # Convert the tokenized answer back to text
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer

In [63]:
bert_three_point_O()

Enter a search term: Tom Cruise
Enter your question: Where did Tom Cruise grow up?
[tensor([-5.7917, -5.3040, -7.2007, -7.2184, -8.5298, -7.5556, -8.0656, -9.2692,
        -5.7916, -3.6321, -5.7237, -6.2394, -8.2390, -8.6294, -6.7272, -5.7723,
        -3.9892, -2.1912, -4.9759, -6.7635, -2.1893, -6.4402, -5.8328, -5.4544,
        -3.2481, -3.6974, -6.9341, -5.1053, -5.7916, -5.2833, -7.8276, -6.5621,
        -4.6200, -8.5311, -8.1247, -7.3066, -8.3818, -7.6481, -5.4548, -7.7884,
        -5.3228, -7.4919, -7.6034, -6.8772, -6.9670, -8.1645, -6.9312, -6.2546,
        -6.0794, -5.7948, -8.4180, -7.7502, -8.8204, -7.8779, -8.1754, -5.0578,
        -4.0676, -6.9783, -7.2519, -8.0699, -7.4471, -8.3274, -9.0270, -6.4452,
        -8.2006, -5.9946, -5.9453, -7.7831, -7.4415, -3.9092, -3.9496, -6.7512,
        -5.2096, -4.3676, -4.1956, -5.9704, -6.7209, -3.5143,  0.7813, -5.1379,
        -7.8496, -5.9744, -5.9725, -7.4712, -8.5640, -8.2159, -7.5045, -4.0412,
        -7.9464, -6.8102, -7.8526, -

''

In [59]:
bert_three_point_O()

Enter a search term: Tom Cruise
Enter your question: Who is Tom Cruise?


''

In [64]:
bert_three_point_O()

Enter a search term: Nico Ditch
Enter your question: Where is Nico Ditch?


TypeError: 'NoneType' object is not subscriptable