In [None]:
!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import os
import json

import transformers
import torch

from typing import List
import warnings

In [15]:
def get_json_names(dirname = '/kaggle/input'):    
    jsons = []
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            if filename.endswith('json'):
                jsons.append(os.path.join(dirname, filename))
    return sorted(jsons)

def read_text_entries(json_file):
    with open(json_file, 'r') as f:
        js = json.loads(f.read())
        text_entries = []
        for entry in js['body_text']:
            text_entries.append(entry['text'])
        all_text = "\n".join(text_entries)
    return all_text 
    
def read_texts(json_files, max_texts=1000):
    if max_texts is None:
        max_texts = len(json_files)
    texts = []
    for i, file_ in enumerate(json_files):
        if i > max_texts:
            return texts
        else:
            text = read_text_entries(file_)
            texts.append(text)
            
BEGIN = "[CLS]"
SEP = "[SEP]"

def _get_top_inds(scores, top_k):
    best_inds = torch.flatten(
        torch.narrow(torch.argsort(scores, descending=True), 
                     dim=1, start=0, length=top_k))
    # print(best_inds)
    best_scores = torch.index_select(scores, 1, best_inds)
    
    return best_inds, best_scores

def _answer_ok(answer):
    return answer.strip() != BEGIN and SEP not in answer and answer != ''

def _get_token_type_ids(input_ids):
    ## Why 102?? unsolved..
    return [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
        
class Answerer:
    def __init__(self, model, tokenizer, texts: List[str], max_tokens = 512, max_answers = 3):
        self.model = model
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens
        self.texts = texts
        self.max_answers = 3
        
    def _encode_question(self, question: str) -> List[int]:
        """
        Encodes the question to integer, truncating and warning if the question
        was longer than self.max_tokens.
        """
        question = f"{BEGIN} {question}"        
        question_ids = self.tokenizer.encode(question)
        if len(question_ids) > self.max_tokens:
            warnings.warn(
                f"question longer than {self.max_tokens} tokens " +
                f"it's {len(question_ids)} tokens long); truncating.")
            question_ids = question_ids[:max_tokens]
        return question_ids
    
    def _encode_text(self, text: str, question_length: int) -> List[List[int]]:
        """
        Encodes the tokens in text to integers, using this objects tokenizer.
        :returns: the full text, encoded as integers, split into partitions of length 
        self.max_tokens - question_length.
        """
        text = f"{SEP} {text} {SEP}"
        text_ids = self.tokenizer.encode(text)
        i = 0
        step = self.max_tokens - question_length
        text_id_subsets = []
        while i < len(text_ids):
            text_id_subset = text_ids[i:(i + step)]
            text_id_subsets.append(text_id_subset)
            i += step
        return text_id_subsets

    def _find_answer(self, input_ids: List[str]) -> str:
        """
        Returns the model's prediction for what the answer is for the input ids.
        
        :param input_ids: a full question; text string, encoded to a List of integer.
        """
        token_type_ids = _get_token_type_ids(input_ids)
        input_ids_tensor = torch.tensor([input_ids])        
        token_type_ids_tensor = torch.tensor([token_type_ids])
        start_scores, end_scores = self.model(input_ids_tensor, token_type_ids=token_type_ids_tensor)
        all_tokens = self.tokenizer.convert_ids_to_tokens(input_ids)  
        ## We should use the scores to get some type of confidence in the answer,
        ## and use that to decide which answer we finally take.
        top_starts_ind, top_starts_score = _get_top_inds(start_scores, 1)
        top_ends_ind, top_ends_score = _get_top_inds(end_scores, 1)
        #print(f"{top_starts_ind}:{top_ends_ind}")
        #print(f"{top_starts_score}|{top_ends_score}")
        answer_str = ' '.join(all_tokens[torch.argmax(start_scores) : 
                              torch.argmax(end_scores) + 1])
        return answer_str, top_starts_score, top_ends_score
    
    def _query_one_text(self, question: str, text: str) -> str:
        """
        Runs the model on each section of text of max_tokens length, returning the
        first answer found.
        """
        question_ids = self._encode_question(question)
        text_id_subsets = self._encode_text(text, len(question))        
        best_starts_score, best_ends_score = 0, 0
        best_answer = None
        for text_ids in text_id_subsets:
            input_ids = question_ids + text_ids
            # print(self.tokenizer.decode(input_ids))
            # print(len(input_ids))
            answer, sub_best_starts_score, sub_best_ends_score = self._find_answer(input_ids)
            if sub_best_ends_score > best_ends_score:
                # print(f"Found better end score ({sub_best_ends_score}) for answer {answer}")
                best_ends_score = sub_best_ends_score
                best_starts_score = sub_best_starts_score
                best_answer = answer
            # print(answer)
            ## Take the first answer found, regardless of anything else. Improve this!
            #if _answer_ok(answer):
            #    break
        return best_answer, best_starts_score, best_ends_score
    
    def answer(self, question: str, max_texts = 1000):
        """
        Runs the model for the input question on all texts (up to max_texts), 
        returning the first answer found.
        """
        if max_texts is None:
            max_texts = len(self.texts)
        best_start_score, best_end_score = 0, 0
        best_answer = None
        for i, text in enumerate(self.texts):
            if i > max_texts:
                break
            answer, text_best_start_score, text_best_end_score = self._query_one_text(question, text)
            if text_best_end_score > best_end_score:
                print(f"Found better end score ({text_best_end_score.item()}) for answer '{answer}'")
                best_end_score = text_best_end_score
                best_start_score = text_best_start_score
                best_answer = answer
            
            print(answer)
            ## Again take the first answer found, regardless of anything else. Improve this too!               
            #if _answer_ok(answer):
            #    break
        return best_answer
    


In [3]:
json_files = get_json_names()

In [4]:
texts = read_texts(json_files)

In [5]:
def clean_text(text):
    for char in ('\n', '.'):
        text = text.replace(char, ' ')
    return text

I think the pretrained model I'm using is fixed at a maximum input sequence length (max_position_embeddings) of 512 tokens. So we will either have to find a pretrained model where this is larger; train our own version with this larger; or use some method where we walk along all the texts (or otherwise narrow them by search) and run on the < 512-length windows.  

We should remove citations - that [18] stuff  

And we really should domain transfer.

In [None]:
## config = transformers.BertConfig(max_position_embeddings=10000)

In [7]:
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased')
model = transformers.BertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

In [8]:
text = texts[0]
input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
input_ids = tokenizer.encode(input_text)[:512]

## why 102?
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 

input_ids_tensor = torch.tensor([input_ids])
token_type_ids_tensor = torch.tensor([token_type_ids])

start_scores, end_scores = model(input_ids_tensor, token_type_ids=token_type_ids_tensor)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  

In [9]:
t = torch.tensor([[46]])

In [None]:
top_k = 3
top_starts = _get_top_inds(start_scores, top_k)
top_ends = _get_top_inds(end_scores, top_k)
print(top_starts)
print(top_ends)

In [18]:
texts[0]

'VP3, and VP0 (which is further processed to VP2 and VP4 during virus assembly) (6). The P2 64 and P3 regions encode the non-structural proteins 2B and 2C and 3A, 3B (1-3) (VPg), 3C pro and 4 structural protein-coding region is replaced by reporter genes, allow the study of genome 68 replication without the requirement for high containment (9, 10) ( figure 1A ).\nThe FMDV 5′ UTR is the largest known picornavirus UTR, comprising approximately 1300 71 nucleotides and containing several highly structured regions. The first 360 nucleotides at the 5′ 72 end are predicted to fold into a single large stem loop termed the S-fragment, followed by a The PKs were originally predicted in 1987 and consist of two to four tandem repeats of a ~48 86 nucleotide region containing a small stem loop and downstream interaction site (figure 1B) 87 (12). Due to the sequence similarity between the PKs (figure 1C), it is speculated that they 88 were formed by duplication events during viral replication, probab

In [16]:
answerer = Answerer(model=model, tokenizer=tokenizer, texts=texts)
question = "How are mutations introduced to PK regions?"
question = "How bad is the flu?"
question = "What are PKs?"
# question = "What is a dog?"



In [17]:
answerer.answer(question, 1)

Found better end score (4.27646541595459) for answer 'c1 ##1'
c1 ##1
Found better end score (4.5037736892700195) for answer 'south korea'
south korea


'south korea'

In [None]:
## What if we cut _at_ an answer? We need overlapping windows!