In [50]:
import numpy as np
import torch
import os 

In [51]:
tokenizer = None 
    
import numpy as np
import torch
import os 

# Choose the Redis Cluster host from the DOCKER environment variable: when it is
# unset (or set to 'local') the cluster is expected on localhost, any other value
# selects the 'rgcluster' host. The three ports are the same in both cases.
config_switch = os.getenv('DOCKER', 'local')
cluster_host = "127.0.0.1" if config_switch == 'local' else "rgcluster"
startup_nodes = [{"host": cluster_host, "port": port} for port in ("30001", "30002", "30003")]

# Bind the name up front so later cells can test for availability instead of
# failing with a NameError when the client could not be created.
redisai_cluster_client = None
try:
    from redisai import ClusterClient
    redisai_cluster_client = ClusterClient(startup_nodes=startup_nodes)
except Exception as exc:
    # Best effort: the notebook can still run the local (non-Redis) cells.
    # Only ordinary errors are caught, so Ctrl-C still interrupts the cell.
    print(f"Redis Cluster is not available: {exc!r}")

def loadTokeniser():
    """Load the SQuAD-finetuned BERT fast tokenizer.

    The tokenizer is stored in the module-level ``tokenizer`` variable and
    also returned to the caller.
    """
    global tokenizer
    # Imported here so transformers is only loaded when a tokenizer is needed.
    from transformers import BertTokenizerFast as _FastTokenizer
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    tokenizer = _FastTokenizer.from_pretrained(checkpoint)
    return tokenizer


def qa_redisai(question, sentence_key, hash_tag, verbose=True):
    """Answer ``question`` with the BERT QA model hosted in RedisAI.

    The question is tokenized locally; the context token ids are read from the
    tensor stored under ``tokenized:bert:qa:{sentence_key}``. The combined
    inputs are written back as tensors, the model ``bert-qa{hash_tag}`` is run,
    and the best-scoring start/end positions are decoded into text.

    Args:
        question: Question text to encode.
        sentence_key: Key suffix identifying the pre-tokenized context tensor.
        hash_tag: String appended verbatim to every tensor/model key used here
            (presumably a Redis Cluster hash tag such as ``'{06S}'`` so that all
            keys live on the same shard - confirm against the deployment).
        verbose: When true (the default) print the intermediate shapes, the
            token ids and the answer, as the function has always done.

    Returns:
        The decoded answer string (empty when the predicted span is empty).

    Raises:
        RuntimeError: If no RedisAI cluster client is available.
    """
    global tokenizer

    # The client is created in a try/except at cell level and may be missing;
    # fail with a clear message rather than a NameError/AttributeError.
    client = globals().get("redisai_cluster_client")
    if client is None:
        raise RuntimeError("Redis Cluster is not available")

    if not tokenizer:
        tokenizer = loadTokeniser()

    token_key = f"tokenized:bert:qa:{sentence_key}"

    # Shape (1, n_question): the tokenizer returns a batch of one.
    input_ids_question = tokenizer.encode(question, add_special_tokens=True, truncation=True, return_tensors="np")
    input_ids_context = client.tensorget(token_key)

    # np.append flattens both operands into one 1-D id sequence.
    input_ids = np.append(input_ids_question, input_ids_context)

    num_seg_a = input_ids_question.shape[1]
    num_seg_b = input_ids_context.shape[0]

    # All three model inputs carry the same leading batch axis of size 1.
    input_idss = np.array([input_ids])
    attention_mask = np.array([[1] * len(input_ids)])
    token_type_ids = np.array([[0] * num_seg_a + [1] * num_seg_b])

    if verbose:
        print(input_ids.shape)
        print(input_ids)
        print(input_idss.shape)
        print("Attention mask shape ", attention_mask.shape)
        print(num_seg_a)
        print(num_seg_b)
        print("Segments id", token_type_ids.shape)

    input_keys = [f'input_ids{hash_tag}', f'attention_mask{hash_tag}', f'token_type_ids{hash_tag}']
    output_keys = [f'answer_start_scores{hash_tag}', f'answer_end_scores{hash_tag}']

    for key, tensor in zip(input_keys, (input_idss, attention_mask, token_type_ids)):
        client.tensorset(key, tensor)

    client.modelrun(f'bert-qa{hash_tag}', input_keys, output_keys)
    if verbose:
        print(f"Model run on {hash_tag}")

    answer_start_scores = client.tensorget(output_keys[0])
    answer_end_scores = client.tensorget(output_keys[1])

    # The end index is made exclusive so it can be used directly as a slice bound.
    answer_start = int(np.argmax(answer_start_scores))
    answer_end = int(np.argmax(answer_end_scores)) + 1

    answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end], skip_special_tokens=True)
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    if verbose:
        print(answer)
    return answer


In [52]:
question="What about frequencies of occurenence RNA?"
qa_redisai(question,"PMC222961.xml:{06S}:26",'{06S}')

(182,)
[  101  2054  2055 13139  1997  5258  8625  5897 12987  1029   102  1056
  1044  1041  1042  1054  1041  1053  1057  1041  1050  1039  1045  1041
  1055  1051  1042  1051  1039  1039  1057  1054  1054  1041  1050  1039
  1041  1042  1051  1054  1045  1050  1057  1039  1048  1041  1051  1056
  1045  1040  1041  1055  1059  1041  1054  1041  1039  1051  1049  1052
  1037  1054  1041  1040  1056  1051  1056  1044  1041  1054  1037  1050
  1040  1051  1049  1054  1050  1037  1039  1051  1057  1050  1056  1041
  1054  1052  1037  1054  1056  1055  1044  1037  1058  1045  1050  1043
  1056  1044  1041  1055  1037  1049  1041  1038  1037  1055  1041  1052
  1054  1051  1052  1051  1054  1056  1045  1051  1050  1045  1050  1051
  1054  1040  1041  1054  1056  1051  1039  1051  1049  1052  1057  1056
  1041  1056  1044  1041  1037  1058  1037  1048  1057  1041  1056  1044
  1037  1056  1054  1041  1042  1048  1041  1039  1056  1041  1040  1056
  1044  1041  1045  1054  1045  1050  1057  

''

In [None]:
question="Effectiveness of community contact reduction"

In [None]:
sentence_key="PMC261870.xml:{06S}:26"
token_key = f"tokenized:bert:qa:{sentence_key}"

In [None]:
redisai_cluster_client.connection_pool

In [None]:
%%time 
# Route the RG.TRIGGER call by hand: compute the cluster slot of the sentence
# key, look up the node returned for that slot, and borrow a connection to it.
slot = redisai_cluster_client.connection_pool.nodes.keyslot(sentence_key)
node = redisai_cluster_client.connection_pool.get_master_node_by_slot(slot)
connection = redisai_cluster_client.connection_pool.get_connection_by_node(node)
try:
    connection.send_command('RG.TRIGGER',"RunQABERT",sentence_key,question)
    print(connection.__dict__)
    print(redisai_cluster_client.parse_response(connection,"RG.TRIGGER"))
finally:
    # Return the borrowed connection so repeated runs do not drain the pool.
    redisai_cluster_client.connection_pool.release(connection)

In [None]:
%%time
# Route the RG.TRIGGER call by hand: compute the cluster slot of the sentence
# key, look up the node returned for that slot, and borrow a connection to it.
slot = redisai_cluster_client.connection_pool.nodes.keyslot(sentence_key)
node = redisai_cluster_client.connection_pool.get_master_node_by_slot(slot)
connection = redisai_cluster_client.connection_pool.get_connection_by_node(node)
try:
    connection.send_command('RG.TRIGGER',"RunQABERT",sentence_key,question)
    print(connection.__dict__)
    print(redisai_cluster_client.parse_response(connection,"RG.TRIGGER"))
finally:
    # Return the borrowed connection so repeated runs do not drain the pool.
    redisai_cluster_client.connection_pool.release(connection)

In [None]:
question

In [None]:
from rediscluster import RedisCluster

In [None]:
startup_nodes = [{"host": "127.0.0.1", "port": "30001"}, {"host": "127.0.0.1", "port":"30002"}, {"host":"127.0.0.1", "port":"30003"}]
rc = RedisCluster(startup_nodes=startup_nodes, decode_responses=True)

In [None]:
object_methods = [method_name for method_name in dir(rc)
                  if callable(getattr(rc, method_name))]

In [None]:
sentence_key="PMC261870.xml:{06S}:26"
question="Effectiveness of community contact reduction"

In [None]:
rc.execute_command('RG.TRIGGER',"RunQABERT",sentence_key,question)

In [None]:
command='RG.TRIGGER'

In [None]:
rc.determine_node('RG.TRIGGER',"RunQABERT",sentence_key,question)

In [None]:
print(rc.nodes_flags.get(command))

In [None]:
args=[1,2]

In [None]:
len(args)>=1

In [None]:
rc.execute_command('RG.TRIGGER',"RunQABERT",sentence_key,question)

In [None]:
from rediscluster import RedisCluster

In [None]:
import logging

from rediscluster import RedisCluster

logging.basicConfig()
logger = logging.getLogger('rediscluster')
logger.setLevel(logging.DEBUG)
logger.propagate = True

In [None]:
rc = RedisCluster(startup_nodes=startup_nodes, decode_responses=True)

In [None]:
rc.execute_command('RG.TRIGGER',"RunQABERT",sentence_key,question)

In [None]:
rc.connection_pool.nodes.random_node()

In [None]:
list(rc.connection_pool.nodes.all_masters())

In [None]:
rc.get(sentence_key)

In [None]:
print(rc.parse_response(connection,"RG.TRIGGER"))

In [None]:
result=rc.get("cache{06S}_PMC261870.xml:{06S}:26_Effectiveness of community contact reduction")

In [None]:
print(result)

In [20]:
# Module-level caches for the lazily loaded tokenizer and model. Re-running this
# cell clears them, so the next qa() call loads both again.
tokenizer = None 
model = None

import torch

def loadTokeniser():
    """Load the SQuAD-finetuned BERT tokenizer.

    The tokenizer is stored in the module-level ``tokenizer`` variable and
    also returned to the caller.
    """
    global tokenizer
    from transformers import AutoTokenizer
    # `torchscript` is a model-configuration flag and means nothing to a
    # tokenizer, so it is deliberately not passed here.
    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    return tokenizer

def loadModel():
    """Load the SQuAD-finetuned BERT question-answering model.

    The model is requested with ``torchscript=True``, stored in the
    module-level ``model`` variable and also returned to the caller.
    """
    global model
    from transformers import AutoModelForQuestionAnswering as _QAModel
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    model = _QAModel.from_pretrained(checkpoint, torchscript=True)
    return model

def qa(question, content_text):
    """Extract the answer to ``question`` from ``content_text`` with BERT.

    The tokenizer and model are loaded on first use and cached in the
    module-level ``tokenizer`` / ``model`` variables. The encoded token ids are
    printed before the model is run.

    Args:
        question: Question text.
        content_text: Passage expected to contain the answer.

    Returns:
        The text spanning the highest-scoring start and end positions
        (empty when the predicted end precedes the start).
    """
    global tokenizer, model

    if not tokenizer:
        tokenizer = loadTokeniser()

    if not model:
        model = loadModel()

    # Only the passage is truncated, so an over-long context cannot push the
    # pair past the model's maximum input length while the question stays intact.
    inputs = tokenizer.encode_plus(
        question,
        content_text,
        add_special_tokens=True,
        truncation="only_second",
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].tolist()[0]
    print(input_ids)

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs, return_dict=False)

    # Most likely start position, and most likely end position made exclusive
    # so both can be used directly as slice bounds.
    answer_start = int(torch.argmax(answer_start_scores))
    answer_end = int(torch.argmax(answer_end_scores)) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer



In [5]:
content_text="The frequencies of occurrence for i nucleotides were compared to the random RNA counterparts having the same base proportion in order to compute the a value that reflected their i nucleotide bias Table 2"

In [6]:
question="What about frequencies of occurenence RNA?"

In [25]:
# Module-level caches for the tokenizer and model. Re-running this cell clears
# them, so the `if not ...` checks further down load both again.
tokenizer = None 
model = None

import torch

def loadTokeniser():
    """Load the SQuAD-finetuned BERT tokenizer.

    The tokenizer is stored in the module-level ``tokenizer`` variable and
    also returned to the caller.
    """
    global tokenizer
    from transformers import AutoTokenizer as _Tokenizer
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    tokenizer = _Tokenizer.from_pretrained(checkpoint)
    return tokenizer

def loadModel():
    """Load the SQuAD-finetuned BERT question-answering model.

    The model is stored in the module-level ``model`` variable and also
    returned to the caller.
    """
    global model
    from transformers import AutoModelForQuestionAnswering as _QAModel
    checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
    model = _QAModel.from_pretrained(checkpoint)
    return model

# Load the tokenizer and model on first use.
if not tokenizer:
    tokenizer=loadTokeniser()

if not model:
    model=loadModel()

# Encode the question/passage pair as one batch of PyTorch tensors and print
# the flat list of token ids for inspection.
inputs = tokenizer.encode_plus(question, content_text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]
print(input_ids)

# Pure inference: run the forward pass without autograd bookkeeping.
with torch.no_grad():
    answer_start_scores, answer_end_scores = model(**inputs,return_dict=False)
answer_start = torch.argmax(
    answer_start_scores
)  # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

# Turn the selected id span back into text.
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))


[101, 2054, 2055, 13139, 1997, 5258, 8625, 5897, 12987, 1029, 102, 1996, 13139, 1997, 14404, 2005, 1045, 16371, 14321, 26601, 2015, 2020, 4102, 2000, 1996, 6721, 12987, 14562, 2383, 1996, 2168, 2918, 10817, 1999, 2344, 2000, 24134, 1996, 1037, 3643, 2008, 7686, 2037, 1045, 16371, 14321, 26601, 13827, 2795, 1016, 102]


In [38]:
print(answer)

compared to the random rna counterparts having the same base proportion


In [7]:
%%time
print(qa(question,content_text))

[101, 2054, 2055, 13139, 1997, 5258, 8625, 5897, 12987, 1029, 102, 1996, 13139, 1997, 14404, 2005, 1045, 16371, 14321, 26601, 2015, 2020, 4102, 2000, 1996, 6721, 12987, 14562, 2383, 1996, 2168, 2918, 10817, 1999, 2344, 2000, 24134, 1996, 1037, 3643, 2008, 7686, 2037, 1045, 16371, 14321, 26601, 13827, 2795, 1016, 102]
compared to the random rna counterparts having the same base proportion
CPU times: user 12.4 s, sys: 1.4 s, total: 13.8 s
Wall time: 12.5 s


In [47]:
"cache{06S}_PMC222961.xml:{06S}:26_%s" % question

'cache{06S}_PMC222961.xml:{06S}:26_What about frequencies of occurenence RNA?'

In [53]:
question="When air samples collected?"

In [55]:
"cache{5M5}_PMC140314.xml:{5M5}:44_%s" % question 

'cache{5M5}_PMC140314.xml:{5M5}:44_When air samples collected?'

In [46]:
rc.get("cache{5M5}_PMC261870.xml:{5M5}:26_%s" % question)

NameError: name 'rc' is not defined

In [41]:
def print_tokens(input_ids):
    """Print each token string next to its id, one pair per line.

    The ids are mapped back to token strings with the module-level
    ``tokenizer``. Rows whose id equals the tokenizer's separator id are
    surrounded by blank lines so they stand out.
    """
    token_strings = tokenizer.convert_ids_to_tokens(input_ids)

    for text, token_id in zip(token_strings, input_ids):
        is_separator = token_id == tokenizer.sep_token_id

        # Blank line before a separator row.
        if is_separator:
            print('')

        # Two columns: left-aligned token, right-aligned id with thousands separator.
        print('{:<12} {:>6,}'.format(text, token_id))

        # Blank line after a separator row.
        if is_separator:
            print('')

In [48]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.

    Relies on the module-level `tokenizer` and `model`. Nothing is returned;
    the token count and the answer are written to standard output.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    input_ids = tokenizer.encode(question, answer_text)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Set Segment IDs ========
    # Segment A runs up to and including the first [SEP]; the rest is segment B.
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Pure inference: run the forward pass without autograd bookkeeping.
    with torch.no_grad():
        start_scores, end_scores = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                                         token_type_ids=torch.tensor([segment_ids]),  # Question vs. answer_text segments.
                                         return_dict=False)

    # ======== Reconstruct Answer ========
    # Positions with the highest `start` and `end` scores, as plain ints so
    # they can be used for list indexing and slicing.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token, then append the rest of the span (inclusive end).
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token[0:2] == '##':
            # Subword piece: glue it onto the previous token.
            answer += token[2:]
        else:
            # New word: separate it with a space.
            answer += ' ' + token

    print('Answer: "' + answer + '"')

In [49]:
answer_question(question, content_text)

Query has 51 tokens.

Answer: "compared to the random rna counterparts having the same base proportion"


In [None]:
"""
hget sentence:PMC222961.xml:{06S} 26
"The frequencies of occurrence for i nucleotides were compared to the random RNA counterparts having the same base proportion in order to compute the a value that reflected their i nucleotide bias Table 2"
"""