In [2]:
import sys
import datasets
from transformers import AutoTokenizer
sys.path.append("..")
from babilong_utils import TaskDataset, SentenceSampler, NoiseInjectionDataset
%load_ext autoreload
%autoreload 2

In [2]:
# ### extract dataset archive
# !unzip ../data/tasks_1-20_v1-2.zip -d ../data/

In [3]:
!ls ../data/tasks_1-20_v1-2/en-10k/

qa10_indefinite-knowledge_test.txt   qa1_single-supporting-fact_test.txt
qa10_indefinite-knowledge_train.txt  qa1_single-supporting-fact_train.txt
qa11_basic-coreference_test.txt      qa20_agents-motivations_test.txt
qa11_basic-coreference_train.txt     qa20_agents-motivations_train.txt
qa12_conjunction_test.txt	     qa2_two-supporting-facts_test.txt
qa12_conjunction_train.txt	     qa2_two-supporting-facts_train.txt
qa13_compound-coreference_test.txt   qa3_three-supporting-facts_test.txt
qa13_compound-coreference_train.txt  qa3_three-supporting-facts_train.txt
qa14_time-reasoning_test.txt	     qa4_two-arg-relations_test.txt
qa14_time-reasoning_train.txt	     qa4_two-arg-relations_train.txt
qa15_basic-deduction_test.txt	     qa5_three-arg-relations_test.txt
qa15_basic-deduction_train.txt	     qa5_three-arg-relations_train.txt
qa16_basic-induction_test.txt	     qa6_yes-no-questions_test.txt
qa16_basic-induction_train.txt	     qa6_yes-no-questions_train.txt
qa17_positional-reasoning_test.

In [4]:
task = "qa2_two-supporting-facts"

In [5]:
train_path =f"../data/tasks_1-20_v1-2/en-10k/{task}_train.txt"
test_path = f"../data/tasks_1-20_v1-2/en-10k/{task}_test.txt"
noise_dataset_name = "pg19"
noise_dataset = datasets.load_dataset(noise_dataset_name)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


### Load task datasets

In [8]:
# task 
task_dataset_train = TaskDataset(train_path)
task_dataset_test = TaskDataset(test_path)

In [9]:
# background text
tokenizer = AutoTokenizer.from_pretrained('gpt2')

noise_sampler_train = SentenceSampler(noise_dataset['train'], tokenizer=tokenizer)
noise_sampler_test = SentenceSampler(noise_dataset['test'], tokenizer=tokenizer)

In [10]:
# Maximum number of tokens in one generated sample.
sample_size = 500

# Training split: task samples combined with the training noise sampler.
dataset_train = NoiseInjectionDataset(
    task_dataset=task_dataset_train,
    noise_sampler=noise_sampler_train,
    tokenizer=tokenizer,
    sample_size=sample_size,
)

# Test split: same construction, using the test task data and test noise sampler.
dataset_test = NoiseInjectionDataset(
    task_dataset=task_dataset_test,
    noise_sampler=noise_sampler_test,
    tokenizer=tokenizer,
    sample_size=sample_size,
)

In [11]:
sample = dataset_train[0]
sample.keys()

dict_keys(['facts', 'question', 'answer', 'references', 'background_text', 'fact_positions', 'input_tokens', 'question_tokens', 'target_tokens'])

In [12]:
for f in sample['facts']:
    print(f)
print("fact position:", sample['fact_positions'])
print("question:", sample['question'])
print("\nBACKGROUND:")

background_text = tokenizer.batch_decode(sample['background_text'])
for s in background_text[:20]:
    print(f'\'{s}\',')

Mary moved to the bathroom.
Sandra journeyed to the bedroom.
Mary got the football there.
John went to the kitchen.
Mary went back to the kitchen.
Mary went back to the garden.
fact position: [ 1  7  7 11 13 15]
question: Where is the football? 

BACKGROUND:
'













The Old Testament of the King James Version of the Bible




The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.',
'1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.',
'And the Spirit of God moved upon the face of the
waters.',
'1:3 And God said, Let there be light: and there was light.',
'1:4 And God saw the light, that it was good: and God divided the light
from the darkness.',
'1:5 And God called the light Day, and the darkness he called Night.',
'And the evening and the morning were the first day.',
'1:6 And God said, Let there be a firmament in the midst of the waters,
and let it divide the waters from the waters.',
'1:7 An

### Visualize one sample

In [13]:

facts = sample['facts']
question = sample['question']
answer = tokenizer.decode(sample['target_tokens'])

#background_text = sample['background_text']

input_tokens = tokenizer.decode(sample['input_tokens'])

print(f"Facts: {' '.join(facts)}")
print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"References: {' '.join(sample['references'])}")
print()
print('Background text: ', ' '.join(background_text))
print('Fact positions: ', sample['fact_positions'])
print('Combined input: ', input_tokens)

print(f"Target: {answer}")


Facts: Mary moved to the bathroom. Sandra journeyed to the bedroom. Mary got the football there. John went to the kitchen. Mary went back to the kitchen. Mary went back to the garden.
Question: Where is the football? 
Answer: garden
References: Mary got the football there. Mary went back to the garden.

Background text:  













The Old Testament of the King James Version of the Bible




The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth. 1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters. 1:3 And God said, Let there be light: and there was light. 1:4 And God saw the light, that it was good: and God divided the light
from the darkness. 1:5 And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day. 1:6 And God said, Let there be a firmament in the midst of the waters,
and let it di

### Collate function

In [14]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Padding id: use the tokenizer's pad token when it defines one, otherwise
# fall back to the end-of-sequence id.
if tokenizer.pad_token_id is not None:
    id_pad_value = tokenizer.pad_token_id
else:
    id_pad_value = tokenizer.eos_token_id

# Id appended after every target sequence.
eos_token = tokenizer.eos_token_id

# Id placed between the input and the target; it is the first token id of the
# encoded string 'GEN'.
# NOTE(review): if 'GEN' encodes to several ids, only the first is kept — confirm.
gen_token = tokenizer.encode('GEN')[0]

def collate_fn(batch):
    """Collate dataset samples into right-padded tensors for training and generation.

    Args:
        batch: list of samples. Each sample provides 'input_tokens' and
            'target_tokens' (lists of token ids) as well as 'answer',
            'background_text', 'facts' and 'question'.

    Returns:
        dict with:
            input_ids / labels: input + GEN + target + EOS, padded with
                `id_pad_value` (both keys refer to the same tensor object).
            input_ids_generate: input + GEN, padded with `id_pad_value`.
            labels_mask: bool, True on the trailing GEN + target + EOS positions.
            attention_mask: bool, True on the real tokens of `input_ids`.
            attention_mask_generate: bool, True on the real tokens of
                `input_ids_generate`.
            target_text, background_text, facts, question: per-sample lists
                taken from the samples unchanged.
    """
    targets = [torch.tensor(b['target_tokens']) for b in batch]
    input_ids = [torch.tensor(b['input_tokens'] + [gen_token] + b['target_tokens'] + [eos_token]) for b in batch]
    gen_inputs = [torch.tensor(b['input_tokens'] + [gen_token]) for b in batch]

    # Masks are built from the true sequence lengths before padding. Comparing
    # padded ids against `id_pad_value` would also mask genuine tokens that
    # share that id (the pad value may be the EOS id).
    attention_mask = [torch.ones_like(seq, dtype=torch.long) for seq in input_ids]
    gen_attention_mask = [torch.ones_like(seq, dtype=torch.long) for seq in gen_inputs]
    labels_mask = [torch.zeros_like(seq, dtype=torch.bool) for seq in input_ids]
    for m, t in zip(labels_mask, targets):
        # +2 covers the GEN token before the target and the EOS token after it.
        m[-len(t) - 2:] = True

    input_ids = pad_sequence(input_ids, padding_value=id_pad_value, batch_first=True)
    gen_inputs = pad_sequence(gen_inputs, padding_value=id_pad_value, batch_first=True)
    attention_mask = pad_sequence(attention_mask, padding_value=0, batch_first=True)
    gen_attention_mask = pad_sequence(gen_attention_mask, padding_value=0, batch_first=True)
    labels_mask = pad_sequence(labels_mask, padding_value=0, batch_first=True)

    collated = {}
    # 'labels' intentionally aliases 'input_ids'; 'labels_mask' marks the
    # positions that belong to the target.
    collated['input_ids'] = collated['labels'] = input_ids
    collated['input_ids_generate'] = gen_inputs
    collated['labels_mask'] = labels_mask
    collated['attention_mask'] = attention_mask.bool()
    collated['attention_mask_generate'] = gen_attention_mask.bool()

    collated['target_text'] = [b['answer'] for b in batch]

    collated['background_text'] = [b['background_text'] for b in batch]
    collated['facts'] = [b['facts'] for b in batch]
    collated['question'] = [b['question'] for b in batch]

    return collated

In [15]:
batch = [dataset_train[i] for i in range(10)]
collated = collate_fn(batch)
collated.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text', 'background_text', 'facts', 'question'])

In [16]:
# labels are marked with labels_mask
#tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['labels_mask'])])

In [17]:
# different input_ids for .forward() and .generate()
#tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids'], collated['attention_mask'])])

In [18]:
#tokenizer.batch_decode([c[m] for c, m in zip(collated['input_ids_generate'], collated['attention_mask_generate'])])

### Checking if Contriever can find relevant facts from the sample

In [19]:
contriever_path = "../../contriever/" 
if contriever_path not in sys.path:
    sys.path.append(contriever_path)

#for p in sys.path:    
#    print(p)

from src.contriever import Contriever
from transformers import AutoTokenizer

In [20]:
contriever = Contriever.from_pretrained("facebook/contriever") 
c_tokenizer = AutoTokenizer.from_pretrained("facebook/contriever") 

Some weights of the model checkpoint at facebook/contriever were not used when initializing Contriever: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing Contriever from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Contriever from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import numpy as np

@torch.no_grad()
def compite_statistics(res, num_retr=5, verbose=False):
    """Compute retrieval precision and recall of facts over a batch of samples.

    For every sample the sentences are ranked by inner product with the query
    embedding and the best `num_retr` of them count as retrieved. A retrieved
    sentence is a hit when its index is listed in that sample's fact ids.

    Args:
        res: dict with per-sample lists 'query' (query embedding),
            'sentences' (one embedding row per sentence) and 'facts_ids'
            (indices of the fact rows inside 'sentences').
        num_retr: number of sentences retrieved per sample; a value <= 0
            retrieves as many sentences as the sample has facts.
        verbose: when True, print the retrieved indices and the fact ids of
            every sample.

    Returns:
        dict with plain-float 'precision' (hits / retrieved sentences) and
        'recall' (hits / facts). A ratio whose denominator is zero is
        reported as 0.0. The summary line is also printed.
    """
    N = len(res['query'])
    num_retrieved_facts = 0
    num_facts = 0
    num_retrieves = 0
    for i in range(N):
        scores = torch.inner(res['query'][i], res['sentences'][i])
        sorted_scores = torch.argsort(scores)
        num_sentences = sorted_scores.numel()

        fact_ids = res['facts_ids'][i]
        k = num_retr if num_retr > 0 else len(fact_ids)
        # Cannot retrieve more sentences than exist; counting the requested k
        # instead would deflate precision for short samples.
        k = min(k, num_sentences)
        # Explicit start index: `sorted_scores[-k:]` would return every
        # sentence when k == 0.
        top_k = sorted_scores[num_sentences - k:]

        fact_set = {int(f) for f in fact_ids}
        num_retrieved_facts += sum(int(idx) in fact_set for idx in top_k.tolist())
        num_facts += len(fact_ids)
        num_retrieves += k

        if verbose:
            print(f"retrieved sentences: {top_k}, all_facts: {fact_ids}")

    stats = dict()
    stats['precision'] = num_retrieved_facts / num_retrieves if num_retrieves else 0.0
    stats['recall'] = num_retrieved_facts / num_facts if num_facts else 0.0

    print(f"precision: {stats['precision']:.2f}, recall: {stats['recall']:.2f}")
    return stats

@torch.no_grad()
def get_contriever_embeds(collated):
    """Embed every sample's facts, background sentences and question.

    For each sample the sentence list is assembled as: facts first, then the
    decoded background sentences, then the question as the final element.
    The list is tokenized with `c_tokenizer` and passed through `contriever`.

    Args:
        collated: dict with per-sample lists under 'facts', 'background_text'
            (token sequences decoded with `tokenizer.batch_decode`) and
            'question' (a single string per sample).

    Returns:
        dict of three per-sample lists:
            'query': embedding of the question (last row of the output),
            'sentences': embeddings of all other sentences (facts, then background),
            'facts_ids': `np.arange(number_of_facts)`, i.e. the positions of
                the facts inside 'sentences'.
    """
    all_queries, all_sentences, all_fact_ids = [], [], []

    for sample_idx in range(len(collated['facts'])):
        # Facts go first so their indices are simply 0..n_facts-1.
        texts = list(collated['facts'][sample_idx])
        all_fact_ids.append(np.arange(len(texts)))

        texts += tokenizer.batch_decode(collated['background_text'][sample_idx])
        # The question is one string, so it is appended rather than extended.
        texts.append(collated['question'][sample_idx])

        encoded = c_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        vectors = contriever(**encoded)

        # Last row is the question; everything before it is context.
        all_sentences.append(vectors[:-1])
        all_queries.append(vectors[-1])

    return dict(query=all_queries, sentences=all_sentences, facts_ids=all_fact_ids)

#type(collated["question"][0])
res = get_contriever_embeds(collated)

In [22]:
#sum([s.shape[0] for s in res['sentences']])
# Report precision/recall at several retrieval cut-offs; `stats` keeps the
# result of the last (largest) cut-off.
for num_retr in (1, 5, 10):
    stats = compite_statistics(res, num_retr=num_retr)

precision: 1.00, recall: 0.07
precision: 0.84, recall: 0.28
precision: 0.78, recall: 0.53


### Visualizing Contriever Similarity Scores

In [23]:
def monocolor(v, text):
    """Return `text` on a background colour chosen by the scalar `v`.

    The red channel grows with `v` and the blue channel shrinks with it,
    while green is fixed at 128. With v = 0 the colour is (127, 128, 255) and
    with v = 1 it is (255, 128, 127). Values outside that range are not
    clamped.
    """
    red = 127 + int(v * 128)
    green = 128
    blue = 127 + int(128 * (1 - v))
    return colored_background(red, green, blue, text)
    
def colored_background(r, g, b, text):
    """Wrap `text` in ANSI escape sequences that set an RGB background.

    The returned string starts with the `48;2;r;g;b` background sequence and
    ends with the reset sequence, so following output is not coloured.
    """
    set_background = '\033[48;2;{};{};{}m'.format(r, g, b)
    reset = '\033[0m'
    return '{}{}{}'.format(set_background, text, reset)

text = "What a nice red background!"
colored_text = colored_background(255, 0, 0, text)
colored_text = monocolor(0.5, text)
print(colored_text)

[48;2;191;128;191mWhat a nice red background![0m


In [24]:
@torch.no_grad()
def print_sorted_sentences_scores(collated, idx, top_n=25, max_len=120):
    """Print the sentences of one sample ranked by similarity to its question.

    The sample's facts, decoded background sentences and question are embedded
    with `contriever`; every sentence is scored by inner product with the
    question embedding. The best `top_n` sentences are printed from highest to
    lowest score, tagged "[F]" for facts and "[N]" for background sentences,
    with a background colour derived from the min-max normalised score.

    Args:
        collated: dict with per-sample lists under 'facts', 'background_text'
            and 'question'.
        idx: index of the sample inside `collated`.
        top_n: maximum number of sentences to print (default 25).
        max_len: printed sentences longer than this many characters are cut
            and suffixed with "..." (default 120).

    Returns:
        None; the report is written to stdout.
    """
    sentences = []
    sentences.extend(collated['facts'][idx])
    # Facts are placed first, so their indices are 0..n_facts-1.
    facts_ids = np.arange(len(sentences))
    background_text = tokenizer.batch_decode(collated['background_text'][idx])
    sentences.extend(background_text)
    sentences.append(collated['question'][idx]) # append as this is a single str

    inputs = c_tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    embeds = contriever(**inputs)
    sent_embeds = embeds[:-1]
    query_embeds = embeds[-1]

    scores = torch.inner(query_embeds, sent_embeds)
    print("Contriever-based similarity score between question and Babilong context")

    # Min-max normalise for colouring. When there is nothing to rank or all
    # scores are equal the range is zero; use zeros instead of dividing by it.
    if scores.numel() > 0 and scores.max() > scores.min():
        normalized_scores = (scores - scores.min()) / (scores.max() - scores.min())
    else:
        normalized_scores = torch.zeros_like(scores)

    n_shown = max(0, min(len(sentences) - 1, top_n))
    print("---------------------------------")
    print("QUESTION:", sentences[-1])
    print("---------------------------------")
    print(f"TOP {n_shown} sentences sorted from highest to lowest similarity score:")

    ranking = torch.argsort(normalized_scores).tolist()
    # Explicit start index so that n_shown == 0 prints nothing.
    for i in reversed(ranking[len(ranking) - n_shown:]):
        norm_score = normalized_scores[i]
        texttype = colored_background(150, 255, 150, "[F]") if i in facts_ids else colored_background(200, 200, 200, "[N]")
        sent = f"#{i}. " + sentences[i].replace("\n", " ")
        if len(sent) > max_len:
            sent = sent[:max_len] + "..."
        # The tag already exceeds 8 characters because of its escape
        # sequences, so the `>8s` width adds no visible padding.
        print(f"{scores[i]:.3f}, {texttype:>8s}: {monocolor(1-norm_score, sent)}")


print_sorted_sentences_scores(collated, 1)

Contriever-based similarity score between question and Babilong context
---------------------------------
QUESTION: Where is the football? 
---------------------------------
TOP 24 sentences sorted from highest to lowest similarity score:
0.961, [48;2;150;255;150m[F][0m: [48;2;127;128;255m#2. Mary got the football there.[0m
0.682, [48;2;150;255;150m[F][0m: [48;2;190;128;191m#10. Mary dropped the football.[0m
0.586, [48;2;200;200;200m[N][0m: [48;2;211;128;170m#23. 2:9 And out of the ground made the LORD[0m
0.578, [48;2;150;255;150m[F][0m: [48;2;213;128;168m#11. John got the milk there.[0m
0.572, [48;2;150;255;150m[F][0m: [48;2;214;128;167m#4. Mary went back to the kitchen.[0m
0.562, [48;2;150;255;150m[F][0m: [48;2;217;128;164m#6. Sandra went back to the office.[0m
0.561, [48;2;150;255;150m[F][0m: [48;2;217;128;164m#0. Mary moved to the bathroom.[0m
0.555, [48;2;150;255;150m[F][0m: [48;2;218;128;163m#5. Mary went back to the garden.[0m
0.552, [48;2;200;20

## Create environment for multi-step retrieval from a history

In [25]:
from retrieval_env import QARetrievalEnv, RNDStrategy

In [26]:
env = QARetrievalEnv(collated, contriever, c_tokenizer, tokenizer, 0)

In [39]:
s = env.reset()
print(s.keys())
print(s['acts_embed'].shape[0])

dict_keys(['acts_embed', 'acts_text', 'acts_mask', 'state_embed'])
18


In [46]:
print("=== step #0 ===")
for i, sent in enumerate(s['acts_text'][0:5]):
    print(i, sent)
print("\naction mask:", s['acts_mask'])
acts = s['acts_mask'].nonzero()[0]
env.step(acts[:3])

=== step #0 ===
0 Mary moved to the bathroom.
1 Sandra journeyed to the bedroom.
2 Mary got the football there.
3 John went to the kitchen.
4 Mary went back to the kitchen.

action mask: [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]


array([0, 1, 2])

### Create a dataloader

In [14]:
from torch.utils.data import DataLoader

dl = DataLoader(batch_size=2, dataset=dataset_train, collate_fn=collate_fn)
gen = iter(dl)
batch = next(gen)
batch.keys()

dict_keys(['input_ids', 'labels', 'input_ids_generate', 'labels_mask', 'attention_mask', 'attention_mask_generate', 'target_text'])

In [16]:
from contriever import Contriever
from transformers import AutoTokenizer

ModuleNotFoundError: No module named 'contriever'