In [1]:
from sentence_transformers import SentenceTransformer, util

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [2]:
import datasets

dataset = datasets.load_dataset('ms_marco', 'v2.1', split='train[:5000]')

Found cached dataset ms_marco (/home/ubuntu/.cache/huggingface/datasets/ms_marco/v2.1/2.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84)


In [3]:
corpus = []
for i in range(len(dataset)):
    corpus.extend(dataset[i]['passages']['passage_text'])
corpus_embeddings = bi_encoder.encode(corpus, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/1560 [00:00<?, ?it/s]

In [4]:
import torch
from transformers import T5TokenizerFast, T5ForConditionalGeneration

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
from transformers.models.bart.modeling_bart import shift_tokens_right
from sentence_transformers import util
import random
import torch
random.seed(42)

def random_mask(query):
    words = query.split()
    if len(words) < 3:
        return query
    mask_index = random.randint(1, len(words) - 1)
    return ' '.join(words[:mask_index])

def convert_to_features(batch):
    random.shuffle(batch['passages'])
    masked_queries = list(map(random_mask, batch['query']))

    query_embeddings = bi_encoder.encode(masked_queries, convert_to_tensor=True)
    masked_queries = [query + ' <extra_id_0>' for query in masked_queries]
    knn = util.semantic_search(query_embeddings, corpus_embeddings, top_k=10)
    contexts = ['; '.join([corpus[e['corpus_id']] for e in embeddings]) for embeddings in knn]
    inputs = [query + '# ' + context for context, query in zip(contexts, masked_queries)]

    input_encodings = tokenizer.batch_encode_plus(inputs, pad_to_max_length=True, max_length=1024, truncation=True, return_tensors='pt')
    label_encodings = tokenizer.batch_encode_plus(batch['query'], pad_to_max_length=True, max_length=1024, truncation=True, return_tensors='pt')
    labels = label_encodings['input_ids']
    # decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id)
    labels[labels[:,:] == model.config.pad_token_id] = -100
    
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        # 'decoder_input_ids': decoder_input_ids,
        'labels': labels,
        'masked_queries': masked_queries,
    }

    return encodings

In [6]:
dataset = dataset.map(convert_to_features, batched=True, batch_size=8, keep_in_memory=True)



  0%|          | 0/625 [00:00<?, ?ba/s]



In [7]:
dataset = dataset.train_test_split(test_size=0.1)

In [10]:
from transformers.trainer import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./models/t5-autocomplete',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    do_train=True,
    do_eval=True,
    warmup_steps=500,   
    weight_decay=0.01,
    logging_dir='./logs',
    learning_rate=1e-5,
    logging_steps=25,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [11]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: query, wellFormedAnswers, passages, masked_queries, query_id, query_type, answers. If query, wellFormedAnswers, passages, masked_queries, query_id, query_type, answers are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4500
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 2250
  Number of trainable parameters = 222903552


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 22.04 GiB total capacity; 20.76 GiB already allocated; 19.12 MiB free; 20.91 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: query, wellFormedAnswers, passages, masked_queries, query_id, query_type, answers. If query, wellFormedAnswers, passages, masked_queries, query_id, query_type, answers are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 2


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 22.04 GiB total capacity; 20.79 GiB already allocated; 19.12 MiB free; 20.91 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [13]:
tokenizer.save_pretrained('~/models/t5-autocomplete')
model.save_pretrained('~/models/t5-autocomplete')

tokenizer config file saved in ~/models/t5-autocomplete/tokenizer_config.json
Special tokens file saved in ~/models/t5-autocomplete/special_tokens_map.json
Copy vocab file to ~/models/t5-autocomplete/spiece.model
Configuration saved in ~/models/t5-autocomplete/config.json
Configuration saved in ~/models/t5-autocomplete/generation_config.json
Model weights saved in ~/models/t5-autocomplete/pytorch_model.bin


In [None]:
for i in range(0, 10):
    print('Actual:    ', dataset['test'][i]['query'])
    print('Query:     ', dataset['test'][i]['masked_queries'])
    to_encode = dataset['test'][i]['masked_queries'] + '; ' + '# '.join(dataset['test'][i]['passages']['passage_text'])
    encoded = tokenizer(
        to_encode,
        pad_to_max_length=True,
        max_length=1024,
        truncation=True,
        return_tensors='pt')
    output = model.generate(input_ids=encoded['input_ids'].to('cuda'), max_length=1024, num_beams=4, early_stopping=True)
    print('Predicted: ', tokenizer.decode(output[0], skip_special_tokens=True))
    print('---------------------')

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Actual:     what does nuclear engineer do
Query:      what does nuclear <extra_id_0>
Predicted:  what does nuclear power mean
---------------------
Actual:     how long until you can drink out of a straw after getting teeth pulled
Query:      how long until you can drink out of a <extra_id_0>
Predicted:  how long until you can drink out of a car
---------------------
Actual:     what is diamond mining
Query:      what is <extra_id_0>


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predicted:  what is lion meat
---------------------
Actual:     lung volume measurements are based on what
Query:      lung volume measurements are <extra_id_0>
Predicted:  lung volume measurements are required
---------------------
Actual:     problem of other minds definition in philosophy
Query:      problem of other minds definition in <extra_id_0>
Predicted:  problem of other minds definition in dementia
---------------------
Actual:     causes of ongoing headache
Query:      causes of <extra_id_0>


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predicted:  causes of cancer
---------------------
Actual:     what county does charlotte in
Query:      what <extra_id_0>
Predicted:  what is an oligarchy
---------------------
Actual:     were the grateful dead at altamont
Query:      were the grateful dead at <extra_id_0>
Predicted:  were the grateful dead at the time
---------------------
Actual:     acinus define
Query:      acinus define <extra_id_0>


Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.1"
}



Predicted:  acinus define
---------------------
Actual:     is it advantageous to file married filing separately
Query:      is it advantageous <extra_id_0>
Predicted:  is it advantageous to live on reservations and trust lands?
---------------------
