In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')


In [2]:
import datasets

dataset = datasets.load_dataset('ms_marco', 'v2.1', split='train[:5000]')

Found cached dataset ms_marco (/home/ubuntu/.cache/huggingface/datasets/ms_marco/v2.1/2.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84)


In [3]:
from transformers.models.bart.modeling_bart import shift_tokens_right
import random
import torch
random.seed(42)

def convert_to_features(batch):
    inputs = [passages['passage_text'][0] for passages in batch['passages']]

    input_encodings = tokenizer.batch_encode_plus(inputs, pad_to_max_length=True, max_length=1024, truncation=True, return_tensors='pt')
    label_encodings = tokenizer.batch_encode_plus(batch['query'], pad_to_max_length=True, max_length=1024, truncation=True, return_tensors='pt')
    labels = label_encodings['input_ids']
    # decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id)
    labels[labels[:,:] == model.config.pad_token_id] = -100
    
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        # 'decoder_input_ids': decoder_input_ids,
        'labels': labels,
    }

    return encodings

In [4]:
dataset = dataset.map(convert_to_features, batched=True, batch_size=8)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/ms_marco/v2.1/2.1.0/b6a62715fa5219aea5275dd3556601004cd63945cb63e36e022f77bb3cbbca84/cache-32a4beb0f14189b7.arrow


In [5]:
dataset = dataset.train_test_split(test_size=0.1)

In [10]:
from transformers.trainer import TrainingArguments, Trainer
model = model.cuda()
training_args = TrainingArguments(
    output_dir='./models/bart-summarizer',
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    do_train=True,
    do_eval=True,
    warmup_steps=500,   
    weight_decay=0.01,
    logging_dir='./logs',
    learning_rate=1e-05,
    logging_steps=30,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [11]:
trainer.train()

Step,Training Loss
30,3.2065
60,3.2081
90,2.9877
120,2.8888
150,2.6147
180,2.5494
210,2.1928
240,1.9662
270,1.8525
300,1.8431


In [12]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: wellFormedAnswers, passages, query_type, query, answers, query_id. If wellFormedAnswers, passages, query_type, query, answers, query_id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 2


{'eval_loss': 2.878620147705078}

In [23]:
tokenizer.save_pretrained('~/models/bart-fine-tuned-query-from-doc')
model.save_pretrained('~/models/bart-fine-tuned-query-from-doc')

tokenizer config file saved in ~/models/bart-fine-tuned-query-from-doc/tokenizer_config.json
Special tokens file saved in ~/models/bart-fine-tuned-query-from-doc/special_tokens_map.json
Configuration saved in ~/models/bart-fine-tuned-query-from-doc/config.json
Configuration saved in ~/models/bart-fine-tuned-query-from-doc/generation_config.json
Model weights saved in ~/models/bart-fine-tuned-query-from-doc/pytorch_model.bin


In [21]:
model = BartForConditionalGeneration.from_pretrained('./models/bart-summarizer/checkpoint-1000')

loading configuration file ./models/bart-summarizer/checkpoint-1000/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label

In [22]:
model = model.cpu()
for i in range(0, 10):
    print('Actual:    ', dataset['test'][i]['query'])
    to_encode = dataset['test'][i]['passages']['passage_text'][0]
    encoded = tokenizer(
        to_encode,
        pad_to_max_length=True,
        max_length=1024,
        truncation=True,
        return_tensors='pt')
    output = model.generate(input_ids=encoded['input_ids'].to('cpu'), max_length=1024, num_beams=4, early_stopping=True)
    print('Predicted: ', tokenizer.decode(output[0], skip_special_tokens=True))
    print('---------------------')

Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Actual:     how much does an ultrasound tech make


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  average salary of Ultrasound Technologists
---------------------
Actual:     how do i get an ori number


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what is a gcic service agreement
---------------------
Actual:     what is a IFI contractor


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what is an independent contractor definition
---------------------
Actual:     what instrument is used in holography? laser spectacles telescope microscope


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what is a microscope
---------------------
Actual:     what cause pain on the left side of the head


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what causes sharp head pain
---------------------
Actual:     what are frame structures


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what is a Universal Fabrication
---------------------
Actual:     types of muscles in the human body


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what muscles are found in the body
---------------------
Actual:     what is an activity coordinator


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  activities coordinator degree programs
---------------------
Actual:     is driving an unregistered vehicle a moving violation


Generate config GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}



Predicted:  what is a driving license
---------------------
Actual:     what altitude does a pilot need oxygen
Predicted:  what does oxygen do
---------------------


In [None]:
import torch
for item in dataset['test']:
    text = item['text']
    labels = item['labels_text']
    encoded = tokenizer(text, return_tensors='pt').to('cuda')
    output = model.generate(**encoded, max_length=512, num_beams=4, early_stopping=True)
    predicted = tokenizer.decode(output[0].to('cpu'), skip_special_tokens=True)
    print('Text     : ', text)
    print('Predicted: ', predicted)
    print('Expected : ', labels)
    print('--' * 20)

KeyError: 'text'

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'labels_text'],
        num_rows: 90
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask', 'labels_text'],
        num_rows: 10
    })
})