In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 28.6 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 65.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 39.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
# Load the dummy dataset from a local CSV file.
# It has 4 columns - [id, title, context, triplets] - which are the names the
# tokenization step later removes after turning them into model features.
from datasets import load_dataset
dataset = load_dataset("csv", data_files="train.csv")



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Load the pretrained REBEL tokenizer and seq2seq model from the same checkpoint
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Single place to name the checkpoint so tokenizer and model always match
REBEL_CHECKPOINT = "Babelscape/rebel-large"

tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)


Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Tokenize the input text (context) and build the training labels (triplets)
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Turn a batch of raw texts into tokenized features plus loss labels.

    Args:
        batch: Mapping with a "context" entry (source texts) and a
            "triplets" entry (target texts).
        tokenizer: Callable tokenizer; it is called once for the sources and
            once for the targets, and must expose ``pad_token_id``.
        max_source_length: Length the source sequences are padded/truncated to.
        max_target_length: Length the target sequences are padded/truncated to.

    Returns:
        A new dict containing every field the tokenizer returned for the
        sources, plus "labels": the target token ids in which each padding
        id has been replaced by -100.
    """
    source_texts = batch["context"]
    target_texts = batch["triplets"]

    # Both sides use the same padding/truncation policy; only the length differs.
    shared_options = {"padding": "max_length", "truncation": True}
    encoded_sources = tokenizer(
        source_texts, max_length=max_source_length, **shared_options
    )
    encoded_targets = tokenizer(
        target_texts, max_length=max_target_length, **shared_options
    )

    features = dict(encoded_sources.items())

    # Padding positions are set to -100 so that they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    labels = []
    for target_ids in encoded_targets["input_ids"]:
        labels.append([-100 if tok == pad_id else tok for tok in target_ids])
    features["labels"] = labels

    return features

In [None]:
# Sequence-length budgets: encoder side (context) and decoder side (triplets)
encoder_max_length = 256
decoder_max_length = 64

# Tokenize every split batch-wise. The extra arguments of the preprocessing
# function are supplied through fn_kwargs, and the raw columns are dropped so
# that only the tokenizer outputs and the labels remain.
train_data = dataset.map(
    batch_tokenize_preprocess,
    batched=True,
    remove_columns=['id', 'title', 'context', 'triplets'],
    fn_kwargs={
        "tokenizer": tokenizer,
        "max_source_length": encoder_max_length,
        "max_target_length": decoder_max_length,
    },
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Inspect the tokenized dataset: its splits, feature names and row counts
train_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7
    })
})

In [None]:
# Finetuning REBEL
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data["train"],
    tokenizer=tokenizer,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 7
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5
You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=5, training_loss=3.6867977142333985, metrics={'train_runtime': 542.647, 'train_samples_per_second': 0.064, 'train_steps_per_second': 0.009, 'total_flos': 18962165268480.0, 'train_loss': 3.6867977142333985, 'epoch': 5.0})

In [None]:
# Inference
# Runs the fine-tuned model over the same "train" split it was trained on,
# so this is a sanity check rather than an evaluation on unseen data.
y_pred = trainer.predict(train_data["train"])

***** Running Prediction *****
  Num examples = 7
  Batch size = 16


In [None]:
# Number of fields in the object returned by trainer.predict()
len(y_pred)

3

In [None]:
# Size of the first axis of the first element of `predictions`;
# expected to equal the number of examples that were predicted.
y_pred.predictions[0].shape[0]

7

In [None]:
# Decoding inferred logits
import numpy as np

# trainer.predict() was given labels, so the logits come from a forward pass
# conditioned on those labels; the argmax at a position is only meaningful
# where a real label token exists.
preds = np.argmax(y_pred.predictions[0], axis=-1)

# Positions whose label is -100 (padding, ignored by the loss) were never
# supervised. Their argmax is noise (long runs of repeated tokens after the
# closing </s>), so overwrite them with the pad token before decoding.
preds = np.where(y_pred.label_ids != -100, preds, tokenizer.pad_token_id)
print(preds)
print(preds.shape)

# Special tokens are kept because the <subj>/<obj> markers are needed to read
# the triplets; only the padding written above is stripped from the text.
decoded_preds = [
    text.replace(tokenizer.pad_token, "")
    for text in tokenizer.batch_decode(preds, skip_special_tokens=False)
]

[[    0     0  1437 10967    65  6317    12 29270   139  1591  1437 50266
    727  1437 50265   652   923     2     2  1866 10967 10967 10967 10967
  10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967
  10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967
  10967 10967 10967 10967 10967    65    65    65    65    65    65    65
     65    65    65    65]
 [    0     0 35196 11294    83     4 22756   281  1437 50266 10967   394
   1437 50265   737   547     2     2   392   392    53    53    53    53
     53    53    53    53    53    53    53    53    53    53    53    53
     53 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967
  10967 10967 10967 10967 10967 10967 10967 10967 10967 10967 10967   113
  10967 10967 10967 10967]
 [    0     0  1437 10967   292  6317    12 29270   139  1591  1437 50266
  10967  1437 50265   652   923     2     2 14333 10967 10967 10967 10967
  10967 10967 10967 10967 10967 10967 10967 10967 10967 10

In [None]:
# Show the decoded prediction strings, one per example
decoded_preds

['<s><s>  Philippine one hundred-peso note <subj> 100 <obj> face value</s></s>100 Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine one one one one one one one one one one one',
 '<s><s>Philipp Manuel A. Roxas <subj> Philippine president <obj> position held</s></s> May May but but but but but but but but but but but but but but but but but Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine Philippine" Philippine Philippine Philippine Philippine',
 '<s><s>  Philippine five hundred-peso note <subj> Phili