# Prepare the unannotated corpus for doccano

In [1]:
import os
from glob import glob
import csv
import pandas as pd
import re
import json

# Load data

In [2]:
from utils import load_conll

In [3]:
texts, tags = load_conll('08272022.conll')

In [4]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, random_state=7)

In [5]:
for i,j in zip(train_texts[3],train_tags[3]):
    print(i,'\t\t',j)

Fig 		 O
. 		 O
11 		 O
The 		 O
plot 		 O
of 		 O
ln 		 B-P
[ 		 I-P
ɛ 		 I-P
/ 		 I-P
σ−1 		 I-P
/ 		 I-P
E 		 I-P
] 		 I-P
vs 		 I-P
. 		 I-P
ln 		 I-P
ɛ 		 L-P
of 		 O
neat 		 U-S
and 		 O
nanophased 		 B-G
epoxy 		 L-G
. 		 O


In [6]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
# modified from https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=jrkdZBLYHVcB
class T5Dataset(Dataset):
    def __init__(self, tokens, tags, tokenizer, max_len, task_prefix):
        self.len = len(tokens)
        self.tokens = tokens
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.task_prefix = task_prefix
        # create encodings for tokens and labels
        self.unique_tags = set(tag for doc in tags for tag in doc)
        self.tag2id = {tag: _id for _id, tag in enumerate(self.unique_tags)}
        self.id2tag = {_id: tag for tag, _id in self.tag2id.items()}

    def __getitem__(self, index):
        # step 1: get the sentence and word labels (skip, we already have it)

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(self.task_prefix.split() + self.tokens[index],
                                  is_split_into_words=True,
#                                   return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        # step 3: use tokenizer to encode labels as a sentence
        target_encoding = self.tokenizer(self.tags[index],
                                         is_split_into_words=True,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=self.max_len)
        encoded_labels = torch.as_tensor(target_encoding.input_ids)
        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        encoded_labels[encoded_labels == tokenizer.pad_token_id] = -100
        item['labels'] = encoded_labels
        return item

    def __len__(self):
        return self.len

# Define evaluation metrics

In [7]:
import evaluate

metric = evaluate.load("seqeval")

In [8]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     print(f'preds: {decoded_preds}')
#     print(f'labels: {decoded_labels}')
    
    # Some simple post-processing
    decoded_preds = [pred.strip().split() for pred in decoded_preds]
    decoded_labels = [label.strip().split() for label in decoded_labels]
#     print(f'post-process preds: {decoded_preds}')
#     print(f'post-process labels: {decoded_labels}')
    
    # More post-processing to make sure the predicted seq has the same len as labels
    for i in range(len(decoded_labels)):
        label_len = len(decoded_labels[i])
        if len(decoded_preds[i]) > label_len:
            decoded_preds[i] = decoded_preds[i][:label_len]
        elif len(decoded_preds[i]) < label_len:
            decoded_preds[i] += ['O']*(label_len-len(decoded_preds[i]))
#     print(f'preds for eval: {decoded_preds}')
#     print(f'labels for eval: {decoded_labels}')

    all_metrics = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [9]:
predictions = [['O','B-P','L-P'],['N','O','O','O']]
references = [['O','B-P','L-P'],['M','O','O','O']]
metric.compute(predictions=predictions,references=references)



{'P': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 0.8571428571428571}

# T5 pretrained

In [10]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [11]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base",add_prefix_space=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
training_set = T5Dataset(train_texts, train_tags, tokenizer, max_len=200, task_prefix='')
val_set = T5Dataset(val_texts, val_tags, tokenizer, max_len=200, task_prefix='')

**Assumption**: Padding fixed to 200.

### Base model

In [12]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [13]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [13]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.251572,0.090293,0.13289,0.698398
2,0.413400,0.251194,0.231481,0.112867,0.151745,0.71103
3,0.260700,0.178076,0.343583,0.193378,0.247472,0.734992
4,0.187500,0.160181,0.374667,0.211437,0.270322,0.744042
5,0.187500,0.146192,0.412195,0.254327,0.314565,0.754851
6,0.150800,0.140893,0.424242,0.263356,0.324977,0.755242
7,0.125500,0.13836,0.444308,0.273138,0.338304,0.756544
8,0.107900,0.12726,0.48665,0.301731,0.372503,0.766897
9,0.107900,0.124863,0.471564,0.299473,0.366314,0.762925
10,0.092800,0.124522,0.487395,0.305493,0.375578,0.766311


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-381
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-381/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-381/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-381/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner/checkpoint-381/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner/checkpoint-381/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-762
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-762/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoi

Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-1143
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-1143/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-1143/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-1143/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner/checkpoint-1143/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner/checkpoint-1143/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-1524
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-1524/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-1524/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-1524/tokenizer_config

Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-2286
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-2286/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-2286/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-2286/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner/checkpoint-2286/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner/checkpoint-2286/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-2667
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-2667/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-2667/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-2667/tokenizer_config.json
Special tokens file saved in T5-pretrained-labels

Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-3429
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-3429/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-3429/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner/checkpoint-3429/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner/checkpoint-3429/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner/checkpoint-3810
Configuration saved in T5-pretrained-labelseq-ner/checkpoint-3810/config.json
Model weights saved in T5-pretrained-labelseq-ner/checkpoint-3810/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner/checkpoint-3810/tokenizer_config.json
Special tokens file saved in T5-pretrained-labels

TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3153.7099, 'train_samples_per_second': 9.655, 'train_steps_per_second': 1.208, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

### num_beams = 5

In [12]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [13]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [15]:
model.config.update({'num_beams':5})
model.config.num_beams

5

In [16]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams5",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=5,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.189459,0.100075,0.13097,0.677432
2,0.413400,0.251194,0.223022,0.116629,0.153162,0.706472
3,0.260700,0.178076,0.337563,0.20015,0.251299,0.736098
4,0.187500,0.160181,0.366751,0.217457,0.273028,0.743652
5,0.187500,0.146192,0.406286,0.262603,0.319013,0.753028
6,0.150800,0.140893,0.416084,0.268623,0.326475,0.753353
7,0.125500,0.13836,0.441141,0.279157,0.341935,0.756739
8,0.107900,0.12726,0.483452,0.30775,0.376092,0.765269
9,0.107900,0.124863,0.474118,0.303236,0.369894,0.763967
10,0.092800,0.124522,0.490476,0.310008,0.379899,0.76618


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-381
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-381/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-381/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-381/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-381/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5/checkpoint-381/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-762
Configuration saved in T

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5/checkpoint-1143/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-1524
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1524/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-1524/pytorch_model.bin
tokenizer config f

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5/checkpoint-2286/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-2667
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2667/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-2667/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpo

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5/checkpoint-3429/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5/checkpoint-3810
Configuration saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3810/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5/checkpoint-3810/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5/checkpo

TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3464.6921, 'train_samples_per_second': 8.789, 'train_steps_per_second': 1.1, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

### num_beams = 10

In [18]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /home/nanomineduke/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      

In [19]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [20]:
model.config.update({'num_beams':10})
model.config.num_beams

10

In [22]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams10",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=10,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.190743,0.102333,0.133203,0.675544
2,0.413400,0.251194,0.223665,0.116629,0.153314,0.706342
3,0.260700,0.178076,0.336294,0.199398,0.250354,0.735838
4,0.187500,0.160181,0.366751,0.217457,0.273028,0.743717
5,0.187500,0.146192,0.40676,0.262603,0.319159,0.752832
6,0.150800,0.140893,0.416084,0.268623,0.326475,0.753353
7,0.125500,0.13836,0.441141,0.279157,0.341935,0.756739
8,0.107900,0.12726,0.483452,0.30775,0.376092,0.765269
9,0.107900,0.124863,0.474118,0.303236,0.369894,0.763967
10,0.092800,0.124522,0.490476,0.310008,0.379899,0.76618


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-381
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-381/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-381/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-381/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-381/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams10/checkpoint-381/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-762
Configuration sav

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams10/checkpoint-1143/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-1524
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1524/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-1524/pytorch_model.bin
tokenizer

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams10/checkpoint-2286/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-2667
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2667/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-2667/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams10/checkpoint-3429/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams10/checkpoint-3810
Configuration saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3810/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams10/checkpoint-3810/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams

TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3814.6936, 'train_samples_per_second': 7.982, 'train_steps_per_second': 0.999, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

In [24]:
trainer._gen_kwargs

{'max_length': None, 'num_beams': 10}

### num_beams = 5 with force_words_ids

In [13]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [14]:
model.config.update({'num_beams':5})
model.config.num_beams

5

In [15]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_beams": 5,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
 

In [16]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams5-force",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=5,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
force_words_ids = tokenizer(list(training_set.tag2id.keys()), add_special_tokens=False).input_ids

In [18]:
from utils import Seq2SeqTrainerGenKwargs

trainer = Seq2SeqTrainerGenKwargs(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
#     bad_words_ids=bad_words_ids,
    force_words_ids=force_words_ids
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.004621,0.009782,0.006277,0.565764
2,0.413400,0.251194,0.004732,0.009782,0.006379,0.575791
3,0.260700,0.178076,0.006151,0.012792,0.008307,0.578331
4,0.187500,0.160181,0.007221,0.015801,0.009913,0.578591
5,0.187500,0.146192,0.009019,0.018811,0.012192,0.585298
6,0.150800,0.140893,0.008291,0.017306,0.011211,0.58367
7,0.125500,0.13836,0.008636,0.018059,0.011685,0.584842
8,0.107900,0.12726,0.008003,0.016554,0.01079,0.586795
9,0.107900,0.124863,0.008727,0.018059,0.011768,0.583865
10,0.092800,0.124522,0.00939,0.019564,0.012689,0.58406


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-381/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-762
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-762/config.json
M

Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1143/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1524
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1524/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1524/pytorch_model.bin
tokenizer config fi

Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1905/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-1905/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2286/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modi

Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-2667/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3048/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))


Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429/pytorch_model.bin
tokenizer config file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429/tokenizer_config.json
Special tokens file saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429/special_tokens_map.json
Copy vocab file to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3429/spiece.model
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3810
Configuration saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3810/config.json
Model weights saved in T5-pretrained-labelseq-ner-nbeams5-force/checkpoint-3810/pytorch_model.bin
tokenizer config fi

TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 4801.0518, 'train_samples_per_second': 6.342, 'train_steps_per_second': 0.794, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})