# Prepare the unannotated corpus for doccano

In [1]:
import os
from glob import glob
import csv
import pandas as pd
import re
import json

# Load data

In [2]:
from utils import load_conll

In [3]:
texts, tags = load_conll('08272022.conll')

In [4]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2, random_state=7)

In [5]:
for i,j in zip(train_texts[3],train_tags[3]):
    print(i,'\t\t',j)

Fig 		 O
. 		 O
11 		 O
The 		 O
plot 		 O
of 		 O
ln 		 B-P
[ 		 I-P
ɛ 		 I-P
/ 		 I-P
σ−1 		 I-P
/ 		 I-P
E 		 I-P
] 		 I-P
vs 		 I-P
. 		 I-P
ln 		 I-P
ɛ 		 L-P
of 		 O
neat 		 U-S
and 		 O
nanophased 		 B-G
epoxy 		 L-G
. 		 O


In [6]:
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
# modified from https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=jrkdZBLYHVcB
class T5Dataset(Dataset):
    def __init__(self, tokens, tags, tokenizer, max_len, task_prefix):
        self.len = len(tokens)
        self.tokens = tokens
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.task_prefix = task_prefix
        # create encodings for tokens and labels
        self.unique_tags = set(tag for doc in tags for tag in doc)
        self.tag2id = {tag: _id for _id, tag in enumerate(self.unique_tags)}
        self.id2tag = {_id: tag for tag, _id in self.tag2id.items()}

    def __getitem__(self, index):
        # step 1: get the sentence and word labels (skip, we already have it)

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(self.task_prefix.split() + self.tokens[index],
                                  is_split_into_words=True,
#                                   return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        # step 3: use tokenizer to encode labels as a sentence
        target_encoding = self.tokenizer(self.tags[index],
                                         is_split_into_words=True,
                                         padding='max_length',
                                         truncation=True,
                                         max_length=self.max_len)
        encoded_labels = torch.as_tensor(target_encoding.input_ids)
        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        # replace padding token id's of the labels by -100 so it's ignored by the loss
        encoded_labels[encoded_labels == tokenizer.pad_token_id] = -100
        item['labels'] = encoded_labels
        return item

    def __len__(self):
        return self.len

# Define evaluation metrics

In [7]:
import evaluate

metric = evaluate.load("seqeval")

In [8]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # somehow preds are also getting padded with -100s...
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip().split() for pred in decoded_preds]
    decoded_labels = [label.strip().split() for label in decoded_labels]

    # More post-processing to make sure the predicted seq has the same len as labels
    for i in range(len(decoded_labels)):
        label_len = len(decoded_labels[i])
        if len(decoded_preds[i]) > label_len:
            decoded_preds[i] = decoded_preds[i][:label_len]
        elif len(decoded_preds[i]) < label_len:
            decoded_preds[i] += ['O']*(label_len-len(decoded_preds[i]))

    all_metrics = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [9]:
predictions = [['O','B-P','L-P'],['N','O','O','O']]
references = [['O','B-P','L-P'],['M','O','O','O']]
metric.compute(predictions=predictions,references=references)



{'P': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 0.8571428571428571}

# T5 pretrained

In [9]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [10]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base",add_prefix_space=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
training_set = T5Dataset(train_texts, train_tags, tokenizer, max_len=200, task_prefix='')
val_set = T5Dataset(val_texts, val_tags, tokenizer, max_len=200, task_prefix='')

**Assumption**: Padding fixed to 200.

### Base model

In [12]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [13]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [13]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.251572,0.090293,0.13289,0.698398
2,0.413400,0.251194,0.231481,0.112867,0.151745,0.71103
3,0.260700,0.178076,0.343583,0.193378,0.247472,0.734992
4,0.187500,0.160181,0.374667,0.211437,0.270322,0.744042
5,0.187500,0.146192,0.412195,0.254327,0.314565,0.754851
6,0.150800,0.140893,0.424242,0.263356,0.324977,0.755242
7,0.125500,0.13836,0.444308,0.273138,0.338304,0.756544
8,0.107900,0.12726,0.48665,0.301731,0.372503,0.766897
9,0.107900,0.124863,0.471564,0.299473,0.366314,0.762925
10,0.092800,0.124522,0.487395,0.305493,0.375578,0.766311


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3153.7099, 'train_samples_per_second': 9.655, 'train_steps_per_second': 1.208, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

### num_beams = 5

In [12]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [13]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [15]:
model.config.update({'num_beams':5})
model.config.num_beams

5

In [16]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams5",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=5,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.189459,0.100075,0.13097,0.677432
2,0.413400,0.251194,0.223022,0.116629,0.153162,0.706472
3,0.260700,0.178076,0.337563,0.20015,0.251299,0.736098
4,0.187500,0.160181,0.366751,0.217457,0.273028,0.743652
5,0.187500,0.146192,0.406286,0.262603,0.319013,0.753028
6,0.150800,0.140893,0.416084,0.268623,0.326475,0.753353
7,0.125500,0.13836,0.441141,0.279157,0.341935,0.756739
8,0.107900,0.12726,0.483452,0.30775,0.376092,0.765269
9,0.107900,0.124863,0.474118,0.303236,0.369894,0.763967
10,0.092800,0.124522,0.490476,0.310008,0.379899,0.76618


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3464.6921, 'train_samples_per_second': 8.789, 'train_steps_per_second': 1.1, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

### num_beams = 10

In [18]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

loading configuration file https://huggingface.co/t5-base/resolve/main/config.json from cache at /home/nanomineduke/.cache/huggingface/transformers/91e9fe874e06c44883b535d6c950b8b89d6eaa3298d8e7fb3b2c78039e9f8b7b.66b9637a52aa11e9285cdd6e668cc0df14b3bcf0b6674cf3ba5353c542649637
Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      

In [19]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "tr

In [20]:
model.config.update({'num_beams':10})
model.config.num_beams

10

In [22]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams10",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=10,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.190743,0.102333,0.133203,0.675544
2,0.413400,0.251194,0.223665,0.116629,0.153314,0.706342
3,0.260700,0.178076,0.336294,0.199398,0.250354,0.735838
4,0.187500,0.160181,0.366751,0.217457,0.273028,0.743717
5,0.187500,0.146192,0.40676,0.262603,0.319159,0.752832
6,0.150800,0.140893,0.416084,0.268623,0.326475,0.753353
7,0.125500,0.13836,0.441141,0.279157,0.341935,0.756739
8,0.107900,0.12726,0.483452,0.30775,0.376092,0.765269
9,0.107900,0.124863,0.474118,0.303236,0.369894,0.763967
10,0.092800,0.124522,0.490476,0.310008,0.379899,0.76618


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 3814.6936, 'train_samples_per_second': 7.982, 'train_steps_per_second': 0.999, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

In [24]:
trainer._gen_kwargs

{'max_length': None, 'num_beams': 10}

### num_beams = 5 with force_words_ids

In [13]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [14]:
model.config.update({'num_beams':5})
model.config.num_beams

5

In [15]:
model.config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_beams": 5,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
 

In [16]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-nbeams5-force",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    num_train_epochs=10,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_num_beams=5,
    seed=7,
#     no_cuda=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
force_words_ids = tokenizer(list(training_set.tag2id.keys()), add_special_tokens=False).input_ids

In [18]:
from utils import Seq2SeqTrainerGenKwargs

trainer = Seq2SeqTrainerGenKwargs(
    model=model,
    args=args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
#     bad_words_ids=bad_words_ids,
    force_words_ids=force_words_ids
)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3810


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307435,0.004621,0.009782,0.006277,0.565764
2,0.413400,0.251194,0.004732,0.009782,0.006379,0.575791
3,0.260700,0.178076,0.006151,0.012792,0.008307,0.578331
4,0.187500,0.160181,0.007221,0.015801,0.009913,0.578591
5,0.187500,0.146192,0.009019,0.018811,0.012192,0.585298
6,0.150800,0.140893,0.008291,0.017306,0.011211,0.58367
7,0.125500,0.13836,0.008636,0.018059,0.011685,0.584842
8,0.107900,0.12726,0.008003,0.016554,0.01079,0.586795
9,0.107900,0.124863,0.008727,0.018059,0.011768,0.583865
10,0.092800,0.124522,0.00939,0.019564,0.012689,0.58406


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3810, training_loss=0.18261083818170343, metrics={'train_runtime': 4801.0518, 'train_samples_per_second': 6.342, 'train_steps_per_second': 0.794, 'total_flos': 7243268659200000.0, 'train_loss': 0.18261083818170343, 'epoch': 10.0})

# With control over logitsprocessor

In [12]:
# get possible ids of label tokens
labelids = set()
for thing in training_set:
    labelids.update(thing['labels'].numpy())
labelids.remove(tokenizer.eos_token_id)
labelids.remove(-100)
labelids = torch.LongTensor(list(labelids))
labelids

tensor([517, 134,  27, 301, 272,  18, 254, 345, 411, 412])

In [13]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

In [14]:
from finetune import MyTrainer
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

### learning_rate=5e-5, batch_size=8, weight_decay=1e-2

In [15]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=5e-5, num_train_epochs=30, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=8, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

In [16]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11430


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.307162,0.124682,0.07374,0.092671,0.623649
2,0.472000,0.271871,0.143836,0.094808,0.114286,0.600729
3,0.304400,0.224287,0.211828,0.148232,0.174413,0.680427
4,0.243300,0.215647,0.247152,0.21219,0.22834,0.702305
5,0.243300,0.170373,0.313205,0.292701,0.302606,0.732647
6,0.195400,0.161528,0.362781,0.376975,0.369742,0.758562
7,0.164700,0.145103,0.409706,0.387509,0.398299,0.775817
8,0.142100,0.149049,0.352006,0.349887,0.350943,0.749186
9,0.142100,0.140563,0.468796,0.474793,0.471776,0.802578
10,0.123800,0.126656,0.511389,0.523702,0.517472,0.818531


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=11430, training_loss=0.11802315165468819, metrics={'train_runtime': 17367.7344, 'train_samples_per_second': 5.26, 'train_steps_per_second': 0.658, 'total_flos': 2.17298059776e+16, 'train_loss': 0.11802315165468819, 'epoch': 30.0})

### learning_rate=1e-4, batch_size=8, weight_decay=1e-2

In [17]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr1e-4", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-4, num_train_epochs=30, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=8, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [18]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11430


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.151111,0.503084,0.552295,0.526542,0.815341
2,0.086900,0.140965,0.456781,0.48909,0.472384,0.799909
3,0.077800,0.134222,0.590701,0.583145,0.586899,0.835525
4,0.070300,0.143641,0.502505,0.528217,0.51504,0.812801
5,0.070300,0.137537,0.56032,0.580135,0.570055,0.838651
6,0.059900,0.134703,0.576379,0.613243,0.59424,0.84959
7,0.053500,0.134402,0.607349,0.634312,0.620537,0.859422
8,0.050200,0.138921,0.613201,0.650113,0.631118,0.85838
9,0.050200,0.138878,0.593772,0.645598,0.618601,0.855255
10,0.043200,0.136779,0.603175,0.629044,0.615838,0.849394


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=11430, training_loss=0.0372927551403029, metrics={'train_runtime': 17398.8882, 'train_samples_per_second': 5.25, 'train_steps_per_second': 0.657, 'total_flos': 2.17298059776e+16, 'train_loss': 0.0372927551403029, 'epoch': 30.0})

### learning_rate=2e-4, batch_size=8, weight_decay=1e-2

In [19]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr2e-4", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-4, num_train_epochs=30, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=8, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [20]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 11430


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.147819,0.553282,0.589917,0.571012,0.830707
2,0.058200,0.134215,0.585851,0.598194,0.591958,0.842948
3,0.055100,0.133107,0.587554,0.610986,0.599041,0.843795
4,0.050200,0.140343,0.606856,0.626035,0.616296,0.855645
5,0.050200,0.143708,0.593772,0.631302,0.611962,0.84959
6,0.044800,0.153496,0.600678,0.666667,0.631954,0.853106
7,0.039500,0.167229,0.61186,0.68322,0.645574,0.860854
8,0.035700,0.137771,0.641993,0.678706,0.659839,0.868603
9,0.035700,0.16384,0.582332,0.620015,0.600583,0.848613
10,0.032400,0.154596,0.646817,0.711061,0.677419,0.869124


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=11430, training_loss=0.026494044644313222, metrics={'train_runtime': 17118.9298, 'train_samples_per_second': 5.336, 'train_steps_per_second': 0.668, 'total_flos': 2.17298059776e+16, 'train_loss': 0.026494044644313222, 'epoch': 30.0})

### learning_rate=1e-4, batch_size=16, weight_decay=1e-2

In [21]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr1e-4-batch-16", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=1e-4, num_train_epochs=40, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [22]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.189311,0.679639,0.735892,0.706647,0.88729
2,No log,0.16054,0.659816,0.700527,0.679562,0.879281
3,0.016000,0.167983,0.698368,0.740406,0.718773,0.88996
4,0.016000,0.184332,0.691197,0.756208,0.722242,0.891001
5,0.016000,0.171497,0.72311,0.748683,0.735675,0.897252
6,0.015400,0.180105,0.706847,0.745673,0.725741,0.892043
7,0.015400,0.175856,0.710014,0.768247,0.737983,0.900703
8,0.013800,0.162801,0.727338,0.760722,0.743656,0.899987
9,0.013800,0.18871,0.66166,0.696012,0.678401,0.873681
10,0.013800,0.181263,0.717407,0.765989,0.740902,0.893997


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762


  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7640, training_loss=0.009943935345292716, metrics={'train_runtime': 22236.8352, 'train_samples_per_second': 5.477, 'train_steps_per_second': 0.344, 'total_flos': 2.89730746368e+16, 'train_loss': 0.009943935345292716, 'epoch': 40.0})

### learning_rate=2e-4, batch_size=16

In [23]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr2e-4-batch-16", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=2e-4, num_train_epochs=40, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.205056,0.635402,0.677953,0.655988,0.865412
2,No log,0.164358,0.666438,0.732129,0.697741,0.879086
3,0.021200,0.148122,0.685997,0.729872,0.707255,0.883774
4,0.021200,0.161662,0.705841,0.754703,0.729455,0.892239
5,0.021200,0.161924,0.69021,0.742664,0.715477,0.892434
6,0.019300,0.155263,0.695497,0.732129,0.713343,0.885337
7,0.019300,0.158418,0.707143,0.744921,0.72554,0.89608
8,0.018000,0.157383,0.695431,0.721595,0.708272,0.888071
9,0.018000,0.158764,0.7127,0.772761,0.741516,0.90168
10,0.018000,0.159907,0.703678,0.76298,0.73213,0.894908


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7640, training_loss=0.0109601586823064, metrics={'train_runtime': 22321.3553, 'train_samples_per_second': 5.457, 'train_steps_per_second': 0.342, 'total_flos': 2.89730746368e+16, 'train_loss': 0.0109601586823064, 'epoch': 40.0})

### learning_rate=3e-4, batch_size=16, weight_decay=1e-2

In [25]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr3e-4-batch-16", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=3e-4, num_train_epochs=40, weight_decay=0.01, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.16872,0.699489,0.721595,0.71037,0.879867
2,No log,0.140741,0.73003,0.742664,0.736292,0.894452
3,0.025600,0.155511,0.705202,0.734387,0.719499,0.884881
4,0.025600,0.139898,0.718412,0.748683,0.733235,0.896601
5,0.025600,0.16425,0.719348,0.763732,0.740876,0.898489
6,0.020900,0.159383,0.727532,0.767494,0.746979,0.899401
7,0.020900,0.148544,0.695622,0.753198,0.723266,0.886639
8,0.019200,0.147332,0.726826,0.778781,0.751907,0.894713
9,0.019200,0.142774,0.719573,0.760722,0.739576,0.897838
10,0.019200,0.190971,0.705314,0.768999,0.735781,0.887811


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7640, training_loss=0.011748976190208765, metrics={'train_runtime': 22033.7458, 'train_samples_per_second': 5.528, 'train_steps_per_second': 0.347, 'total_flos': 2.89730746368e+16, 'train_loss': 0.011748976190208765, 'epoch': 40.0})

### learning_rate=3e-4, batch_size=16,weight_decay=1e-1

In [27]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr3e-4-batch-16-wd-1e-1", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=3e-4, num_train_epochs=40, weight_decay=0.1, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [28]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.167076,0.719573,0.760722,0.739576,0.901354
2,No log,0.174013,0.698571,0.735892,0.716746,0.888657
3,0.019700,0.128384,0.73767,0.776524,0.756598,0.899857
4,0.019700,0.148579,0.708897,0.749436,0.728603,0.897122
5,0.019700,0.179034,0.677669,0.72611,0.701053,0.886574
6,0.016100,0.165121,0.73415,0.766742,0.750092,0.899401
7,0.016100,0.171342,0.725783,0.766742,0.745701,0.89862
8,0.016500,0.159724,0.673759,0.714823,0.693684,0.879997
9,0.016500,0.19832,0.709655,0.774266,0.740554,0.896731
10,0.016500,0.174795,0.708799,0.763732,0.735241,0.894973


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762


  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762


  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7640, training_loss=0.009427958137857977, metrics={'train_runtime': 22135.1556, 'train_samples_per_second': 5.503, 'train_steps_per_second': 0.345, 'total_flos': 2.89730746368e+16, 'train_loss': 0.009427958137857977, 'epoch': 40.0})

### learning_rate=3e-4, batch_size=16,weight_decay=1e-3

In [29]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr3e-4-batch-16-wd-1e-3", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=3e-4, num_train_epochs=40, weight_decay=1e-3, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.154543,0.742229,0.736644,0.739426,0.889504
2,No log,0.167995,0.691983,0.740406,0.715376,0.891522
3,0.017000,0.185671,0.675958,0.729872,0.701881,0.885337
4,0.017000,0.135336,0.745848,0.777276,0.761238,0.899857
5,0.017000,0.144473,0.75,0.774266,0.76194,0.903373
6,0.014800,0.159257,0.724426,0.783296,0.752711,0.901485
7,0.014800,0.157012,0.720506,0.772009,0.745369,0.895885
8,0.011900,0.160331,0.741269,0.782543,0.761347,0.904675
9,0.011900,0.156097,0.746408,0.781791,0.76369,0.905131
10,0.011900,0.15944,0.7445,0.789315,0.766253,0.908256


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762


  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
***** Running Evaluation *****
  Num examples = 762
  Batch size = 8




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=7640, training_loss=0.007843361919775058, metrics={'train_runtime': 22257.0808, 'train_samples_per_second': 5.472, 'train_steps_per_second': 0.343, 'total_flos': 2.89730746368e+16, 'train_loss': 0.007843361919775058, 'epoch': 40.0})

### learning_rate=3e-4, batch_size=16,weight_decay=1e-4

In [31]:
args = Seq2SeqTrainingArguments(
    "T5-pretrained-labelseq-ner-logitsprocessor-lr3e-4-batch-16-wd-1e-4", evaluation_strategy="epoch", save_strategy="epoch",
    learning_rate=3e-4, num_train_epochs=40, weight_decay=1e-4, predict_with_generate=True,
    per_device_train_batch_size=16, seed=7, #     no_cuda=True,
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = MyTrainer(
    model=model, args=args, train_dataset=training_set, eval_dataset=val_set,
    data_collator=data_collator, compute_metrics=compute_metrics, tokenizer=tokenizer,
)
beamsz = 5

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.set_things(labelids, tokenizer, beamsz)
trainer.train()

***** Running training *****
  Num examples = 3045
  Num Epochs = 40
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7640


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.173764,0.714286,0.775019,0.743414,0.897643


***** Running Evaluation *****
  Num examples = 762
  Batch size = 8
