# EXPERIMENTO 03 - TREINAMENTO DO MODELO UTILIZANDO A API DE TRANSFORMERS DO HUGGINGFACE E O MODELO PRÉ-TREINADO => https://huggingface.co/alfaneo/bertimbaulaw-base-portuguese-cased

Modelo ajustado com termos jurídicos com base no modelo pré-treinado: https://huggingface.co/neuralmind/bert-base-portuguese-cased

**Ambiente Google Colab Pro**

**INSTALAÇÃO DAS DEPENDÊNCIAS**

In [None]:
!pip install transformers datasets torch tqdm numpy pandas py7zr rouge_score

**IMPORTAÇÃO DAS BIBLIOTECAS**

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import EncoderDecoderModel
from datasets import load_metric
import torch
from tqdm.notebook import tqdm
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

**MONTAGEM DO DATASET COM OS PARES DE SENTENÇAS**

In [None]:
# Mount Google Drive so the dataset and output folders are reachable from Colab.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Base folder holding the dataset file; training outputs are written below it.
PATH = '/PATH_COLAB/transformers-exp03'

# Load the sentence pairs stored under the top-level "data" field of the JSON file.
dataset = load_dataset(PATH, data_files='dataset_132.879_pares_sentencas_stf.json', split='train', field="data")

# Hold out 5% for validation. The split is seeded so that the train/validation
# partition is identical across runs (the shuffle below was already seeded,
# but an unseeded split would still change which rows land in each partition).
ds = dataset.train_test_split(test_size=0.05, seed=42)
train_data = ds['train'].shuffle(seed=42)
val_data = ds['test']
# Last expression of the cell: displays the validation split summary.
val_data



Mounted at /content/gdrive


Dataset({
    features: ['original', 'simples'],
    num_rows: 6644
})

**DEFINIÇÃO DO MODELO**

In [None]:
# Dataset columns: source sentence (encoder input) and target sentence (labels).
ds_col_in, ds_col_out = 'original', 'simples'

# Checkpoint used for the tokenizer here and for the model later on.
model_name = 'alfaneo/bertimbaulaw-base-portuguese-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Stop right away unless a fast tokenizer implementation was loaded.
assert tokenizer.is_fast

# Maximum number of tokens kept on each side; longer texts are truncated.
encoder_max_length = decoder_max_length = 512


def process_data_to_model_inputs(batch):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        batch: Mapping holding the text columns named by ``ds_col_in``
            (source sentences) and ``ds_col_out`` (target sentences).

    Returns:
        The same ``batch`` object, extended with ``input_ids``,
        ``attention_mask`` and ``labels``. Every sequence is padded or
        truncated to ``encoder_max_length`` / ``decoder_max_length``.
    """
    pad_id = tokenizer.pad_token_id

    def _encode(texts, limit):
        # Fixed-length encoding: pad up to `limit`, truncate beyond it.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    source = _encode(batch[ds_col_in], encoder_max_length)
    target = _encode(batch[ds_col_out], decoder_max_length)

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    # Padding positions are replaced by -100 so the loss function ignores them.
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target.input_ids
    ]

    return batch

# Batch size shared by the tokenization pass below and by later cells.
v_batch_size = 8

# Both splits are tokenized the same way; the raw text columns are dropped
# once they have been converted to token ids.
_map_options = dict(
    batched=True,
    batch_size=v_batch_size,
    remove_columns=[ds_col_in, ds_col_out],
)
train_data = train_data.map(process_data_to_model_inputs, **_map_options)
val_data = val_data.map(process_data_to_model_inputs, **_map_options)

# Expose the tokenized columns as torch tensors.
_tensor_columns = ("input_ids", "attention_mask", "labels")
for _split in (train_data, val_data):
    _split.set_format(type="torch", columns=list(_tensor_columns))

Map:   0%|          | 0/126235 [00:00<?, ? examples/s]

Map:   0%|          | 0/6644 [00:00<?, ? examples/s]

**TESTES INICIAIS**

In [None]:
# Sanity check: build the loaders and inspect one training batch.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=v_batch_size)
val_dataloader = DataLoader(val_data, batch_size=v_batch_size)

batch = next(iter(train_dataloader))

separator = '---------------------------------------------------------'

# Shape of every tensor in the batch.
for tensor_name, tensor in batch.items():
    print(tensor_name, tensor.shape)
print(separator)

# First encoder input of the batch, decoded back to text (padding included).
first_input_ids = batch["input_ids"][0].tolist()
print(tokenizer.decode(first_input_ids))
print(separator)

# First label sequence without the -100 positions, which are not real token ids.
labels = [token_id for token_id in batch["labels"][0].tolist() if token_id != -100]
# Last expression of the cell: shown as the cell output.
tokenizer.decode(labels)

input_ids torch.Size([8, 512])
attention_mask torch.Size([8, 512])
labels torch.Size([8, 512])
---------------------------------------------------------
[CLS] ADMINISTRATIVO. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

'[CLS] DIREITO CIVIL PROCESSUAL. [SEP]'

**MÉTRICAS**

In [None]:
# Metrics: ROUGE scorer used by compute_rouge.
# NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
rouge = load_metric("rouge")


def compute_rouge(pred_ids, label_ids=None):
    """Compute ROUGE-2 precision, recall and F-measure for generated sequences.

    Two call styles are supported:

    * ``compute_rouge(eval_prediction)`` -- the way a Trainer invokes its
      ``compute_metrics`` callback: a single object exposing ``predictions``
      and ``label_ids``.
    * ``compute_rouge(pred_ids, label_ids)`` -- the original two-argument form.

    Args:
        pred_ids: Generated token ids, or the evaluation object described
            above when ``label_ids`` is omitted.
        label_ids: Reference token ids in which ignored positions are marked
            with ``-100``. Must support boolean-mask assignment (array or
            tensor); it is modified in place.

    Returns:
        Dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure`` (mid aggregate), each rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id

    if label_ids is None:
        # Single-argument call: unpack predictions and labels from the object.
        eval_prediction = pred_ids
        pred_ids = eval_prediction.predictions
        label_ids = eval_prediction.label_ids
        if isinstance(pred_ids, tuple):
            # Some models return extra outputs; the token ids come first.
            pred_ids = pred_ids[0]
        # Predictions gathered by the Trainer may be padded with -100, which
        # is not a valid token id and cannot be decoded.
        pred_ids[pred_ids == -100] = pad_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 marks positions ignored by the loss; map them back to the pad token
    # so that decoding skips them as special tokens.
    label_ids[label_ids == -100] = pad_id
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

**TREINAMENTO E DEFINIÇÃO DOS PARÂMETROS**

In [None]:
# Training
# Build a BERT-to-BERT encoder-decoder: both halves start from the same checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)
# Reuse the tokenizer's [CLS] / [SEP] / [PAD] ids as decoder-start,
# end-of-sequence and padding ids.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
# settings for the generate() method
model.config.max_length = 512 # 120
model.config.min_length = 40
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 0.8
model.config.num_beams = 3

# The previous run of this cell ended in a CUDA OutOfMemoryError with
# v_batch_size sequences of 512 tokens per step. Feed the GPU a quarter of the
# batch at a time and accumulate gradients so the effective batch size (and
# therefore the meaning of the step-based schedule below) stays the same.
# If memory is still exhausted, lower micro_batch_size further.
micro_batch_size = max(1, v_batch_size // 4)
accumulation_steps = max(1, v_batch_size // micro_batch_size)

training_arguments = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='steps',
    num_train_epochs=30,
    per_device_train_batch_size=micro_batch_size,
    # Evaluation generates with beam search, so keep its batch small as well.
    per_device_eval_batch_size=micro_batch_size,
    # Mixed precision only when a GPU is available.
    fp16=torch.cuda.is_available(),
    output_dir=PATH + '/output',
    logging_steps=100,
    save_steps=3000,
    eval_steps=10000,
    warmup_steps=2000,
    gradient_accumulation_steps=accumulation_steps,
    # Keep only the 3 most recent checkpoints on disk.
    save_total_limit=3
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    compute_metrics=compute_rouge,
    train_dataset=train_data,
    eval_dataset=val_data
)

trainer.train()
# Persist the final model next to the dataset folder.
trainer.save_model(PATH + '/model')

Downloading (…)lve/main/config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at alfaneo/bertimbaulaw-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at alfaneo/bertimbaulaw-base-portuguese-cased and are newly initialized: ['bert

Step,Training Loss,Validation Loss


OutOfMemoryError: ignored