In [1]:
# from google.colab import drive
from pathlib import Path

# drive.mount('/content/drive')
# path = Path("/content/drive/MyDrive/fireloc/20220815")
path = Path("..")

# !pip install pandas datasets transformers
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


build train/eval datasets

In [2]:
dataframe = pd.read_excel(
    path / "data/ref_fire_face_v3_102019.xlsx",
    sheet_name="referencia",
    names=["", "text", "", "", "", "", "fire", ""],
    usecols=["text", "fire"],
)
tokenizer = AutoTokenizer.from_pretrained(
    "neuralmind/bert-large-portuguese-cased",
    do_lower_case=False,
)
datasets = (
    Dataset.from_pandas(dataframe)
    .map(
        lambda example: {
            **tokenizer(
                example["text"],
                padding="max_length",
                truncation=True,
                max_length=100,
            ),
            "label": int(bool(example["fire"])),
        },
        remove_columns=["text", "fire"],
    )
    .train_test_split(test_size=0.1)
)


100%|██████████| 2492/2492 [00:01<00:00, 1972.11ex/s]


define metrics

In [3]:
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
precision = evaluate.load("precision")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        **accuracy.compute(predictions=predictions, references=labels),
        **recall.compute(predictions=predictions, references=labels),
        **precision.compute(predictions=predictions, references=labels),
    }


build classifier model & trainer

In [4]:
classifier = AutoModelForSequenceClassification.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    num_labels=2,
)
arguments = TrainingArguments("test_trainer", num_train_epochs=3)
trainer = Trainer(
    model=classifier,
    args=arguments,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

train model

In [5]:
trainer.train()

***** Running training *****
  Num examples = 2242
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 843
 59%|█████▉    | 500/843 [23:21<14:37,  2.56s/it] Saving model checkpoint to test_trainer\checkpoint-500
Configuration saved in test_trainer\checkpoint-500\config.json


{'loss': 0.1578, 'learning_rate': 2.0344009489916967e-05, 'epoch': 1.78}


Model weights saved in test_trainer\checkpoint-500\pytorch_model.bin
100%|██████████| 843/843 [40:16<00:00,  2.61s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 843/843 [40:16<00:00,  2.87s/it]

{'train_runtime': 2416.4612, 'train_samples_per_second': 2.783, 'train_steps_per_second': 0.349, 'train_loss': 0.11319066564666277, 'epoch': 3.0}





TrainOutput(global_step=843, training_loss=0.11319066564666277, metrics={'train_runtime': 2416.4612, 'train_samples_per_second': 2.783, 'train_steps_per_second': 0.349, 'train_loss': 0.11319066564666277, 'epoch': 3.0})

In [6]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 250
  Batch size = 8
100%|██████████| 32/32 [00:25<00:00,  1.25it/s]


{'eval_loss': 0.18482740223407745,
 'eval_accuracy': 0.964,
 'eval_recall': 0.8333333333333334,
 'eval_precision': 0.8620689655172413,
 'eval_runtime': 26.1308,
 'eval_samples_per_second': 9.567,
 'eval_steps_per_second': 1.225,
 'epoch': 3.0}

In [7]:
trainer.save_model(path / "models/bert")

Saving model checkpoint to ..\models\bert
Configuration saved in ..\models\bert\config.json
Model weights saved in ..\models\bert\pytorch_model.bin
