<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_doc_classification_BERTimbau_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification for the Victor project using BERTimbau as the machine learning model

Because of memory limitations, this notebook only runs the training (with evaluation on the validation set). For the test-set results, see the separate test notebook.

The implementation is based on the Hugging Face `transformers` and `datasets` libraries.

### Installing dependencies

In [None]:
!pip install transformers
!pip install datasets

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cd/40/866cbfac4601e0f74c7303d533a9c5d4a53858bd402e08e3e294dd271f25/transformers-4.2.1-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 22.6MB/s eta 0:00:01[K     |▍                               | 20kB 15.8MB/s eta 0:00:01[K     |▋                               | 30kB 13.7MB/s eta 0:00:01[K     |▊                               | 40kB 12.5MB/s eta 0:00:01[K     |█                               | 51kB 8.4MB/s eta 0:00:01[K     |█▏                              | 61kB 7.6MB/s eta 0:00:01[K     |█▎                              | 71kB 8.5MB/s eta 0:00:01[K     |█▌                              | 81kB 9.4MB/s eta 0:00:01[K     |█▊                              | 92kB 10.2MB/s eta 0:00:01[K     |█▉                              | 102kB 8.1MB/s eta 0:00:01[K     |██                              | 112kB 8.1MB/s eta 0:00:01[K     |██▎                             | 122kB 

### Loading pretrained model and its tokenizer

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Using the community model (https://huggingface.co/neuralmind/bert-base-portuguese-cased)
model_name = 'neuralmind/bert-base-portuguese-cased'
# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# num_labels=6 matches the six document classes listed in the id2label mapping defined in the next cell.
# The warning printed below reports that some weights of the classification model are not
# initialized from the checkpoint, which is why the model is fine-tuned later in this notebook.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=6)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=209528.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=647.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438235074.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

In [None]:
# Integer class id -> document-type name. The ids are the values used as training labels.
id2label = {
    0: 'acordao_de_2_instancia',
    1: 'agravo_em_recurso_extraordinario',
    2: 'despacho_de_admissibilidade',
    3: 'outros',
    4: 'peticao_do_RE',
    5: 'sentenca',
}
# Inverse mapping (name -> id), used to encode the label column of the CSV files.
label2id = {name: class_id for class_id, name in id2label.items()}

# Store BOTH directions in the model config. Setting only id2label leaves the config's
# label2id at its default value, so a saved checkpoint would carry two mappings
# that disagree with each other.
model.config.id2label = id2label
model.config.label2id = label2id

### Loading and preprocessing dataset

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive (this cell only works inside Google Colab).
drive.mount('/content/gdrive', force_remount=True)
# Root of the mounted Drive; dataset_dir in the next cell is built on top of it.
root_dir = "/content/gdrive/My Drive/"

Mounted at /content/gdrive


In [None]:
# Folder the CSV files are read from; the model output directory (and the validation
# report written at the end of the notebook) is created below this folder as well.
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

In [None]:
import torch
import torchtext

# Maximum number of tokens per text sample; used as the tokenizer's max_length,
# so longer texts are truncated and shorter ones are padded to this size.
S = 250

# Fraction of the train and validation datasets to be used
dataset_fraction = 0.6

# A fraction of exactly 1.0 selects the full files; any other value selects the
# files whose names carry the '-croped_<fraction>' suffix.
train_ds_file, validation_ds_file = (
    ('train_small.csv', 'validation_small.csv')
    if dataset_fraction == 1.0 else
    (f'train_small.csv-croped_{dataset_fraction}.csv',
     f'validation_small.csv-croped_{dataset_fraction}.csv')
)

In [None]:
import torch

class VictorDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with their class labels.

  Args:
    encodings: indexable collection; each element must provide 'input_ids'
      and 'attention_mask' entries.
    labels: indexable collection of labels, aligned with `encodings`.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The dataset size is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    """Return one sample as a dict with 'input_ids', 'attention_mask' and 'labels'."""
    encoding = self.encodings[idx]
    # Both token-level features are converted to int64 tensors.
    sample = {
        field: torch.tensor(encoding[field], dtype=torch.long)
        for field in ('input_ids', 'attention_mask')
    }
    # The label is passed through unchanged (no tensor conversion here).
    sample['labels'] = self.labels[idx]
    return sample

In [None]:
import io

def load_and_preprocess(ds_path):
  """Read a CSV file and tokenize its rows into a VictorDataset.

  The file is looked up under `dataset_dir`. The first row is treated as a
  header and skipped. For every remaining row, column 3 is translated to a
  class id through `label2id` and column 5 is tokenized, truncated/padded to
  `S` tokens.

  Args:
    ds_path: file name of the CSV, relative to `dataset_dir`.

  Returns:
    VictorDataset holding one encoding and one label id per data row
    (empty if the file has no data rows).

  Raises:
    KeyError: if a row's label is not a key of `label2id`.
  """
  print(f'Loading data from {ds_path}')
  label_col, text_col = 3, 5  # positions of the label and text columns in each row
  label_ids = []
  encodings = []
  # newline='' is what the csv module requires for files handed to a reader: it lets
  # the parser deal with line endings itself, so newlines inside quoted fields are
  # not altered before parsing.
  with open(dataset_dir + ds_path, encoding="utf8", newline='') as f:
    reader = torchtext.utils.unicode_csv_reader(f)
    # Skip the header; the default avoids a StopIteration on a completely empty file.
    next(reader, None)
    for line in reader:
      label_ids.append(label2id[line[label_col]])
      encodings.append(tokenizer(line[text_col], truncation=True, padding='max_length', max_length=S))
  return VictorDataset(encodings, label_ids)

In [None]:
%%time

# Read and tokenize both splits once, up front; the resulting datasets are handed to the Trainer below.
ds_train = load_and_preprocess(train_ds_file)
ds_validation = load_and_preprocess(validation_ds_file)

Loading data from train_small.csv-croped_0.6.csv
Loading data from validation_small.csv-croped_0.6.csv
CPU times: user 1min 50s, sys: 4.57 s, total: 1min 55s
Wall time: 1min 56s


### Fine tuning


In [None]:
from datasets import load_metric

# F1 metric object used by compute_metrics below.
# NOTE(review): load_metric is deprecated in later `datasets` releases (in favour of the
# separate `evaluate` package) — confirm it is available in the installed version.
metric = load_metric('f1')

def compute_metrics(eval_pred):
  """Compute macro- and weighted-averaged F1 for a Trainer evaluation step.

  Args:
    eval_pred: object exposing `predictions` (per-class scores; the argmax over
      the last axis is taken as the predicted class) and `label_ids`.

  Returns:
    dict with the keys 'f1_macro' and 'f1_weighted'.
  """
  gold = eval_pred.label_ids
  predicted = eval_pred.predictions.argmax(-1)
  scores = {}
  # Same metric object, evaluated once per averaging mode.
  for key, averaging in (('f1_macro', 'macro'), ('f1_weighted', 'weighted')):
    result = metric.compute(predictions=predicted, references=gold, average=averaging)
    scores[key] = result['f1']
  return scores


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1800.0, style=ProgressStyle(description…




In [None]:
from transformers import Trainer, TrainingArguments

# Output folder for this run, named after the sequence length (S) and the dataset fraction.
model_dir = '{}/BERTimbau{}-{}/'.format(dataset_dir, S, dataset_fraction)

# Hyperparameters of the fine-tuning run.
training_args = TrainingArguments(
    output_dir=model_dir,            # output directory
    logging_dir='./logs',            # directory for storing logs
    num_train_epochs=5,              # total # of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
    metric_for_best_model='f1_macro', # "best" is judged by the f1_macro value returned by compute_metrics
    evaluation_strategy='epoch'      # evaluate on the validation set after every epoch
)

# The Trainer ties together the model, the training/validation datasets and the metric function.
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=ds_train, 
    eval_dataset=ds_validation, 
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning configured above. The per-epoch table and the returned TrainOutput
# (global step, training loss, runtime metrics) are shown in the cell output below.
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Weighted,Runtime,Samples Per Second
1,0.2561,0.280151,0.707405,0.904171,1015.7129,60.148
2,0.184,0.283353,0.743895,0.912077,1011.1311,60.42
3,0.1324,0.299502,0.762307,0.916808,1014.6879,60.209
4,0.0915,0.363205,0.780621,0.922181,1010.6513,60.449
5,0.0603,0.406096,0.766608,0.921624,1014.892,60.197


TrainOutput(global_step=14935, training_loss=0.1649681992941091, metrics={'train_runtime': 27661.7788, 'train_samples_per_second': 0.54, 'total_flos': 78070969299375000, 'epoch': 5.0})

In [None]:
#model.save_pretrained(model_dir)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Predict on the validation set with the fine-tuned model.
outputs = trainer.predict(test_dataset=ds_validation)

# Take the class ids and names from id2label instead of repeating the names by hand,
# so the report rows cannot drift from the mapping used to encode the labels.
class_ids = sorted(id2label)

report = classification_report(
    ds_validation.labels,
    np.argmax(outputs.predictions, axis=1),  # predicted class = highest-scoring output
    labels=class_ids,  # explicit labels: a class absent from the data no longer breaks target_names
    digits=4,
    target_names=[id2label[class_id] for class_id in class_ids])

print(report)

# Persist the report next to the model checkpoints. The context manager closes the
# file even if the write fails.
with open(model_dir + "report_validation.txt", "wt") as rep_file:
  rep_file.write(report)

                                  precision    recall  f1-score   support

          acordao_de_2_instancia       0.81      0.80      0.81       299
agravo_em_recurso_extraordinario       0.62      0.59      0.60      2149
     despacho_de_admissibilidade       0.75      0.73      0.74       183
                          outros       0.95      0.96      0.96     50462
                   peticao_do_RE       0.81      0.79      0.80      6364
                        sentenca       0.81      0.75      0.78      1636

                        accuracy                           0.92     61093
                       macro avg       0.79      0.77      0.78     61093
                    weighted avg       0.92      0.92      0.92     61093

