<a href="https://colab.research.google.com/github/alexlimatds/victor-doc_classification/blob/main/victor_doc_classification_BERTimbau_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Document classification for the Victor project using BERTimbau as the machine learning model

Because of memory limitations, this notebook runs only the test (evaluation) step. For training, see the dedicated training notebook.

This application is built on the Hugging Face Transformers library.

### Installing dependencies

In [None]:
!pip install transformers
!pip install datasets



### Mounting Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (a remount is forced on every run)
drive.mount('/content/gdrive', force_remount=True)
# Base folder inside the mounted drive; the dataset directory is built from it
root_dir = '/content/gdrive/My Drive/'

Mounted at /content/gdrive


In [None]:
# Folder with the dataset CSV files; the saved model directory is also resolved relative to it
dataset_dir = root_dir + 'Machine Learning/Victor datasets/'

### Application parameters

In [None]:
# Directory of the saved classifier; the evaluation report is also written into it
model_path = dataset_dir + 'BERTimbau250-0.6/'

S = 250 # Sequence length in tokens: the tokenizer truncates/pads every sample to this size

# Dataset file to evaluate, relative to dataset_dir
# Alternative input kept for reference (disabled): 'train_small.csv-croped_0.6.csv'
test_ds_file = 'test_small.csv'

### Loading saved model and its tokenizer

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# The tokenizer is loaded by the name of the base BERTimbau checkpoint, while the
# classifier (the fine-tuned model) is loaded from the directory in model_path.
# NOTE(review): assumes fine-tuning kept the base vocabulary unchanged - TODO confirm
model_name = 'neuralmind/bert-base-portuguese-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_path)


In [None]:
# Class index -> document type.
# NOTE(review): presumably the same index order that was used for training - verify
id2label = {
    0: 'acordao_de_2_instancia',
    1: 'agravo_em_recurso_extraordinario',
    2: 'despacho_de_admissibilidade',
    3: 'outros',
    4: 'peticao_do_RE',
    5: 'sentenca'}
# Inverse mapping, used to turn the label names found in the CSV into class indices
label2id = {v : k for (k, v) in id2label.items()}

# Register both directions on the model config so the two mappings cannot disagree
model.config.id2label = id2label
model.config.label2id = label2id

### Loading and preprocessing dataset

In [None]:
import torch
import torchtext

class VictorDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenized samples with their class ids.

  Args:
    encodings: indexable collection where each item exposes the keys
      'input_ids' and 'attention_mask'.
    labels: indexable collection with one label per encoding; its length
      defines the dataset length.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The number of samples is driven by the labels collection
    return len(self.labels)

  def __getitem__(self, idx):
    """Return sample `idx` as a dict with long tensors and the raw label."""
    sample = {
        field: torch.tensor(self.encodings[idx][field], dtype=torch.long)
        for field in ('input_ids', 'attention_mask')
    }
    # The label is returned as stored, without tensor conversion
    sample['labels'] = self.labels[idx]
    return sample

In [None]:
import io

def load_and_preprocess(ds_path):
  """Read a CSV dataset file and tokenize it into a VictorDataset.

  The first row is treated as a header and skipped. For every remaining row,
  column 3 is looked up in `label2id` and column 5 is tokenized with
  truncation and padding to `S` tokens.

  Args:
    ds_path: file name of the CSV, relative to `dataset_dir`.

  Returns:
    A VictorDataset holding one encoding and one label id per data row.

  Raises:
    KeyError: if a row carries a label that is not a key of `label2id`.
  """
  print(f'Loading data from {ds_path}')
  targets, tokenized = [], []
  with io.open(dataset_dir + ds_path, encoding="utf8") as csv_file:
    rows = torchtext.utils.unicode_csv_reader(csv_file)
    next(rows)  # the first row only holds the column names
    for row in rows:
      targets.append(label2id[row[3]])
      text = row[5]
      tokenized.append(
          tokenizer(text, truncation=True, padding='max_length', max_length=S))
  return VictorDataset(tokenized, targets)

In [None]:
%%time

# Read and tokenize the whole test file up front; the cell is timed because this step is slow
ds_test = load_and_preprocess(test_ds_file)

Loading data from test_small.csv
CPU times: user 1min 5s, sys: 2.72 s, total: 1min 8s
Wall time: 1min 7s


### Evaluation


In [None]:
# `datasets.load_metric` was removed from recent `datasets` releases, so the F1
# scores are computed directly with scikit-learn (already used for the report).
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
  """Compute macro and weighted F1 for a batch of evaluation predictions.

  Args:
    eval_pred: object exposing `label_ids` (reference class ids) and
      `predictions` (per-class scores; the predicted class is the argmax
      over the last axis).

  Returns:
    dict with the keys 'f1_macro' and 'f1_weighted'.
  """
  labels = eval_pred.label_ids
  predictions = eval_pred.predictions.argmax(-1)
  return {
      'f1_macro': float(f1_score(labels, predictions, average='macro')),
      'f1_weighted': float(f1_score(labels, predictions, average='weighted'))
  }


In [None]:
from transformers import Trainer, TrainingArguments

# Only inference runs in this notebook, so besides the directories just the
# evaluation batch size is configured
training_args = TrainingArguments(
    output_dir='./logs',
    logging_dir='./logs',            # logs go to the same folder as the output
    per_device_eval_batch_size=32,   # samples per device in each evaluation step
)

# The Trainer wraps the loaded classifier and applies compute_metrics to its predictions
trainer = Trainer(
    args=training_args,
    compute_metrics=compute_metrics,
    model=model,
)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the classifier over the whole test set
outputs = trainer.predict(test_dataset=ds_test)
# Predicted class = index of the highest score of each sample
predicted_ids = np.argmax(outputs.predictions, axis=1)

# Class names are taken from id2label (in class-index order) instead of being
# repeated by hand, so the report cannot drift from the label mapping
report = classification_report(
    ds_test.labels,
    predicted_ids,
    digits=4,
    target_names=[id2label[i] for i in sorted(id2label)])

print(report)

# Store the report in the model directory together with the run parameters;
# the context manager closes the file even if a write fails
with open(model_path + f'report-{test_ds_file}.txt', 'w', encoding='utf8') as rep_file:
  rep_file.write(f'{model_path}\nS: {S}\ndataset: {test_ds_file}\n')
  rep_file.write(report)

                                  precision    recall  f1-score   support

          acordao_de_2_instancia     0.7307    0.9341    0.8199       273
agravo_em_recurso_extraordinario     0.4810    0.5915    0.5306      1841
     despacho_de_admissibilidade     0.6828    0.6414    0.6615       198
                          outros     0.9718    0.9607    0.9662     85408
                   peticao_do_RE     0.7115    0.7504    0.7305      6331
                        sentenca     0.7161    0.7831    0.7481      1475

                        accuracy                         0.9362     95526
                       macro avg     0.7156    0.7769    0.7428     95526
                    weighted avg     0.9398    0.9362    0.9378     95526

