Imports

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.metrics import classification_report
import os
import pickle

NUMBER_OF_LABELS = 30

tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased", max_length=512, truncation=True)
bert_model = AutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

  from .autonotebook import tqdm as notebook_tqdm
2025-01-17 08:05:46.739494: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-17 08:05:46.819596: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 08:05:46.889144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737111946.942929    1434 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737111946.958724    1434 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory 

Read macmorpho and generate arrays of arrays of sentences and labels

In [2]:
def get_labels(labels_path="input/macmorpho-labels"):
    lines = []

    with open(labels_path, "r", encoding="utf-8") as file:
        for line in file:
            lines.append(line.strip())

    return lines

def load_macmorpho(file_path, tokenizer, label_list):
    sentences = []
    labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        current_sentence = []
        current_label = []

        for line in f:
            line = line.strip()
            tokens = line.split(" ")

            for token in tokens:
                word, label = token.rsplit("_", 1)
                subwords = tokenizer.tokenize(word)  # Subword tokenization
                subword_count = len(subwords)

                current_sentence.extend(subwords)
                current_label.extend([label_list.index(label)] + [-1] * (subword_count - 1))  # -1 marks subwords that should not contribute to loss

            sentences.append(current_sentence)
            labels.append(current_label)
            current_sentence = []
            current_label = []

    return sentences, labels

From the sentences, generate all embeddings

In [3]:
def get_sentence_embeddings(sentences, labels):
    sentence_embeddings = []
    expanded_labels = []

    print(f"Total embeddings: {len(sentences)}")
    i = 0
    for sentence, label in zip(sentences, labels):
        if (i % (len(sentences)/40) == 0):
            print(f"Computed {i} embeddings so far...")
        i += 1

        input_ids = tokenizer(sentence, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)

        with torch.no_grad():
            outs = bert_model(**input_ids)
            token_embeddings = outs.last_hidden_state.squeeze(0)[1:-1]  # Remove [CLS] and [SEP]

        # Align labels with tokens
        word_ids = input_ids.word_ids() 
        expanded_label = []
        for word_id in word_ids:
            if word_id is None:
                continue
            elif expanded_label and word_id == expanded_label[-1]:
                expanded_label.append(-1)
            else:
                expanded_label.append(label[word_id])

        # Ensure embeddings and labels match
        if len(token_embeddings) != len(expanded_label):
            raise ValueError(f"Mismatch between tokens and labels: {len(token_embeddings)} vs {len(expanded_label)}")

        sentence_embeddings.append(token_embeddings)
        expanded_labels.extend(expanded_label)

    return torch.cat(sentence_embeddings).numpy(), np.array(expanded_labels)

In [4]:
print("Reading training file...")
training_sentence_tokens, training_labels = load_macmorpho("input/macmorpho-train.txt", tokenizer, get_labels())

if os.path.exists("output/training_embeddings.pkl"):
    print(f"Loading embeddings...")
    with open("output/training_embeddings.pkl", "rb") as f:
        data = pickle.load(f)
    X_train, y_train = data["embeddings"], data["labels"]
else:
    print(f"Computing embeddings...")
    X_train, y_train = get_sentence_embeddings(training_sentence_tokens, training_labels)

    with open("output/training_embeddings.pkl", "wb") as f:
        pickle.dump({"embeddings": X_train, "labels": y_train}, f)

Reading training file...
Loading embeddings...


In [5]:
model = Sequential([
    Dense(NUMBER_OF_LABELS, input_dim=768, activation='softmax')  # 768 is the size of the BERTimbau embedding
])

model.compile(optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

valid_indices = y_train != -1
X_train = X_train[valid_indices]
y_train = y_train[valid_indices]

print("Starting training")
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
W0000 00:00:1737111960.529875    1434 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Starting training
Epoch 1/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 566us/step - accuracy: 0.9289 - loss: 0.2741
Epoch 2/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 563us/step - accuracy: 0.9608 - loss: 0.1376
Epoch 3/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 561us/step - accuracy: 0.9623 - loss: 0.1316
Epoch 4/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 566us/step - accuracy: 0.9625 - loss: 0.1297
Epoch 5/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 579us/step - accuracy: 0.9628 - loss: 0.1294
Epoch 6/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 565us/step - accuracy: 0.9630 - loss: 0.1278
Epoch 7/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 565us/step - accuracy: 0.9634 - loss: 0.1275
Epoch 8/10
[1m22168/22168[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 564us/step 

<keras.src.callbacks.history.History at 0x7f74daccaf80>

In [6]:
print("Reading test file...")
testing_sentence_tokens, testing_labels = load_macmorpho("input/macmorpho-test.txt", tokenizer, get_labels())

if os.path.exists("output/testing_embeddings.pkl"):
    print(f"Loading embeddings...")
    with open("output/testing_embeddings.pkl", "rb") as f:
        data = pickle.load(f)
    X_test, y_test = data["embeddings"], data["labels"]
else:
    print(f"Computing embeddings...")
    X_test, y_test = get_sentence_embeddings(testing_sentence_tokens, testing_labels)

    with open("output/testing_embeddings.pkl", "wb") as f:
        pickle.dump({"embeddings": X_test, "labels": y_test}, f)

print("Testing...")
prediction = model.predict(X_test)
predicted_classes = np.argmax(prediction, axis=1)

Reading test file...
Loading embeddings...
Testing...
[1m9264/9264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 539us/step


zero division = 0 pois retiramos -1

In [9]:
print(classification_report(y_test, predicted_classes, zero_division=0))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00    122722
           0       0.29      0.85      0.43      8335
           1       0.77      0.83      0.80      5256
           2       0.77      0.79      0.78       227
           4       0.92      0.99      0.96     12328
           5       0.87      0.97      0.92      4430
           6       0.87      0.89      0.88      2487
           7       0.27      0.58      0.36        98
           8       0.41      0.96      0.57     35414
           9       0.58      0.94      0.72     15535
          10       0.70      0.93      0.79      2481
          11       0.71      0.83      0.76      3541
          12       0.77      0.86      0.81      1081
          13       0.91      0.98      0.94     16369
          14       0.73      0.94      0.82      3328
          17       0.80      0.80      0.80      1540
          18       0.74      0.97      0.84     19084
          20       0.75    