In [None]:
# constants.py

In [None]:
# Number of target classes handed to the classifier head (num_labels);
# equals the number of label ids defined in the ENCODING mapping.
NUM_LABELS = 5

In [None]:
# data_loading.py

In [None]:
import os

import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

# Maps each string value of the `label` column to its integer class id.
# Shaped as {column: {old: new}} because it is passed to DataFrame.replace
# in normalize_dataset.
ENCODING = {"label": {"Value": 0, "Value(+)": 1, "Value(-)": 2, "Fact": 3, "Policy": 4}}


def load_dataset(data_dir=os.path.join('/content', 'drive', 'Shareddrives', 'PLN', 'dataset')):
    """Load the ADU annotations and the article texts from their CSV files.

    Args:
        data_dir: Directory that contains ``OpArticles.csv`` and
            ``OpArticles_ADUs.csv``. Defaults to the shared-drive folder
            ``/content/drive/Shareddrives/PLN/dataset``.

    Returns:
        tuple: ``(df_adu, df_text)`` where ``df_adu`` is read from
        ``OpArticles_ADUs.csv`` and ``df_text`` from ``OpArticles.csv``.

    Raises:
        FileNotFoundError: If either CSV file is missing from ``data_dir``.
    """
    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    df_text = pd.read_csv(os.path.join(data_dir, 'OpArticles.csv'))
    df_adu = pd.read_csv(os.path.join(data_dir, 'OpArticles_ADUs.csv'))

    return df_adu, df_text


def normalize_dataset(df):
    """Turn the raw ADU DataFrame into a Hugging Face ``Dataset``.

    Drops the ``article_id``, ``annotator``, ``node`` and ``ranges`` columns
    and replaces the string labels with the integer ids from ``ENCODING``.

    The input frame is not modified: the work is done on a new DataFrame, so
    the function can be called repeatedly on the same ``df`` (an in-place
    drop would raise ``KeyError`` on the second call because the columns
    would already be gone).

    Args:
        df: DataFrame holding the ADU annotations; must contain the four
            columns listed above.

    Returns:
        datasets.Dataset: Dataset built from the normalized frame.

    Raises:
        KeyError: If any of the columns to drop is missing from ``df``.
    """
    normalized = (
        df
        .drop(columns=['article_id', 'annotator', 'node', 'ranges'])
        .replace(ENCODING)
    )

    return Dataset.from_pandas(normalized)


def split_train_test(df, test_percentage=0.2, validation_percentage=0.5, seed=None):
    """Normalize ``df`` and split it into train / validation / test sets.

    Args:
        df: Raw ADU DataFrame, forwarded to ``normalize_dataset``.
        test_percentage: Fraction of the data held out from training. The
            held-out part is then divided between validation and test.
            When equal to ``1.0`` the whole dataset is returned as ``test``.
        validation_percentage: Fraction of the held-out part that becomes
            the validation set; the remainder becomes the test set.
        seed: Optional seed forwarded to both ``train_test_split`` calls so
            the splits can be reproduced. ``None`` (the default) keeps the
            unseeded behaviour.

    Returns:
        DatasetDict: With keys ``train``, ``validation`` and ``test``, or
        only ``test`` when ``test_percentage == 1.0``.
    """
    dataset = normalize_dataset(df)

    if test_percentage == 1.0:
        return DatasetDict({
            'test': dataset
        })

    train_test = dataset.train_test_split(test_size=test_percentage, seed=seed)

    # Divide the held-out portion (20% by default) between validation and
    # test; with the default validation_percentage=0.5 each gets half of it.
    valid_test = train_test['test'].train_test_split(
        test_size=(1.0 - validation_percentage), seed=seed)

    train_valid_test_dataset = DatasetDict({
        'train': train_test['train'],
        'validation': valid_test['train'],
        'test': valid_test['test']
    })

    return train_valid_test_dataset


In [None]:
# evaluate.py

In [None]:
import numpy as np
from datasets import load_metric
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Intended as the ``compute_metrics`` callback of a ``Trainer``.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. The
            predicted class is the argmax of ``logits`` over the last axis;
            ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": <float>}`` — the fraction of predictions that
        equal the reference labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy is computed locally instead of through datasets.load_metric:
    # that call re-loaded the metric on every evaluation pass and is a
    # deprecated API, while the value itself is a one-line mean.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}


def evaluate(y_test, y_pred):
    """Print a classification report for the given predictions.

    Prints, in order: the confusion matrix, the accuracy, and the
    macro-averaged precision, recall and F1 score.

    Args:
        y_test: Reference labels.
        y_pred: Predicted labels.
    """
    print(confusion_matrix(y_test, y_pred))
    print('Accuracy: ', accuracy_score(y_test, y_pred))

    # The remaining scores share the same call shape (macro averaging).
    macro_scorers = (
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F1: ', f1_score),
    )
    for caption, scorer in macro_scorers:
        print(caption, scorer(y_test, y_pred, average='macro'))


In [None]:
# main.py

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, Trainer
from transformers import AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding, Trainer
from transformers import AutoTokenizer


# Pretrained checkpoint name, shared by the tokenizer below and by the
# sequence-classification model instantiated in task_1.
model_name = 'neuralmind/bert-base-portuguese-cased'
# Lowercasing is disabled in the tokenizer (the checkpoint name marks it as "cased").
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)


def task_1():
    """Fine-tune the pretrained model to classify ADU labels and report results.

    Steps:
        1. Load the ADU annotations and split them into train / validation /
           test sets.
        2. Tokenize every split with ``preprocess_function``.
        3. Fine-tune ``model_name`` for sequence classification with
           ``NUM_LABELS`` classes, validating and checkpointing each epoch.
        4. Print the validation metrics and a test-set report (confusion
           matrix, accuracy, macro precision / recall / F1) via ``evaluate``.

    Returns:
        The object returned by ``trainer.predict`` for the test split
        (exposes ``predictions``, ``label_ids`` and ``metrics``).
    """
    df_adu, _ = load_dataset()

    dataset = split_train_test(df_adu)

    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",  # run validation at the end of each epoch
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Keep and show the results instead of discarding them: without this the
    # run finishes without reporting any validation or test metrics.
    validation_metrics = trainer.evaluate()
    print('Validation metrics: ', validation_metrics)

    test_output = trainer.predict(test_dataset=tokenized_dataset["test"])
    # predictions holds the per-class scores; argmax over the last axis gives
    # the predicted class id, as in compute_metrics.
    y_pred = test_output.predictions.argmax(axis=-1)
    evaluate(test_output.label_ids, y_pred)

    return test_output


def preprocess_function(sample):
    """Tokenize the ``tokens`` field of a sample (or batch of samples).

    Args:
        sample: Mapping with a ``"tokens"`` entry holding the text to encode.

    Returns:
        The output of the module-level ``tokenizer`` called with truncation
        enabled.
    """
    text = sample["tokens"]
    encoded = tokenizer(text, truncation=True)
    return encoded


# Entry point: run the Task 1 pipeline only when this code is executed as the
# main module (not when it is imported).
if __name__ == '__main__':
    task_1()


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:

"""
!pip install torch
!pip install transformers
!pip install datasets

"""