In [None]:
# data_loading.py

In [None]:
import os

import pandas as pd
from datasets import Dataset
from datasets import DatasetDict

# Integer class id for every annotation label. normalize_dataset() hands this
# nested mapping to DataFrame.replace, so only the "label" column is rewritten.
ENCODING = {
    "label": {
        "Value": 0,
        "Value(+)": 1,
        "Value(-)": 2,
        "Fact": 3,
        "Policy": 4,
    },
}


def load_dataset(data_dir=None):
    """Read the article and ADU CSV files into pandas DataFrames.

    Args:
        data_dir: Directory that contains ``OpArticles.csv`` and
            ``OpArticles_ADUs.csv``. When omitted, the Colab shared-drive
            location ``/content/drive/Shareddrives/PLN/dataset`` is used, which
            is the path this function was originally hard-wired to.

    Returns:
        tuple: ``(df_adu, df_text)``. Note the order: the ADU frame comes
        first, the article frame second.

    Raises:
        FileNotFoundError: If either CSV file is missing (raised by
            ``pandas.read_csv``).
    """
    if data_dir is None:
        data_dir = os.path.join('/content', 'drive', 'Shareddrives', 'PLN', 'dataset')

    # read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
    df_text = pd.read_csv(os.path.join(data_dir, 'OpArticles.csv'))
    df_adu = pd.read_csv(os.path.join(data_dir, 'OpArticles_ADUs.csv'))

    return df_adu, df_text


def normalize_dataset(df):
    """Strip annotation metadata, encode labels and wrap the result as a Dataset.

    The caller's DataFrame is left untouched. The previous implementation
    dropped columns in place, so running it a second time on the same frame
    (e.g. when re-executing a notebook cell) failed with ``KeyError`` because
    the columns were already gone.

    Args:
        df: ADU DataFrame. It must contain the columns ``article_id``,
            ``annotator``, ``node`` and ``ranges``, which are removed. Values
            in the ``label`` column that appear in ``ENCODING["label"]`` are
            replaced by their integer id.

    Returns:
        datasets.Dataset: Built from the cleaned copy via ``Dataset.from_pandas``.

    Raises:
        KeyError: If any of the four columns to drop is missing.
    """
    cleaned = (
        df
        .drop(columns=['article_id', 'annotator', 'node', 'ranges'])
        .replace(ENCODING)
    )

    return Dataset.from_pandas(cleaned)


def split_train_test(df, test_percentage=0.2, validation_percentage=0.5, seed=None):
    """Normalize ``df`` and split it into train / validation / test subsets.

    With the defaults the result is 80% train, 10% validation and 10% test.

    Args:
        df: Raw ADU DataFrame; it is passed through ``normalize_dataset`` first.
        test_percentage: Fraction of all rows held out from training. The
            held-out rows are then shared between validation and test. The
            special value ``1.0`` returns everything as a single ``'test'``
            split.
        validation_percentage: Fraction of the *held-out* rows that become the
            validation split; the remainder becomes the test split. ``0.0``
            yields no ``'validation'`` split and ``1.0`` yields no ``'test'``
            split (both previously crashed inside ``train_test_split``).
        seed: Optional seed forwarded to ``Dataset.train_test_split`` so the
            shuffle is reproducible. ``None`` keeps the original unseeded
            behavior.

    Returns:
        datasets.DatasetDict: Keyed by ``'train'``, ``'validation'`` and
        ``'test'``, minus whichever splits are empty per the rules above.
    """
    dataset = normalize_dataset(df)

    if test_percentage == 1.0:
        return DatasetDict({
            'test': dataset
        })

    train_test = dataset.train_test_split(test_size=test_percentage, seed=seed)
    held_out = train_test['test']

    # A float test_size must lie strictly between 0 and 1, so the two boundary
    # values are handled here instead of being forwarded to train_test_split.
    if validation_percentage == 0.0:
        return DatasetDict({
            'train': train_test['train'],
            'test': held_out
        })
    if validation_percentage == 1.0:
        return DatasetDict({
            'train': train_test['train'],
            'validation': held_out
        })

    # Divide the held-out rows: `validation_percentage` of them go to
    # validation (the 'train' side of this second split), the rest to test.
    valid_test = held_out.train_test_split(test_size=(1.0 - validation_percentage), seed=seed)

    train_valid_test_dataset = DatasetDict({
        'train': train_test['train'],
        'validation': valid_test['train'],
        'test': valid_test['test']
    })

    return train_valid_test_dataset


In [None]:
# evaluate.py

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


def evaluate(y_test, y_pred):
    """Print the confusion matrix and summary classification metrics.

    Accuracy is printed as-is; precision, recall and F1 are macro-averaged.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. All results are written to standard output.
    """
    print(confusion_matrix(y_test, y_pred))

    macro = {'average': 'macro'}
    # (caption, metric function, extra keyword arguments), in print order.
    report = (
        ('Accuracy: ', accuracy_score, {}),
        ('Precision: ', precision_score, macro),
        ('Recall: ', recall_score, macro),
        ('F1: ', f1_score, macro),
    )
    for caption, metric, options in report:
        print(caption, metric(y_test, y_pred, **options))


In [None]:
# main.py

In [None]:
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer


def task_1():
    """Score an un-fine-tuned DistilBERT classifier on the whole ADU corpus.

    Loads the ADU data, places every row in the ``'test'`` split, runs each
    example through ``distilbert-base-uncased`` with a 5-way classification
    head, and prints the metrics via ``evaluate``. No training happens here,
    so the numbers are those of the checkpoint as loaded.

    Returns:
        None. Results are printed by ``evaluate``.
    """
    df_adu, _ = load_dataset()

    # test_percentage=1.0 makes split_train_test return everything as 'test'.
    dataset = split_train_test(df_adu, 1.0, 0.0)

    # The checkpoint is *uncased*: its vocabulary is lowercase only, so the
    # tokenizer must keep its default lowercasing. Passing do_lower_case=False
    # (as before) leaves capitalised words without a matching vocabulary entry.
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    # num_labels=5: one output per class in ENCODING["label"].
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
    # from_pretrained already returns the model in eval mode; kept explicit so
    # dropout can never influence the predictions.
    model.eval()

    y_pred = []
    y_test = []

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for elem in dataset['test']:
            inputs = tokenizer(elem['tokens'], padding=True, truncation=True, return_tensors="pt")
            logits = model(**inputs).logits

            # softmax is monotonic, so argmax over the raw logits picks the
            # same class. .item() stores a plain int rather than a 1-element
            # array, giving the metric functions a flat list of labels.
            y_pred.append(logits.argmax(dim=-1).item())

            y_test.append(elem['label'])

    evaluate(y_test, y_pred)

# Run the Task 1 evaluation as soon as this cell executes.
task_1()

In [None]:
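# Colab only: uncomment and run these two lines BEFORE calling task_1(), because
# load_dataset() reads its CSV files from the mounted '/content/drive' tree.
#from google.colab import drive
#drive.mount('/content/drive')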

In [None]:
# Dependency install commands, kept inside a string literal so they do NOT run.
# On a fresh environment, execute them in their own cell before the imports.
"""

!pip install torch
!pip install transformers
!pip install datasets

"""