In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

from sklearn.metrics import f1_score
import torch

In [None]:
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset

In [117]:
# Turn off the Hugging Face `tokenizers` backend's own parallelism before any
# tokenizer is created; this avoids its fork-related warning when worker
# processes are spawned later on.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Split Train and Validation

In [118]:
# Texts with fewer characters than this are treated as outliers by clean_df.
TRAIN_TEXT_MIN_LEN = 3


def clean_df(df: pd.DataFrame, iqr_factor: float = 1.5, max_outlier_pct: float = 5.0):
    """Drop rows whose ``text`` length is an outlier, provided outliers are rare.

    A row is an outlier when its text is shorter than ``TRAIN_TEXT_MIN_LEN``
    characters or longer than the upper IQR fence ``Q3 + iqr_factor * IQR``
    computed over the character lengths of ``df["text"]``.

    Args:
        df: Frame with a string column named ``text``.
        iqr_factor: Multiplier applied to the inter-quartile range when
            computing the upper length fence.
        max_outlier_pct: Outliers are removed only if they make up at most
            this percentage of the rows; otherwise ``df`` is returned as is.

    Returns:
        ``df`` without the outlier rows, or ``df`` unchanged when it is empty
        or when the outlier share exceeds ``max_outlier_pct``.
    """
    if df.empty:
        # Nothing to measure, and the percentage below would divide by zero.
        return df

    sentence_len = df["text"].str.len()
    q1 = sentence_len.quantile(0.25)
    q3 = sentence_len.quantile(0.75)
    iqr = q3 - q1
    # Upper fence is anchored at the third quartile. Anchoring it at the
    # literal 3 (as before) flags almost every row, so the percentage guard
    # below would never allow any cleaning.
    train_text_max_len = q3 + iqr_factor * iqr

    # Rows with a missing text compare as False on both sides and are kept.
    is_outlier = (sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len > train_text_max_len)
    outlier_percentage = is_outlier.sum() / len(df) * 100
    if outlier_percentage <= max_outlier_pct:
        df = df[~is_outlier]

    return df
    
def x_y_split(df: pd.DataFrame):
    """Separate the two label columns from the remaining feature columns.

    Args:
        df: Frame containing the columns ``PCL_category`` and ``is_PCL``.

    Returns:
        A tuple ``(X, y_binary, y_categorical)`` where ``X`` is ``df`` without
        the two label columns, ``y_binary`` is ``df["is_PCL"]`` and
        ``y_categorical`` is ``df["PCL_category"]``. ``df`` is not modified.
    """
    targets = {name: df[name] for name in ("PCL_category", "is_PCL")}
    features = df.drop(columns=list(targets))
    return features, targets["is_PCL"], targets["PCL_category"]


def data_read_split(
    data_path: str,
    test_size: float = 0.2,
    random_state: int = 42,
    skiprows: int = 9,
):
    """Read the tab-separated PCL file and split it into train and test frames.

    The file is read without a header row; the first column becomes the index
    (``index_col=0``) and the columns are named ``article_id``, ``keyword``,
    ``country``, ``text`` and ``PCL_category``. A boolean column ``is_PCL`` is
    added that is True where ``PCL_category >= 2``.

    Note: ``clean_df`` is not applied to the training split here.

    Args:
        data_path: Path to the TSV file.
        test_size: Fraction of rows placed in the test split.
        random_state: Seed for the shuffled split, for reproducibility.
        skiprows: Number of leading lines of the file to skip before parsing.

    Returns:
        A tuple ``(df_train, df_test)`` of DataFrames.
    """
    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=skiprows,
        engine="python",
        index_col=0,
        header=None,
        names=["article_id", "keyword", "country", "text", "PCL_category"],
    )
    df["is_PCL"] = df["PCL_category"] >= 2

    df_train, df_test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    return df_train, df_test

def data_preprocess(data_path: str):
    """Load the data, split it, and separate features from labels per split.

    Args:
        data_path: Path handed to ``data_read_split``.

    Returns:
        ``(X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c)`` where
        the ``_b`` values are the ``is_PCL`` labels and the ``_c`` values are
        the ``PCL_category`` labels returned by ``x_y_split``.
    """
    train_part, test_part = data_read_split(data_path)
    train_xy = x_y_split(train_part)
    test_xy = x_y_split(test_part)
    (X_train, y_train_b, y_train_c), (X_test, y_test_b, y_test_c) = train_xy, test_xy
    return X_train, X_test, y_train_b, y_test_b, y_train_c, y_test_c

def t5_filtering(df: pd.DataFrame):
    """Return a frame holding only the ``text`` and ``is_PCL`` columns, in that order."""
    kept_columns = ["text", "is_PCL"]
    return df[kept_columns]


In [119]:
data_path = "dontpatronizeme_pcl.tsv"
# Read and split once, then trim each split down to the text and binary label.
df_train, df_test = map(t5_filtering, data_read_split(data_path))

In [None]:
# `is_PCL` is a boolean column. Left as bool, the labels reach the model as
# torch.bool tensors, and the Hugging Face sequence-classification head only
# uses cross-entropy for integer labels (otherwise it falls back to a
# multi-label BCE loss that fails on the shape mismatch). Cast to int64 so the
# two-class cross-entropy path is taken.
train_dataset = Dataset.from_pandas(
    df_train.astype({"is_PCL": "int64"}), preserve_index=False
)
test_dataset = Dataset.from_pandas(
    df_test.astype({"is_PCL": "int64"}), preserve_index=False
)

model_name = "roberta-base"
num_labels = 2  # binary task: the `is_PCL` label is either 0 or 1

tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: Mapping with a ``text`` entry, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize each split in batches, then expose the target column as `labels`.
train_dataset = train_dataset.map(tokenize_function, batched=True).rename_column(
    "is_PCL", "labels"
)
test_dataset = test_dataset.map(tokenize_function, batched=True).rename_column(
    "is_PCL", "labels"
)

# Return torch tensors, restricted to these three columns, for both splits.
for split in (train_dataset, test_dataset):
    split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

# Load the pretrained encoder with a `num_labels`-way classification head and
# move it onto the selected device.
model = RobertaForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)
model.to(device)

def compute_metrics(eval_pred):
    """Compute accuracy and F1 from a ``(logits, labels)`` pair.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy`` and ``f1``; predictions are the argmax
        of the logits over the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["f1"] = f1_score(gold, predicted)
    return metrics

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, and reload the best checkpoint
    # when training finishes.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation hyperparameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# The held-out split built above doubles as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Final evaluation on `test_dataset` with the model held by the trainer.
results = trainer.evaluate()
print(results)