# File for training XLM-RoBERTa on fact-checking data

Importing libraries

In [31]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
)
import wandb
import os
from sklearn.utils import shuffle, resample
from typing import Any

# Creating a new folder to save results to for each run

In [32]:
def get_paths(is_gdrive: bool = False) -> tuple[str, str]:
    """Get paths for training data and storing results.

    Every call creates a new ``run<N>`` sub-folder inside the results folder,
    where ``N`` is one more than the highest existing run number (or 1 when
    there are no earlier runs).

    Args:
        is_gdrive: Whether to use Google Drive path for storing results and
            retrieving data.

    Returns:
        dataset_path: Path to training data.
        save_path: Path to store results (the newly created run folder).

    Raises:
        FileExistsError: If the run folder to be created already exists.
    """
    if is_gdrive:
        dataset_path = "./drive/MyDrive/data"
        save_folder = "./drive/MyDrive/results"
    else:
        dataset_path = "/home/emrds/repos/Multilingual-Check-worthiness-Estimation-in-Text/data/processed/"
        save_folder = "/home/emrds/repos/Multilingual-Check-worthiness-Estimation-in-Text/results"

    # The results folder may not exist yet on a first run; listing it would
    # otherwise fail before any run folder could be created.
    os.makedirs(save_folder, exist_ok=True)

    # Only entries named exactly "run<digits>" are counted. Other names that
    # merely start with "run" (e.g. "run_final") are skipped instead of
    # crashing the integer conversion.
    run_numbers = [
        int(name[3:])
        for name in os.listdir(save_folder)
        if name.startswith("run") and name[3:].isdecimal()
    ]
    run_id = max(run_numbers, default=0) + 1

    save_path = f"{save_folder}/run{run_id}"

    # Deliberately no exist_ok here: an existing run folder must never be reused.
    os.makedirs(save_path)
    return dataset_path, save_path

In [38]:
def load_dataset(path: str) -> Dataset:
    """Read a tab-separated file and wrap its text/label columns in a Dataset.

    Args:
        path: Location of the tab-separated file to read.

    Returns:
        A Dataset built from the ``text`` and ``label`` columns of the file.
    """
    frame = pd.read_csv(path, sep="\t")
    # Keep only the two columns used downstream, in this fixed order.
    selected = frame[["text", "label"]].copy()
    return Dataset.from_pandas(selected)

In [34]:
# Load the pretrained tokenizer for the "xlm-roberta-large" checkpoint.
# It is kept as a module-level global because tokenize_function reads it.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

In [35]:
def tokenize_function(examples):
    """Tokenize the ``text`` entry of ``examples`` with the global tokenizer.

    Args:
        examples: Mapping whose ``"text"`` entry holds the text(s) to encode.

    Returns:
        The tokenizer's output for those texts, requested with
        ``padding="max_length"``, truncation enabled and ``max_length=512``.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **encode_options)

In [39]:
# Resolve the local data directory and create a fresh results folder for this run.
dataset_path, save_path = get_paths(is_gdrive=False)

# Load the train, test and dev-test splits, in that order.
train_dataset, test_dataset, dev_test_dataset = map(
    load_dataset,
    (
        f"{dataset_path}/merged_train_oversampled.tsv",
        f"{dataset_path}/merged_test_oversampled.tsv",
        f"{dataset_path}/merged_dev_test.tsv",
    ),
)

# Tokenize the train and test splits in batches; the dev-test split is loaded
# above but its tokenization is currently disabled.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)
# tokenized_dev_test_dataset = dev_test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50763 [00:00<?, ? examples/s]

Map:   0%|          | 0/7377 [00:00<?, ? examples/s]

In [44]:
# Display one training example (index 10) to sanity-check the loaded columns.
train_dataset[10]

{'text': "And recently the Federal Trade Commission pointed out that some of these entertainment companies have warned parents that the material is inappropriate for children, and then they've turned around behind the backs of the parents and advertised that same adult material directly to children.",
 'label': 1}