# File for training XML-RoBERTa on factchecking data

Installing libraries

In [1]:
!pip install datasets
!pip install accelerate==0.28.0
!pip install wandb

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-

In [2]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    XLMRobertaConfig,
)
import wandb
import os
from sklearn.utils import shuffle
from typing import Any

# Creating a new folder to save data to after each run

In [3]:
# dataset_path = "./drive/MyDrive/data"
# save_folder = "./drive/MyDrive/results"

dataset_path = "../data/processed"
save_folder = "../results"

folders = os.listdir(save_folder)
run_numbers = [int(folder[3:]) for folder in folders if folder.startswith("run")]
run_id = max(run_numbers, default=0) + 1

save_path = f"{save_folder}/run{run_id}"

# Loading datasets

In [5]:
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

def load_dataset(path: str) -> Dataset:
    df = pd.read_csv(path, sep="\t")
    df = df.copy()
    df["label"] = df["class_label"].apply(lambda x: 1 if x == "Yes" else 0)
    df = df.drop(columns=["class_label", "sentence_id"])

    if "train" in path:
    # Split the dataset into an ~equal distribution of sentenecs that are factcheckworthy and not factcheckworthy
        df_yes = df[df["label"] == 1]
        df_no = df[df["label"] == 0]

        min_size = min(len(df_yes), len(df_no))

        df_yes_balanced = df_yes.sample(n=min_size, random_state=99)
        df_no_balanced = df_no.sample(n=min_size, random_state=99)

        df_balanced = pd.concat([df_yes_balanced, df_no_balanced])
        df_balanced = shuffle(df_balanced, random_state=99)

        # Temporarily only pick 5000 rows to get a rough estimate on how the model trains
        df_balanced = df_balanced[:5000] if len(df_balanced) > 5000 else df_balanced

        dataset = Dataset.from_pandas(df_balanced)
    else:
        dataset = Dataset.from_pandas(df)
    return dataset

def tokenize_function(examples):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=64
    )


# Loading and tokenizing datasets into batches
train_dataset = load_dataset(f"{dataset_path}/preprocessed_train.tsv")
test_dataset = load_dataset(f"{dataset_path}/processed_dev.tsv")
dev_test_dataset = load_dataset(f"{dataset_path}/processed_dev_test.tsv")

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
tokenized_dev_test_dataset = dev_test_dataset.map(tokenize_function, batched=True)


tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1032 [00:00<?, ? examples/s]

Map:   0%|          | 0/318 [00:00<?, ? examples/s]

Initialize Weights and Biases

In [6]:
wandb.init(project="dat550_project")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [7]:
# Test to check the distribution of the ones and zero labels (yes/no labels)
ones = []
zeroes = []
for label in tokenized_train_dataset["label"]:
  if label == 1:
    ones.append(1)
  else:
    zeroes.append(0)

print("Number of factcheckworhy sentences: ", len(ones))
print("Number of NOT factcheckworthy sentences: ", len(zeroes))

tokenized_train_dataset

2494
2506


Dataset({
    features: ['text', 'sentiment', 'subjectivity', 'word_count', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 5000
})

Function that will be called, computed, and results used to improve the model

In [8]:
def compute_metrics(p: EvalPrediction) -> dict[str, Any] :
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    print(f"Preds: {preds}")
    print(f"Num ones: ", len([p for p in preds if p == 1]))
    print(f"Num zeroes: ", len([p for p in preds if p == 0]))

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

Empty the cuda cache

In [9]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [10]:
tokenized_train_dataset

Dataset({
    features: ['text', 'sentiment', 'subjectivity', 'word_count', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 5000
})

In [11]:
torch.cuda.empty_cache()

config = XLMRobertaConfig.from_pretrained("xlm-roberta-large")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-large", config=config
)

# optimizer = torch.optim.Adam(model.parameters(), lr=5e-10)

training_args = TrainingArguments(
    output_dir=save_path,  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size for evaluation
    warmup_steps=10,  # number of warmup steps for learning rate scheduler
    weight_decay=0.05,  # strength of weight decay
    logging_steps=500,  # how many batches to run before saving a backup of the run
    evaluation_strategy="epoch",  # when to run the model evaluation (check what the model has learned agains the data it has trained on)
    report_to="wandb",  # where to upload the data
    use_cpu=False if torch.cuda.is_available() else True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
#    optimizers=(optimizer, None),
)

trainer.train()

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

wandb.finish()

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.696025,0.23062,0.374803,0.23062,1.0
2,0.705300,0.684273,0.76938,0.0,0.0,0.0
3,0.705300,0.684081,0.76938,0.0,0.0,0.0


Preds: [1 1 1 ... 1 1 1]
Num ones:  1032
Num zeroes:  0
Preds: [0 0 0 ... 0 0 0]
Num ones:  0
Num zeroes:  1032


  _warn_prf(average, modifier, msg_start, len(result))


Preds: [0 0 0 ... 0 0 0]
Num ones:  0
Num zeroes:  1032


  _warn_prf(average, modifier, msg_start, len(result))


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁██
eval/f1,█▁▁
eval/loss,█▁▁
eval/precision,█▁▁
eval/recall,█▁▁
eval/runtime,█▃▁
eval/samples_per_second,▁▅█
eval/steps_per_second,▁▅█
train/epoch,▁▃▅██
train/global_step,▁▃▅██

0,1
eval/accuracy,0.76938
eval/f1,0.0
eval/loss,0.68408
eval/precision,0.0
eval/recall,0.0
eval/runtime,4.448
eval/samples_per_second,232.013
eval/steps_per_second,29.002
train/epoch,3.0
train/global_step,939.0


Load the trained model and see how it performs agains the dev_test dataset.

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(save_path)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
)

test_result = trainer.evaluate(tokenized_dev_test_dataset)
print(test_result)