# Training a Statement-Tuned Encoder Model

## Setup

Import the necessary libraries

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast,RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
import evaluate
import wandb
import os

Setting up some global variables

In [None]:
# Global configuration for the run.
EXPERIMENT_NAME = "roberta-base"  # names the W&B project and the output/log directories
# Cache directory handed to `load_dataset`. The cluster-specific path stays the
# default; set HF_DATASETS_CACHE to run the notebook on a different machine.
CACHE_DIR = os.environ.get("HF_DATASETS_CACHE", "/scratch/afz225/.cache")
MODEL_SAVE_PATH = "./STTS_roberta-base"  # target directory for `trainer.save_model`

Initialize the Weights and Biases Project to track the training

In [None]:
# The W&B project is named after the experiment. The same name is exported through
# the environment variable and passed explicitly when the run is created.
wandb_project = f"{EXPERIMENT_NAME}_train"
os.environ["WANDB_PROJECT"] = wandb_project
wandb.login()
wandb.init(project=wandb_project)

Setting up the tokenizer

In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
TRANSFORMER = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(TRANSFORMER)

Preparing the data for training. Rows without an `is_true` label are dropped. The `tolerance` variable controls how far beyond the maximum context length (`MAX_LEN`, 512 tokens) a statement may run before it is discarded: statements shorter than `MAX_LEN + tolerance` tokens are kept and later truncated to at most `MAX_LEN` tokens, while longer statements are removed.

In [None]:
tolerance = 20
# Maximum sequence length, read from the tokenizer instead of being hard-coded.
# The tokenized length already counts the special tokens, so no extra allowance
# is added on top of this value.
# NOTE(review): expected to be 512 for roberta-base; confirm for other checkpoints.
max_len = tokenizer.model_max_length

data = load_dataset('ashabrawy/STTS', cache_dir=CACHE_DIR)


def _is_usable(example):
    """Return True for labelled statements shorter than `max_len + tolerance` tokens."""
    # Unlabelled rows are rejected first so they are never tokenized.
    if example["is_true"] is None:
        return False
    n_tokens = len(tokenizer(example['statement'])['input_ids'])
    return n_tokens < max_len + tolerance


# Single pass over the training split: drop unlabelled rows and over-long statements.
train = data['train'].filter(_is_usable)

First we take a random subsample of 50,000 examples. The following cell then splits that subsample into training and validation sets so that overfitting can be detected during training.

In [None]:
# Draw a random subsample of 50,000 rows by keeping only the 'test' side of the split.
# A fixed seed makes the selected rows the same on every run.
# Note: this rebinds `train`, so re-running this cell on its own operates on the
# already subsampled data; re-run the filtering cell first.
train = train.train_test_split(test_size=50000, seed=42)['test']

In [None]:
# Hold out 10% of the subsample for validation. `random_state` pins the shuffle so
# the same statements end up in the validation set on every run.
train_statements, val_statements, train_labels, val_labels = train_test_split(
    train['statement'],
    train['is_true'],
    test_size=.1,
    random_state=42,
)

Defining a PyTorch dataset to process the data to a point where it can be used directly by the HuggingFace Trainer.

In [None]:
class StatementDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one statement per item.

    Each item is a dict of tensors built from the tokenizer output plus an
    integer ``labels`` entry.

    Args:
        statements: Indexable sequence of statements to tokenize.
        labels: Indexable sequence of labels parallel to ``statements``; each
            label is converted with ``int()`` when its item is fetched.
        tokenizer: Optional callable used to encode a statement. When omitted,
            the notebook-level ``tokenizer`` is looked up at item-access time,
            which is what the original class always did.

    Raises:
        ValueError: If ``statements`` and ``labels`` differ in length.
    """

    def __init__(self, statements, labels, tokenizer=None):
        if len(statements) != len(labels):
            raise ValueError(
                f"statements and labels must have the same length, "
                f"got {len(statements)} and {len(labels)}"
            )
        self.statements = statements
        self.labels = labels
        self._tokenizer = tokenizer

    def __getitem__(self, idx):
        """Tokenize the statement at ``idx`` and return it with its label."""
        # `tokenizer` is not a local name in this method, so the fallback resolves
        # to the notebook-level tokenizer.
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        # No `padding` argument here: padding a single sequence to the longest in
        # its "batch" of one is a no-op. Batch padding is left to the data collator.
        encodings = encode(self.statements[idx], truncation=True)
        item = {key: torch.tensor(val) for key, val in encodings.items()}
        item['labels'] = int(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [None]:
# Wrap each split in a StatementDataset so the Trainer can index into it.
train_dataset = StatementDataset(statements=train_statements, labels=train_labels)
val_dataset = StatementDataset(statements=val_statements, labels=val_labels)

Classification Metrics

In [None]:
# Bundle the four classification metrics so one `compute` call reports all of them.
metric_names = ["accuracy", "f1", "precision", "recall"]
clf_metrics = evaluate.combine(metric_names)

The `compute_metrics` function is used by the Trainer during evaluation to calculate the classification metrics.

In [None]:
def compute_metrics(eval_pred):
    """Turn the model's evaluation outputs into classification metrics.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is assumed
            to hold one score per class along axis 1 (as in the original code,
            which reduced over ``dim=1``); ``labels`` holds the reference classes.

    Returns:
        The result of ``clf_metrics.compute`` for the predicted classes.
    """
    scores, labels = eval_pred
    # NOTE(review): the Trainer documents predictions as an array or a tuple of
    # arrays; when a tuple arrives, the first element is taken to be the logits.
    if isinstance(scores, tuple):
        scores = scores[0]
    # The predicted class is the index of the highest score in each row.
    predicted_classes = torch.as_tensor(scores).argmax(dim=1)
    return clf_metrics.compute(predictions=predicted_classes, references=labels)


Data collator used to pad all sequences in a batch to the same length.

In [None]:
# Collator that pads the tokenized items when they are assembled into a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Training

Initializing trainer and arguments

In [None]:
training_args = TrainingArguments(
    # --- where artefacts and logs go ---
    output_dir=f'./{EXPERIMENT_NAME}-outputs',
    logging_dir=f'./{EXPERIMENT_NAME}-logs',
    report_to="wandb",
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=1e-06,
    warmup_ratio=0.1,                 # fraction of the training steps spent warming up the learning rate
    weight_decay=0.01,
    # --- batch sizes (per device) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # --- logging / evaluation / checkpoint cadence, in optimisation steps ---
    logging_steps=1000,
    save_steps=1000,
    evaluation_strategy='steps',
    save_total_limit=2,               # cap on the number of checkpoints kept on disk
    # --- model selection ---
    load_best_model_at_end=True,      # reload the best checkpoint once training finishes
    metric_for_best_model='f1',       # "best" is judged by the F1 score from compute_metrics
)

# Pretrained encoder with a sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained(TRANSFORMER)

trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    data_collator=data_collator,      # pads each batch
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    compute_metrics=compute_metrics,  # classification metrics reported at each evaluation
)

Begin training/statement-tuning

In [None]:
# Run the training loop with the trainer configured above.
trainer.train()

Save model locally

In [None]:
# Write the trainer's model to MODEL_SAVE_PATH.
trainer.save_model(MODEL_SAVE_PATH)

You can also push it to the Hugging Face Hub.

In [None]:
# `Trainer.push_to_hub` takes a commit message as its first positional argument,
# not a repository id, so passing the repo name there would only set the commit
# message. The model's own `push_to_hub` accepts the target repo id directly.
trainer.model.push_to_hub("ashabrawy/ST-trial-model")