In [1]:
pip install transformers datasets torch accelerate evaluate




In [2]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from transformers import BartForSequenceClassification, BartTokenizer, Trainer, TrainingArguments
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load FEVER v2.0 dataset (only "validation" is available)
dataset = load_dataset("fever/fever", "v2.0", split="validation", trust_remote_code=True)

In [4]:
# Corrected Label Mapping (matches dataset's labels exactly)
label_map = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "Not Enough Info": 2  # Adjusted to match dataset's label format
}

In [5]:
# Remove invalid labels before tokenization
dataset = dataset.filter(lambda x: x["label"] in label_map)

dataset = dataset.filter(lambda x: x["label"] != -1)

# Use only a subset of training data (e.g., 50K samples)
small_train = dataset.shuffle(seed=42).select(range(min(200, len(dataset))))

In [6]:
# Load tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-mnli")
# Load model
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels=3)

In [7]:
# Preprocessing function with additional error handling for labels
def preprocess_function(examples):
    inputs = tokenizer(examples["claim"], padding="max_length", truncation=True, max_length=512)
    inputs["labels"] = [label_map.get(label, -1) for label in examples["label"]]  # Defaulting to -1 for unexpected labels
    return inputs

In [8]:
# Tokenize and remove original columns
tokenized_datasets = small_train.map(preprocess_function, batched=True, remove_columns=small_train.column_names)

Map: 100%|██████████| 200/200 [00:00<00:00, 1182.68 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir="./bart_fever",
    evaluation_strategy="steps",  # Evaluate frequently
    eval_steps=20,  # Adjust based on dataset size
    save_strategy="steps",
    save_steps=20,  # Save periodically
    per_device_train_batch_size=16,  # Increase if GPU allows
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Reduce epochs to save time
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=3,  # More frequent logging
    save_total_limit=1,  # Keep only the latest checkpoint
    fp16=True,  # Enable mixed precision training (reduces memory + speeds up training)
    gradient_accumulation_steps=2,  # Helps with memory if batch size is large
    dataloader_num_workers=4,  # Optimize data loading
    warmup_steps=10,  # Prevent early instability
    learning_rate=3e-5,  # Slightly higher LR for better efficiency
    lr_scheduler_type="linear",  # Simple LR scheduling
)




In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,  # Using "validation" split for training
    eval_dataset=tokenized_datasets,  # Add this line for evaluation
    tokenizer=tokenizer
)

  trainer = Trainer(


In [11]:
# Train and save
trainer.train()
model.save_pretrained("./fine_tuned_bart_mnli_fever")
tokenizer.save_pretrained("./fine_tuned_bart_mnli_fever")

  0%|          | 0/6 [00:00<?, ?it/s]

KeyboardInterrupt: 