# Imports

In [19]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import evaluate

# download/load the dataset

In [2]:
dataset = load_dataset("ag_news")
print(dataset)
print(dataset["train"][0])


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", 'label': 2}


# tokenize!

In [3]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_data(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_data, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 120000/120000 [00:11<00:00, 10239.50 examples/s]
Map: 100%|██████████| 7600/7600 [00:00<00:00, 10845.46 examples/s]


# Load the pretrained model; define the trainer; train the model!

In [23]:
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from accelerate import Accelerator
accelerator = Accelerator()
print(f"Accelerator device: {accelerator.device}")


Accelerator device: mps


In [6]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


True
tensor([[0.7853, 1.0079, 1.5081],
        [1.1613, 1.0873, 0.5867],
        [0.7981, 0.5697, 0.8008]], device='mps:0')


## approach 1

In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    bf16=True,  
    save_total_limit=2,  
    logging_steps=10,
    load_best_model_at_end=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(1000)),  
    eval_dataset=tokenized_dataset["test"].select(range(200)),  
    tokenizer=tokenizer
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2597,0.435654
2,0.291,0.364821
3,0.0073,0.395615


TrainOutput(global_step=750, training_loss=0.3514659086813529, metrics={'train_runtime': 726.7428, 'train_samples_per_second': 4.128, 'train_steps_per_second': 1.032, 'total_flos': 789347340288000.0, 'train_loss': 0.3514659086813529, 'epoch': 3.0})

training shows that there is some overfitting going on. I'll try regularization to prevent this. I'll also use evaluate to control the metrics.

## approach 2

In [16]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

Downloading builder script: 100%|██████████| 7.56k/7.56k [00:00<00:00, 9.60MB/s]
Downloading builder script: 100%|██████████| 7.38k/7.38k [00:00<00:00, 17.0MB/s]
Downloading builder script: 100%|██████████| 6.79k/6.79k [00:00<00:00, 40.1MB/s]


In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision_score = precision.compute(predictions=predictions, references=labels, average="macro")["precision"]
    recall_score = recall.compute(predictions=predictions, references=labels, average="macro")["recall"]
    f1_score = f1.compute(predictions=predictions, references=labels, average="macro")["f1"]
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    return {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }

In [20]:
training_args = TrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,  
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.1,  # Increased weight decay for regularization
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    lr_scheduler_type="cosine_with_restarts",  # Learning rate scheduler
    bf16=True  # Enable mixed precision for MPS or CUDA
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset["train"],
eval_dataset=tokenized_dataset["test"],
tokenizer=tokenizer,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop if no improvement after 2 epochs
)

# Train the model
trainer.train()



  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

trying something different to reduce training time

In [24]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./checkpoints",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Small batch size
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.1,
    gradient_accumulation_steps=4,  # Simulate larger batch size
    logging_dir="./logs",
    bf16=True,  # Mixed precision
    warmup_steps=500,
    max_grad_norm=1.0,
    load_best_model_at_end=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(10000)),  # Subset for faster debugging
    eval_dataset=tokenized_dataset["test"].select(range(2000)),  # Subset for evaluation
    tokenizer=tokenizer
)

# Train
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6064,0.333551
2,0.2607,0.247729
3,0.1499,0.292301


TrainOutput(global_step=1875, training_loss=0.29421167399088544, metrics={'train_runtime': 18043.3944, 'train_samples_per_second': 1.663, 'train_steps_per_second': 0.104, 'total_flos': 3974163701760000.0, 'train_loss': 0.29421167399088544, 'epoch': 3.0})