## Hugging Face Transformers Tutorial


See the Hugging Face Transformers documentation for more details: https://huggingface.co/docs/transformers/index

It is recommended to run this notebook in Google Colab since it provides free GPU access.


In [None]:
!pip install datasets
!pip install -U "transformers>=4.40.0"

In [None]:
import transformers
from transformers import TrainingArguments
import inspect, os

# Report which transformers install this kernel actually picked up, and the
# TrainingArguments signature it exposes (argument names differ across releases).
for label, value in (
    ("Transformers version:", transformers.__version__),
    ("Transformers module path:", transformers.__file__),
    ("TrainingArguments module:", TrainingArguments.__module__),
    ("TrainingArguments signature:", inspect.signature(TrainingArguments.__init__)),
):
    print(label, value)

In [None]:
from collections import defaultdict, Counter
import json

from matplotlib import pyplot as plt
import numpy as np
import torch

def print_encoding(model_inputs, indent=4):
    """Pretty-print a mapping of tokenizer outputs, one key per block.

    Each key is printed on its own line indented by ``indent`` spaces, followed
    by ``str(value)`` on the next line indented by twice that amount. The whole
    listing is wrapped in a pair of braces.

    Args:
        model_inputs: Mapping with string keys (anything exposing ``.items()``).
        indent: Number of spaces used for one indentation level.
    """
    key_pad = " " * indent
    value_pad = key_pad * 2
    print("{")
    for name, value in model_inputs.items():
        print(key_pad + name + ":")
        print(value_pad + str(value))
    print("}")

## Part 2: Finetuning

### 2.1 Loading in a dataset

In [None]:
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader

imdb_dataset = load_dataset("imdb")

# Keep only the first few whitespace-separated words of each review for speed.
def truncate(example, max_words=50):
    """Shorten one example's text to its first ``max_words`` words.

    Words are whatever ``str.split()`` yields, i.e. whitespace-separated
    chunks, not model tokens. The kept words are re-joined with single spaces,
    so runs of whitespace in the original text are collapsed.

    Args:
        example: Mapping with a ``'text'`` string and a ``'label'`` value.
        max_words: Maximum number of words to keep (default 50).

    Returns:
        A new dict with the shortened ``'text'`` and the unchanged ``'label'``.
    """
    return {
        'text': " ".join(example['text'].split()[:max_words]),
        'label': example['label'],
    }

# Build a small dataset: shuffle the training split with a fixed seed, then
# take rows 0-127 for training and rows 128-159 for validation.
shuffled_train = imdb_dataset['train'].shuffle(seed=1111)

small_imdb_dataset = DatasetDict({
    'train': shuffled_train.select(range(128)).map(truncate),
    'val': shuffled_train.select(range(128, 160)).map(truncate),
})

In [None]:
small_imdb_dataset

In [None]:
small_imdb_dataset['train'][:10]

In [None]:
from transformers import DistilBertTokenizer, DistilBertTokenizerFast, AutoTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

In [None]:
# Prepare the dataset: tokenize in batches of 16 examples, then drop the raw
# text, rename the label column to `labels`, and return torch tensors.
def _tokenize_batch(batch):
    """Tokenize the `text` column of one batch with padding and truncation."""
    return tokenizer(batch['text'], padding=True, truncation=True)


small_tokenized_dataset = (
    small_imdb_dataset
    .map(_tokenize_batch, batched=True, batch_size=16)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
small_tokenized_dataset.set_format("torch")

In [None]:
small_tokenized_dataset['train'][0:2]

In [None]:
from torch.utils.data import DataLoader

# One DataLoader per split, both with batches of 16 (no shuffle argument is passed).
train_dataloader, eval_dataloader = (
    DataLoader(small_tokenized_dataset[split_name], batch_size=16)
    for split_name in ('train', 'val')
)

### 2.2 Training

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup, DistilBertForSequenceClassification
from tqdm.notebook import tqdm
import torch

In [None]:
# Schedule length: every batch of every epoch counts as one training step.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)

# DistilBERT with a two-label classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

# Linear schedule with zero warmup steps, spanning all training steps.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import os

os.makedirs("checkpoints", exist_ok=True)

In [None]:
# Manual fine-tuning loop: train for `num_epochs`, validate after every epoch,
# and write a checkpoint whenever the average validation loss improves.
loss = 0  # placeholder; rebound to the batch loss inside the training loop

# Lowest average validation loss seen so far (checkpoint is written only on improvement).
best_val_loss = float("inf")
# One progress-bar tick per optimizer step, across all epochs.
progress_bar = tqdm(range(num_training_steps))


for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    # ------------------------ TRAIN -----------------------
    # NOTE(review): neither the model nor the batches are moved to a device here,
    # so this runs wherever they already live — confirm if GPU training is intended.
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(**batch)  # the batch carries `labels`, so the output exposes `.loss`
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per optimizer step
        progress_bar.update(1)

    # ---------------------- VALIDATION ---------------------
    model.eval()
    val_loss = 0  # running sum of per-batch losses for this epoch
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
            val_loss += outputs.loss.item()

    # Mean of the per-batch losses (each batch weighs the same regardless of its size).
    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")

    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        best_val_loss = avg_val_loss
        # The file name uses the zero-based epoch index, whereas the printed
        # epoch number above is one-based.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
        }, f"checkpoints/epoch_{epoch}.pt")


In [None]:
# Rebuild the small train/val splits from scratch for the Trainer-based run.
imdb_dataset = load_dataset("imdb")
shuffled_train = imdb_dataset['train'].shuffle(seed=1111)

small_imdb_dataset = DatasetDict({
    'train': shuffled_train.select(range(128)).map(truncate),
    'val': shuffled_train.select(range(128, 160)).map(truncate),
})

# Unlike the manual-loop cell, no padding is requested at tokenization time here.
map_options = {"batched": True, "batch_size": 16}
small_tokenized_dataset = small_imdb_dataset.map(
    lambda example: tokenizer(example['text'], truncation=True),
    **map_options,
)

In [None]:
from transformers import TrainingArguments, Trainer, DistilBertForSequenceClassification
import numpy as np  # make sure this is imported

# Fresh DistilBERT classifier with a two-label head for the Trainer run.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=2)

# Evaluation and checkpointing both use the "epoch" strategy, and the best
# checkpoint is reloaded when training ends (load_best_model_at_end).
arguments = TrainingArguments(
    output_dir="sample_hf_trainer",
    seed=224,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    The predicted class of each row is the argmax over the last axis of the
    logits; accuracy is the fraction of rows whose prediction equals the label.

    Returns:
        A dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_pred
    is_correct = np.argmax(scores, axis=-1) == gold
    return {"accuracy": np.mean(is_correct)}

# Newer transformers releases accept the tokenizer as `processing_class` and
# deprecate the `tokenizer=` keyword. The install cell upgrades to whatever is
# latest, so pick whichever keyword this installed Trainer actually declares.
import inspect

_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
import json
import os
from transformers import TrainerCallback, EarlyStoppingCallback

class LoggingCallback(TrainerCallback):
    """Append each Trainer log event to a JSON-lines file.

    Args:
        log_path: Destination file. One JSON object is appended per log event.
            Its parent directory is created if the path has one.
    """

    def __init__(self, log_path):
        self.log_path = log_path
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        log_dir = os.path.dirname(log_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Write ``logs`` (without ``total_flos``) as one JSON line.

        Does nothing when ``logs`` is None or when this is not the local main
        process.
        """
        if logs is None:
            return

        # only the main process writes (important in distributed setups)
        if not state.is_local_process_zero:
            return

        # Drop the very large, unneeded field from a filtered copy instead of
        # popping it in place: the caller's dict may be shared with other
        # callbacks, and this logger has no reason to modify it.
        record = {key: value for key, value in logs.items() if key != "total_flos"}

        with open(self.log_path, "a") as f:
            f.write(json.dumps(record) + "\n")


In [None]:
import os
# Opt out of Weights & Biases logging through its environment switch.
# NOTE(review): the Trainer has already been constructed by the time this cell
# runs — confirm the variable still takes effect at this point; a later cell
# also removes WandbCallback from the trainer explicitly.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from transformers import EarlyStoppingCallback
from transformers.integrations import WandbCallback, TensorBoardCallback

# Strip any pre-registered callbacks of these types, if present...
for builtin_callback in (EarlyStoppingCallback, WandbCallback, TensorBoardCallback):
    trainer.remove_callback(builtin_callback)

# ...then register our own: early stopping with patience 1 and threshold 0.0,
# plus the JSON-lines logger.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.0)
jsonl_logger = LoggingCallback("sample_hf_trainer/log.jsonl")
trainer.add_callback(early_stopping)
trainer.add_callback(jsonl_logger)

In [None]:
# train the model
trainer.train()

In [None]:
# evaluating the model is very easy

# results = trainer.evaluate()                           # just gets evaluation metrics
results = trainer.predict(small_tokenized_dataset['val']) # also gives you predictions

In [None]:
results

In [None]:
from transformers import AutoModelForSequenceClassification
# To load our saved model, we can pass the path to the checkpoint into the `from_pretrained` method.
test_str = "I enjoyed the movie!"

# Early stopping can end training before step 24, in which case
# "checkpoint-24" is never written. Prefer the checkpoint the Trainer recorded
# as best and only fall back to the hard-coded path if none was recorded.
checkpoint_path = trainer.state.best_model_checkpoint or "sample_hf_trainer/checkpoint-24"

finetuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model_inputs = tokenizer(test_str, return_tensors="pt")
# Inference only: skip autograd bookkeeping, and convert the argmax to a plain
# int so it can index the label list directly.
with torch.no_grad():
    prediction = torch.argmax(finetuned_model(**model_inputs).logits).item()
print(["NEGATIVE", "POSITIVE"][prediction])

Here are also some practical tips for fine-tuning:

**Good default hyperparameters.**

* Epochs: {2, 3, 4} (larger amounts of data need fewer epochs)
* Batch size (bigger is better: as large as you can make it)
* Optimizer: AdamW
* AdamW learning rate: {2e-5, 5e-5}
* Learning rate scheduler: linear warm up for first {0, 100, 500} steps of training
* weight_decay (l2 regularization): {0, 0.01, 0.1}


## Part 3:  Generation

In [None]:
from transformers import AutoModelForCausalLM

# Load the tokenizer and the model from one checkpoint name so the pair cannot
# drift apart (the tokenizer was previously loaded from 'gpt2' while the model
# came from 'distilgpt2').
GPT2_CHECKPOINT = 'distilgpt2'

gpt2_tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)

gpt2 = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)
gpt2.config.pad_token_id = gpt2.config.eos_token_id  # Prevents warning during decoding

In [None]:
prompt = "Once upon a time"

tokenized_prompt = gpt2_tokenizer(prompt, return_tensors="pt")

# Draw ten independent samples (top-p 0.9, max_length 50) and print them
# numbered from 1.
for sample_number in range(1, 11):
    output = gpt2.generate(
        **tokenized_prompt,
        max_length=50,
        do_sample=True,
        top_p=0.9,
    )
    decoded = gpt2_tokenizer.batch_decode(output)[0]
    print(f"{sample_number}) {decoded}")

## Defining Custom Datasets

In [None]:
# Option 1: Load into Hugging Face Datasets

# Kaggle download: https://www.kaggle.com/datasets/mexwell/the-e2e-challenge-dataset
import pandas as pd
from datasets import Dataset

df = pd.read_csv("e2e-dataset/trainset.csv")
custom_dataset = Dataset.from_pandas(df)

In [None]:
import csv
from torch.utils.data import Dataset, DataLoader

class E2EDataset(Dataset):
    """Map-style dataset over a two-column CSV that tokenizes lazily.

    The header row is skipped. Column 0 of each remaining row is stored as the
    source text and column 1 as the target text. Tokenization happens on every
    ``__getitem__`` call rather than up front.

    Args:
        path: Path to the CSV file.
        tokenizer: Callable applied to a single string. Its result must support
            item assignment, and the result for the target must expose
            ``input_ids``.
    """

    def __init__(self, path, tokenizer):
        # Read as UTF-8 explicitly so parsing does not depend on the platform's
        # default encoding. NOTE(review): assumes the CSV is UTF-8 — confirm
        # for the dataset in use.
        with open(path, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader)  # skip the heading
            self.data = [{"source": row[0], "target": row[1]} for row in reader]
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows, so samplers and ``len()`` work on the dataset."""
        return len(self.data)

    def __getitem__(self, i):
        """Tokenize row ``i``; the target's ``input_ids`` are stored under ``'labels'``."""
        row = self.data[i]
        inputs = self.tokenizer(row['source'])
        labels = self.tokenizer(row['target'])
        inputs['labels'] = labels.input_ids
        return inputs

In [None]:
bart_tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

In [None]:
dataset = E2EDataset("e2e-dataset/trainset.csv", bart_tokenizer)

In [None]:
import torch

src_texts = ["This is the first test.", "This is the second test."]
tgt_texts = ["Target 1", "Target 2"]

# Shared tokenization options: cap at 128 tokens, pad the batch, return torch tensors.
encode_options = dict(max_length=128, truncation=True, padding=True, return_tensors="pt")

# Sources and targets are tokenized in a single call.
batch = bart_tokenizer(src_texts, text_target=tgt_texts, **encode_options)

batch  # contains input_ids, attention_mask, labels

## Pipelines

In [None]:
from transformers import pipeline

sentiment_analysis = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

You can run the pipeline by just calling it on a string

In [None]:
sentiment_analysis("Hugging Face Transformers is really cool!")

Or on a list of strings:

In [None]:
sentiment_analysis(["I didn't know if I would like Hákarl, but it turned out pretty good.",
                    "I didn't know if I would like Hákarl, and it was just as bad as I'd heard."])

## Masked Language Modeling

In [None]:
from transformers import AutoModelForMaskedLM

# `use_fast` is the documented AutoTokenizer option for requesting the fast
# tokenizer; the previous `fast=True` is not that option.
# Note: this rebinds `tokenizer`, which earlier cells bound to the DistilBERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
bert = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

In [None]:
prompt = "I am [MASK] to learn about HuggingFace!"
# NOTE: this rebinds `model`, which earlier cells bound to the DistilBERT classifier.
model = pipeline("fill-mask", "bert-base-cased")
model(prompt)  # last expression of the cell, so the notebook displays the pipeline output

In [None]:
inputs = tokenizer(prompt, return_tensors="pt")

# Locate the [MASK] position(s) with torch rather than NumPy: np.where has to
# convert the tensor to a NumPy array first, which fails for tensors that do
# not live on the CPU.
mask_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)

# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = bert(**inputs)

# Softmax over the vocabulary axis of the logits at the masked position(s),
# then keep the five most probable token ids.
top_5_predictions = torch.softmax(outputs.logits[mask_index], dim=1).topk(5)

print(prompt)
for i in range(5):
    # Row 0 is the first masked position (the prompt contains one [MASK]).
    prediction = tokenizer.decode(top_5_predictions.indices[0, i])
    prob = top_5_predictions.values[0, i]
    print(f"  {i+1}) {prediction}\t{prob:.3f}")