# Setup and install libraries

In [None]:
# Install the third-party packages used by the later cells into the kernel's
# environment (%pip targets the running kernel, unlike a bare shell pip).
# NOTE(review): no versions are pinned, so a fresh run installs whatever pip
# resolves at that moment — consider pinning for reproducibility.
%pip install torch datasets transformers huggingface_hub accelerate scikit-learn
# NOTE(review): torch is not referenced directly in the visible cells;
# presumably imported here to confirm the install succeeded — confirm.
import torch

# Preprocess data

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
imdb = load_dataset("imdb")

# Shuffle each split with a fixed seed, then keep only the leading rows:
# 3000 examples for training and 300 for evaluation.
small_train_dataset = (
    imdb["train"]
    .shuffle(seed=42)
    .select(range(3000))
)
small_test_dataset = (
    imdb["test"]
    .shuffle(seed=42)
    .select(range(300))
)

# Load the tokenizer that ships with the pretrained DistilBERT checkpoint.
# Tokenization breaks raw text into the smaller units (tokens) the model consumes.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization step applied to the datasets through their `map` method.
def preprocess(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Args:
        examples: Mapping that exposes the raw review text under the
            ``"text"`` key.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Run `preprocess` over both subsets; batched=True hands the function batches
# of examples rather than one example per call.
tokenized_train = small_train_dataset.map(preprocess, batched=True)
tokenized_test = small_test_dataset.map(preprocess, batched=True)

# Collator that turns tokenized samples into PyTorch tensors and concatenates
# them with the correct amount of padding (the tokenization step above
# truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training the model
We can discard the pretraining head of the DistilBERT model and replace it with a classification head that is fine-tuned for sentiment analysis.
This uses the Trainer API and transfers the knowledge learned by DistilBERT to the custom model.

In [3]:
from transformers import AutoModelForSequenceClassification
import numpy as np
from datasets import load_metric

# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# sized for two labels. The head's weights are not part of the checkpoint and
# are newly initialized (see the warning printed below), so the model must be
# fine-tuned before it is used for predictions.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# define metrics to evaluate model: accuracy and binary F1 score
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    The metrics are computed directly with NumPy instead of being fetched
    through ``load_metric`` on every call, so evaluation needs no metric
    download and does not depend on the deprecated ``datasets.load_metric``
    API.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the per-class scores on the last axis; ``labels`` holds the
            integer class ids, where ``1`` is treated as the positive class.

    Returns:
        dict: ``{"accuracy": float, "f1": float}``. F1 is ``0.0`` when there
        are neither true nor predicted positives, and both values are ``0.0``
        for empty input.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard against an empty evaluation set (the mean of nothing would be NaN).
    if labels.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    # Binary F1 for the positive class (label 1): 2*TP / (2*TP + FP + FN).
    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Authenticate with the Hugging Face Hub via the interactive login widget.
# The training arguments below set push_to_hub=True, which needs credentials.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# define training arguments and trainer with all objects constructed
from transformers import TrainingArguments, Trainer

# Directory name passed to TrainingArguments as output_dir.
repo_name = "hf_tutorial"

training_args = TrainingArguments(
    # where results go and how they are saved / published
    output_dir=repo_name,
    save_strategy="epoch",
    push_to_hub=True,
    # optimisation hyperparameters
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Bring together every object built in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)


In [6]:
# Fine-tune the model using the datasets and arguments given to the Trainer.
# The return value (a TrainOutput with the step count, training loss and
# runtime metrics) is displayed as the cell output.
trainer.train()

  0%|          | 0/376 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 1038.5633, 'train_samples_per_second': 5.777, 'train_steps_per_second': 0.362, 'train_loss': 0.27325242630978847, 'epoch': 2.0}


TrainOutput(global_step=376, training_loss=0.27325242630978847, metrics={'train_runtime': 1038.5633, 'train_samples_per_second': 5.777, 'train_steps_per_second': 0.362, 'train_loss': 0.27325242630978847, 'epoch': 2.0})

In [9]:
# Evaluate the fine-tuned model on the Trainer's eval_dataset; the result
# reports the evaluation loss together with the accuracy and F1 values
# returned by compute_metrics.
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

{'eval_loss': 0.31526488065719604,
 'eval_accuracy': 0.8666666666666667,
 'eval_f1': 0.8717948717948718,
 'eval_runtime': 23.0654,
 'eval_samples_per_second': 13.007,
 'eval_steps_per_second': 0.824,
 'epoch': 2.0}