# Setup and install libraries

In [None]:
%pip install torch datasets transformers huggingface_hub accelerate scikit-learn
import torch

# Preprocess data

In [6]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
# Load SST-2 and carve out tiny subsets so the notebook runs quickly.
dataset = load_dataset("sst2")
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(4))
# The SST-2 "test" split is published without gold labels (every label is -1),
# so it cannot be used to compute a loss or metrics. Evaluate on the labelled
# "validation" split instead. The variable keeps its original name because
# later cells refer to `small_test_dataset`.
small_test_dataset = dataset["validation"].shuffle(seed=42).select(range(2))

# Tokenization breaks text into smaller units (tokens).
# Load the pretrained tokenizer for the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

# Tokenize the text inputs for the model; applied to each split via the map method
def preprocess(examples):
    """Tokenize the ``"sentence"`` entry of ``examples`` with truncation enabled.

    Args:
        examples: Mapping that has a ``"sentence"`` key. The notebook calls
            this through ``map(..., batched=True)``, so the value is
            presumably a list of strings (TODO confirm).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Tokenize both subsets (train first, then test) with the same preprocess function
tokenized_train, tokenized_test = (
    split.map(preprocess, batched=True)
    for split in (small_train_dataset, small_test_dataset)
)

# Collator that converts samples to PyTorch tensors and concatenates them
# with the correct amount of padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Training the model
We can discard DistilBERT's pretraining head and replace it with a classification head that is fine-tuned for sentiment analysis.
This is done with the Trainer API, which transfers the knowledge DistilBERT learned during pretraining to the custom model.

In [7]:
from transformers import AutoModelForSequenceClassification
import numpy as np
from datasets import load_metric

# DistilBERT checkpoint loaded with a 2-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# define metrics to evaluate model, accuracy and F1 score
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 (positive class = 1) for one evaluation pass.

    The scores are computed directly with NumPy. The earlier version called
    ``datasets.load_metric`` twice on every evaluation; that API is deprecated
    (and absent from recent ``datasets`` releases), and reloading the metric
    on each call was unnecessary work.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax over the last axis of ``logits``; ``labels`` holds the
            reference class ids.

    Returns:
        dict with two floats: ``"accuracy"`` and ``"f1"``. ``"f1"`` is 0.0
        when there are no positive predictions and no positive labels, and
        ``"accuracy"`` is 0.0 for an empty batch.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Guard the empty case so np.mean does not produce NaN plus a warning.
    accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = labels == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2*TP / (2*TP + FP + FN); defined as 0.0 when the denominator is 0.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# define training arguments and trainer with all objects constructed
from transformers import TrainingArguments, Trainer

# Settings for the fine-tuning run, grouped by purpose
training_args = TrainingArguments(
    # where checkpoints go and how often they are written
    output_dir="sst2",
    save_strategy="epoch",
    # NOTE(review): presumably needs Hugging Face Hub credentials to be
    # configured before this cell runs; confirm
    push_to_hub=True,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Bundle the model, run settings, data splits and helpers into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # batching / tokenization helpers
    data_collator=data_collator,
    tokenizer=tokenizer,
    # accuracy and F1 reporting
    compute_metrics=compute_metrics,
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
# train model
# NOTE: this cell needs `trainer` from the cell that constructs the Trainer.
# The NameError recorded in its output comes from running it on a kernel where
# that cell had not been executed yet; run the notebook top to bottom.
trainer.train()

NameError: name 'trainer' is not defined

In [1]:
# Evaluate the model held by `trainer`; the returned metrics are displayed as the cell output.
# NOTE: like the training cell, this fails with NameError unless the cell that
# builds `trainer` has already been run on the current kernel.
trainer.evaluate()

NameError: name 'trainer' is not defined