In [1]:
import sys, platform
import torch, transformers, datasets, evaluate, sklearn, pandas as pd, numpy as np

print("Python:", sys.version.split()[0], "| OS:", platform.system())
print("PyTorch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("datasets:", datasets.__version__, "| evaluate:", evaluate.__version__)

  from .autonotebook import tqdm as notebook_tqdm


Python: 3.13.5 | OS: Windows
PyTorch: 2.9.0+cpu | CUDA available: False
Transformers: 4.57.1
datasets: 4.3.0 | evaluate: 0.4.6


In [2]:
from datasets import load_dataset

raw = load_dataset("glue", "sst2")
# take a small slice so it runs fast
train_small = raw["train"].select(range(1000))
valid_small = raw["validation"].select(range(500))

len(train_small), len(valid_small), train_small[0]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating train split: 100%|██████████| 67

(1000,
 500,
 {'sentence': 'hide new secretions from the parental units ',
  'label': 0,
  'idx': 0})

In [3]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_fn(batch):
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128)

train_tok = train_small.map(tokenize_fn, batched=True, remove_columns=train_small.column_names)
valid_tok = valid_small.map(tokenize_fn, batched=True, remove_columns=valid_small.column_names)

# add labels back (Trainer expects a 'labels' column)
train_tok = train_tok.add_column("labels", train_small["label"])
valid_tok = valid_tok.add_column("labels", valid_small["label"])

train_tok[0], valid_tok[0]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 1000/1000 [00:00<00:00, 14372.67 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 16526.54 examples/s]


({'input_ids': [101,
   5342,
   2047,
   3595,
   8496,
   2013,
   1996,
   18643,
   3197,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'attention_mask': [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   

In [4]:
from transformers import AutoModelForSequenceClassification

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)
model.__class__.__name__, model.config.num_labels


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


('DistilBertForSequenceClassification', 2)

In [6]:
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": metric.compute(predictions=preds, references=labels)["accuracy"]}

args = TrainingArguments(
    output_dir="outputs/distilbert_sst2_small",
    eval_strategy="epoch",      # <-- updated name
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
    seed=42,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.503,0.375207,0.842


TrainOutput(global_step=63, training_loss=0.4858779528784373, metrics={'train_runtime': 239.8065, 'train_samples_per_second': 4.17, 'train_steps_per_second': 0.263, 'total_flos': 33116849664000.0, 'train_loss': 0.4858779528784373, 'epoch': 1.0})

In [7]:
trainer.evaluate()




{'eval_loss': 0.37520715594291687,
 'eval_accuracy': 0.842,
 'eval_runtime': 33.4632,
 'eval_samples_per_second': 14.942,
 'eval_steps_per_second': 0.478,
 'epoch': 1.0}