Fine-tune DistilBERT on the IMDb dataset using the AdamW optimizer

# Necessary Libraries

In [1]:
# Install the Hugging Face libraries imported by the cells below into the kernel's environment.
%pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.12

# Load Dataset and Model

In [40]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch

# Select the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [37]:
# Checkpoint that both the model and the tokenizer are loaded from.
model_name = "distilbert-base-uncased"

# Sequence-classification model with a two-label head on top of the checkpoint.
student_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
from datasets import load_dataset

# Tokenizer that matches the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# IMDb dataset; the "train" and "test" splits are used further down.
dataset = load_dataset("imdb")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw inputs.

    Returns:
        Whatever the module-level `tokenizer` returns for those texts, called
        with truncation enabled (no explicit max_length is given). Padding is
        not requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize both splits in batches; "train" is processed first, then "test".
train_dataset, test_dataset = (
    dataset[split].map(tokenize_function, batched=True)
    for split in ("train", "test")
)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

# Evaluation Metrics

In [43]:
import evaluate
import numpy as np

# Accuracy metric object, loaded on first use and then reused, so that
# evaluate.load() is not repeated on every evaluation pass.
_accuracy_metric = None


def metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        The result of the "accuracy" metric's ``compute`` for the predicted
        classes against ``labels``.
    """
    global _accuracy_metric
    if _accuracy_metric is None:
        _accuracy_metric = evaluate.load("accuracy")

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)
    return _accuracy_metric.compute(predictions=predictions, references=labels)

# Train Model

In [44]:
# Fine-tuning configuration: a single epoch with the PyTorch AdamW optimizer
# and a linear learning-rate schedule, evaluating and checkpointing per epoch.
training_args = TrainingArguments(
    output_dir="distilbert-finetuned-imdb",  # Directory for model checkpoints
    eval_strategy="epoch",                   # Evaluate after each epoch
    save_strategy="epoch",                   # Save a checkpoint after each epoch
    learning_rate=5e-5,                      # Learning rate
    per_device_train_batch_size=16,          # Training batch size per device
    per_device_eval_batch_size=16,           # Evaluation batch size per device
    num_train_epochs=1,                      # Single epoch to keep training time short
    # weight_decay is not set here, so the library default applies.
    logging_steps=10,                        # Log every 10 steps
    load_best_model_at_end=True,             # Reload the best checkpoint when training ends
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so this cell also runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    report_to="none",                        # Do not report to external experiment trackers
    optim="adamw_torch",                     # PyTorch AdamW optimizer
    lr_scheduler_type="linear",              # Linear learning-rate scheduler
)

In [45]:
from transformers import DataCollatorWithPadding

# Pad each batch to the length of its longest sequence (dynamic padding).
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=student_model,
    args=training_args,
    data_collator=collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=metrics,
    # Hand the tokenizer to the Trainer so it is saved with each checkpoint
    # and uploaded by push_to_hub; otherwise the published repo holds only
    # the model weights and cannot be loaded on its own.
    processing_class=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2573,0.190451,0.9298


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=1563, training_loss=0.25673115352598885, metrics={'train_runtime': 443.8732, 'train_samples_per_second': 56.322, 'train_steps_per_second': 3.521, 'total_flos': 3280166004732288.0, 'train_loss': 0.25673115352598885, 'epoch': 1.0})

# Upload model to Hugging Face Hub

In [46]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; needed before pushing to the Hub below.
notebook_login()  # Log in to the Hugging Face account

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [47]:
# Extra metadata passed to push_to_hub together with the commit message.
kwargs = dict(
    dataset_tags="imdb",
    dataset="imdb",
    model_name=f"{model_name}-finetuned-imdb",
    finetuned_from=model_name,
    tasks="text-classification",
)

trainer.push_to_hub(
    commit_message="Using different optimizer and learning rate scheduler to improve performance",
    **kwargs,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/avanishd/distilbert-finetuned-imdb/commit/ff5518bd2e8e66968651e6f3e73f7010d653e514', commit_message='Using different optimizer and learning rate scheduler to improve performance', commit_description='', oid='ff5518bd2e8e66968651e6f3e73f7010d653e514', pr_url=None, repo_url=RepoUrl('https://huggingface.co/avanishd/distilbert-finetuned-imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='avanishd/distilbert-finetuned-imdb'), pr_revision=None, pr_num=None)