# Full Fine-tuning

In [12]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_tasks_num_labels as num_labels

device_name = "cpu"  # default device is CPU
if torch.cuda.is_available():
    # I read that this works for detecting if notebook is being run in a colab environment, not sure though
    device_name = "cuda" # CUDA for NVIDIA GPU
elif torch.backends.mps.is_available():
    device_name = torch.device("mps")  # Metal Performance Shaders for Apple M-series GPU

# device_name = "cuda:0"
device = torch.device(device_name)
print(device)

# Define hyperparameters
batch_size = 32
learning_rate = 1e-3
num_epochs = 3

# Define the GLUE task and model
task_name = "sst-2"  # Example task: MRPC (you can change it to other GLUE tasks)
model_name = "distilbert-base-uncased"  # Pretrained BERT model

# Load the GLUE task-specific tokenizer and processor
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
processor = processors[task_name]()
label_list = processor.get_labels()
num_labels = num_labels[task_name]

# Load and preprocess the GLUE dataset
train_examples = processor.get_train_examples('./sst2/')
train_features = convert_examples_to_features(train_examples, tokenizer, label_list=label_list, max_length=512, output_mode=output_modes[task_name])

mps


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
train_data = TensorDataset(torch.tensor([f.input_ids for f in train_features], dtype=torch.long),
                            torch.tensor([f.attention_mask for f in train_features], dtype=torch.long),
                            torch.tensor([f.label for f in train_features], dtype=torch.long))

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Load the pretrained BERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)
model.to(device)

# Define the loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate) 

In [19]:
from tqdm.notebook import tqdm
import time

# Training loop
for epoch in tqdm(range(num_epochs)):
    start_time = time.time()
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_dataloader):
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {average_loss:.4f}")

    # Record the end time
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time

    print(f"Epoch {epoch + 1} ---")
    print(f"Elapsed time: {elapsed_time} seconds\n")

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/268 [00:00<?, ?it/s]

RuntimeError: Placeholder storage has not been allocated on MPS device!