In [19]:
import numpy as np

import json

import torch

from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from torchinfo import summary
from datasets import load_dataset
import evaluate

![flow-model-requirement](images/Flow-min-requirement-Training-FineTuning.png)

# Tokenizer Model Preparation

In [20]:
# Name of the pretrained checkpoint; the same value is reused when the
# classification model is loaded further down.
# checkpoint = 'bert-base-uncased'
checkpoint = 'distilbert-base-uncased'

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    clean_up_tokenization_spaces=True,
)

In [21]:
# Tokenization helper applied to the dataset with `map`.

def tokenize_fn(batch):
    """Tokenize the 'sentence' column of `batch` with the module-level tokenizer.

    Truncation is enabled; no padding is requested in this call, so every
    encoded example keeps its own length.

    Args:
        batch: mapping that provides the text under the key 'sentence'.

    Returns:
        Whatever the tokenizer returns for those sentences.
    """
    sentences = batch['sentence']
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Load Dataset and Data Preparation

In [None]:
# https://huggingface.co/datasets/amazon_polarity
# takes a long time to process, you may want to try it yourself
# dataset = load_dataset('amazon_polarity')

In [22]:
# Load the 'sst2' configuration of the 'glue' dataset (sentence-level sentiment labels).
raw_datasets = load_dataset('glue', 'sst2')

In [23]:
# Inspect the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [24]:
# Peek at the first three training rows; slicing returns a dict of column -> list.
raw_datasets['train'][0:3]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature '],
 'label': [0, 0, 1],
 'idx': [0, 1, 2]}

In [25]:
# Tokenize the dataset
# batched=True passes several rows at once to tokenize_fn, which is why that
# function indexes batch['sentence'] as a list of strings.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [26]:
# Check which columns the tokenization step added to each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [27]:
# Look at the encoded form of the first three training rows (no padding applied yet).
tokenized_datasets['train'][0:3]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature '],
 'label': [0, 0, 1],
 'idx': [0, 1, 2],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
  [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102],
  [101,
   2008,
   7459,
   2049,
   3494,
   1998,
   10639,
   2015,
   2242,
   2738,
   3376,
   2055,
   2529,
   3267,
   102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# Training Arguments Object

In [28]:
# Configuration for the Trainer: where to write checkpoints, how often to
# evaluate/save, and how long to train.
training_args = TrainingArguments(
    output_dir='trainer_demo',   # checkpoints are written under this directory
    eval_strategy='epoch',       # evaluate at the end of every epoch
    save_strategy='epoch',       # save a checkpoint at the end of every epoch
    num_train_epochs=1,
    report_to='none',            # do not report to any experiment tracker
    # fp16 mixed precision is meant for CUDA GPUs. Enable it only when a CUDA
    # device is actually present instead of hard-coding True, so the notebook
    # still runs (in full precision) on a CPU-only machine.
    fp16=torch.cuda.is_available(),
)

# Main Model for Classification

In [29]:
# Human-readable names for the two label ids used by the dataset
# (0 = negative, 1 = positive). Passing the mapping here stores it in the
# model config, so a saved checkpoint already carries the label names and
# config.json does not need to be edited by hand afterwards.
id2label = {0: 'negative', 1: 'positive'}
label2id = {label: idx for idx, label in id2label.items()}

# Pretrained encoder plus a freshly initialised classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Show the layer tree and the parameter count of each layer.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

# Fine Tuning

In [31]:
# Check whether PyTorch can see a CUDA device before training starts.
print(torch.cuda.is_available())  # Prints True if a compatible GPU is detected


True


In [32]:
# Report the device that currently holds the model weights.
# This cell runs BEFORE the Trainer is created, and the recorded output is
# "cpu" even though CUDA is available.
# NOTE(review): the Trainer presumably moves the model to the GPU when it is
# constructed; re-run this check after the Trainer cell to confirm.
print("Model device:", next(model.parameters()).device)  # e.g. "cpu" or "cuda:0"

Model device: cpu


In [33]:
# Sanity check: snapshot every parameter before training so the values can be
# compared with the post-training weights later on.
#
# .numpy() on a CPU tensor returns an array that shares memory with the
# tensor. Without the explicit .copy(), an in-place optimizer update on a
# model that stays on the CPU would also rewrite this "before" snapshot, and
# the later comparison would report that nothing changed.
params_before = [
    p.detach().cpu().numpy().copy()
    for p in model.parameters()
]

In [38]:
# Evaluation metric and the callback that computes it.
# The callback is handed to the Trainer so accuracy is reported at each evaluation.

metric = evaluate.load('accuracy')

def compute_metrics(logits_and_labels):
    """Return the accuracy for one evaluation pass.

    Args:
        logits_and_labels: pair that unpacks into (logits, labels).

    Returns:
        The result of `metric.compute` for the predicted classes, where the
        predicted class is the index of the largest logit along the last axis.
    """
    scores, references = logits_and_labels
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [39]:
# Build the Trainer from the model, arguments, datasets and metric callback.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# NOTE: no data_collator is passed above, so the Trainer picks its default;
# the printed value shows it is DataCollatorWithPadding.
print("Default Data Collator: ", trainer.data_collator)

Default Data Collator:  DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, 

In [40]:
# Start fine-tuning with the Trainer; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1861,0.413721,0.893349


TrainOutput(global_step=8419, training_loss=0.1468258087560561, metrics={'train_runtime': 545.6289, 'train_samples_per_second': 123.434, 'train_steps_per_second': 15.43, 'total_flos': 517212489917652.0, 'train_loss': 0.1468258087560561, 'epoch': 1.0})

In [None]:
# # Manual model evaluation
# metric = evaluate.load('accuracy')

# predictions = trainer.predict(tokenized_datasets['validation'])
# logits, labels = predictions.predictions, predictions.label_ids

# accuracy = metric.compute(predictions=np.argmax(logits, axis=-1), references=labels)
# print(accuracy)

In [None]:
# Save the model
# trainer.save_model('model_demo')

In [None]:
# Sanity check: snapshot the parameters again after training and print, for
# each parameter tensor, the summed absolute difference from the snapshot
# taken before training.
params_after = [
    weights.detach().cpu().numpy()
    for _, weights in model.named_parameters()
]

for before, after in zip(params_before, params_after):
    print(np.abs(before - after).sum())

# Fix Label Names Manually

In [None]:
# config_path = './model_demo/config.json'
# with open(config_path) as f:
#     j = json.load(f)
#     print("Before updated:")
#     print(j)

#     # Add 'id2label' to determine the label names
#     j['id2label'] = {0: 'negative', 1: 'positive'}

#     with open(config_path, 'w') as f:
#         json.dump(j, f, indent=2)

In [None]:
# # Cross check
# with open(config_path) as f:
#     j = json.load(f)

# print(j)