In [57]:
import numpy as np

import json

import torch

from transformers import AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer
from torchinfo import summary
from datasets import load_dataset
import evaluate

![flow-model-requirement](images/Flow-min-requirement-Training-FineTuning.png)

# Tokenizer Model Preparation

In [13]:
# Name of the pretrained checkpoint; the same name is later used to load the model.
# Alternative checkpoint that can be swapped in: 'bert-base-uncased'
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

In [3]:
# Tokenization helper, applied to the dataset through `map`

def tokenize_fn(batch):
    """Tokenize the 'sentence' column of a batch.

    Args:
        batch: mapping with a 'sentence' entry holding the text to encode.

    Returns:
        Whatever the module-level `tokenizer` returns for those sentences,
        with truncation enabled.
    """
    sentences = batch['sentence']
    return tokenizer(sentences, truncation=True)

# Load Dataset and Data Preparation

In [4]:
# Load the 'sst2' configuration of the 'glue' dataset; the result is a DatasetDict of splits.
raw_datasets = load_dataset('glue', 'sst2')

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [6]:
raw_datasets['train'][0:3]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature '],
 'label': [0, 0, 1],
 'idx': [0, 1, 2]}

In [7]:
# Tokenize every split, passing batches of examples to tokenize_fn (batched=True).

tokenized_datasets = raw_datasets.map(tokenize_fn, batched=True)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [9]:
tokenized_datasets['train'][0:3]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature '],
 'label': [0, 0, 1],
 'idx': [0, 1, 2],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
  [101, 3397, 2053, 15966, 1010, 2069, 4450, 2098, 18201, 2015, 102],
  [101,
   2008,
   7459,
   2049,
   3494,
   1998,
   10639,
   2015,
   2242,
   2738,
   3376,
   2055,
   2529,
   3267,
   102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

# Training Arguments

In [27]:
# Training configuration: a single epoch, with evaluation and checkpointing at
# the end of each epoch, checkpoints written to 'trainer_demo', and external
# experiment loggers disabled.
training_args = TrainingArguments(
    output_dir='trainer_demo',
    eval_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=1,
    report_to='none',
    # Mixed-precision training needs a CUDA device. Enable it only when one is
    # available so the notebook still runs (in full precision) on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Main Model for Classification

In [15]:
# Load the pretrained encoder with a fresh 2-class classification head.
# The label names are stored in the model config at load time, so a saved model
# already carries readable class names and its config.json does not have to be
# edited by hand afterwards.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label={0: 'negative', 1: 'positive'},
    label2id={'negative': 0, 'positive': 1},
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

# Fine Tuning

In [32]:
print(torch.cuda.is_available())  # Should return True if a compatible GPU is detected


True


In [31]:
# Run this after the Trainer has been created and before training starts.
# NOTE(review): this cell is placed above the Trainer definition; on a fresh
# top-to-bottom run the model may still be on the CPU here — presumably the
# Trainer is what moves it to the GPU. Confirm and reorder if needed.
print("Model device:", next(model.parameters()).device)  # Should output "cuda" if on GPU

Model device: cuda:0


In [21]:
# Sanity check: snapshot every model parameter before training so the values
# can be compared with the parameters after training.

# `.numpy()` on a CPU tensor returns an array that shares memory with the
# tensor. Without `.copy()`, a model trained on CPU would update these arrays in
# place and the before/after comparison would report zero change everywhere.
params_before = [
    p.detach().cpu().numpy().copy()
    for p in model.parameters()
]

In [49]:
# Define the evaluation metric function.
# It is passed to the Trainer so that accuracy is reported at each evaluation.

def compute_metrics(logits_and_labels):
    """Compute classification accuracy from model logits and reference labels.

    Args:
        logits_and_labels: a pair ``(logits, labels)``. ``logits`` holds one
            score per class along the last axis; ``labels`` holds the
            reference class indices.

    Returns:
        dict: ``{'accuracy': value}``, where ``value`` is the fraction of
        examples whose highest-scoring class equals the label.
    """
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so the function does not rely on a global
    # `metric` object, which no active cell of the notebook defines.
    accuracy = np.mean(predictions == np.asarray(labels))
    return {'accuracy': float(accuracy)}

In [52]:
# Build the Trainer from the model, the training configuration, the tokenized
# train/validation splits, the tokenizer and the metric function.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [53]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1801,0.392657,0.90367


TrainOutput(global_step=8419, training_loss=0.16742661396048814, metrics={'train_runtime': 612.663, 'train_samples_per_second': 109.928, 'train_steps_per_second': 13.742, 'total_flos': 517212489917652.0, 'train_loss': 0.16742661396048814, 'epoch': 1.0})

In [54]:
# # Manual model evaluation
# metric = evaluate.load('accuracy')

# predictions = trainer.predict(tokenized_datasets['validation'])
# logits, labels = predictions.predictions, predictions.label_ids

# accuracy = metric.compute(predictions=np.argmax(logits, axis=-1), references=labels)
# print(accuracy)

In [55]:
# Save the model
# trainer.save_model('model_demo')

In [64]:
# Sanity check: compare the parameters after training with the earlier snapshot.
# For each parameter tensor, print the summed absolute difference.
params_after = [p.detach().cpu().numpy() for p in model.parameters()]

for before, after in zip(params_before, params_after):
    total_abs_change = np.abs(before - after).sum()
    print(total_abs_change)

27883.146
161.08818
2.7293258
1.986839
2232.5437
2.6709063
2213.6782
1.9484951
2012.0284
1.753175
1948.2728
1.3667009
2.744045
1.3348948
8460.632
9.250202
7858.458
1.0969527
2.6402593
1.2763145
2163.956
2.404458
2154.2632
1.6051614
1961.2778
1.380275
1899.1196
1.1958301
2.800476
1.2029843
8395.897
8.556961
7781.2656
1.0476208
2.5595791
1.3969865
2143.835
2.6230874
2156.0964
1.7447972
1924.1206
1.239581
1907.9014
1.15749
2.7441812
1.262913
8444.629
9.246374
7612.789
1.1211011
2.3512888
1.0532508
2134.5413
2.2581916
2151.4163
1.7893755
1934.0452
1.189076
1861.9167
1.3203943
2.5234005
1.3941909
8302.032
9.428415
7296.802
1.3004622
2.1989338
1.0753925
2038.1344
2.4453697
2019.7267
1.1392417
1721.0021
1.2407024
1686.5559
1.885263
2.2727752
1.7646315
7594.2646
9.256289
6494.3145
1.8898032
2.2399554
1.8041012
1871.8185
2.3218775
1886.4762
0.84884477
1564.0212
2.348853
1577.3542
2.3680573
2.5383296
2.599451
6691.4673
9.455466
6277.17
2.1716945
4.818941
1.4956582
1526.9487
1.726846
4.4161253
0.

# Fix Label Names Manually

In [62]:
# config_path = './model_demo/config.json'
# with open(config_path) as f:
#     j = json.load(f)
#     print("Before updated:")
#     print(j)

#     # Add 'id2label' to define the label names
#     j['id2label'] = {0: 'negative', 1: 'positive'}

#     with open(config_path, 'w') as f:
#         json.dump(j, f, indent=2)

In [61]:
# # Cross check
# with open(config_path) as f:
#     j = json.load(f)

# print(j)

{'_name_or_path': 'distilbert-base-uncased', 'activation': 'gelu', 'architectures': ['DistilBertForSequenceClassification'], 'attention_dropout': 0.1, 'dim': 768, 'dropout': 0.1, 'hidden_dim': 3072, 'initializer_range': 0.02, 'max_position_embeddings': 512, 'model_type': 'distilbert', 'n_heads': 12, 'n_layers': 6, 'pad_token_id': 0, 'problem_type': 'single_label_classification', 'qa_dropout': 0.1, 'seq_classif_dropout': 0.2, 'sinusoidal_pos_embds': False, 'tie_weights_': True, 'torch_dtype': 'float32', 'transformers_version': '4.44.2', 'vocab_size': 30522, 'id2label': {'0': 'negative', '1': 'positive'}}
