## Finetune an out-of-the-box (OOB) BERT-base model for binary classification

##### Prerequisites 

In [None]:
%%capture 

!pip install jupyter==1.0.0
!pip install ipywidgets==8.0.4

!pip install transformers==4.18.0
!pip install datasets==2.9.0
!pip install torch==1.10.2+cu113

#### Imports 

In [2]:
from transformers import BertForSequenceClassification
from transformers import TrainingArguments
from transformers import BertTokenizerFast
from datasets import load_dataset
from transformers import pipeline
from transformers import set_seed
from transformers import Trainer
from datasets import DatasetDict
import transformers
import numpy as np
import datasets
import logging 
import torch
import os

##### Setup logging 

In [3]:
# Notebook-wide logger; every cell below reports through it.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)

# Attach the plain stream handler only once, so re-running this cell does not
# make every message print multiple times.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

# Keep records from also reaching the root logger: once another library
# configures a root handler, each message would otherwise be printed a second
# time with an "INFO:sagemaker:" prefix.
logger.propagate = False

##### Log versions of dependencies 

In [4]:
# Record the library versions in use so a run can be matched to its environment.
for version_message in (
    f'[Using transformers version: {transformers.__version__}]',
    f'[Using datasets version: {datasets.__version__}]',
    f'[Using torch version: {torch.__version__}]',
):
    logger.info(version_message)

[Using transformers version: 4.18.0]
[Using datasets version: 2.9.0]
[Using torch version: 1.10.2+cu113]


#### Setup essentials 

In [5]:
# Tunable settings for the whole notebook.
N_GPUS = 1                # divides the CPU count when sizing tokenization workers
TRAIN_EPOCHS = 2          # passed to TrainingArguments(num_train_epochs=...)
TRAIN_BATCH_SIZE = 8      # per-device training batch size
EVAL_BATCH_SIZE = 8       # per-device evaluation batch size
MAX_LEN = 128             # presumably the maximum token length per example -- verify where it is applied
LOGGING_STEPS = 64        # passed to TrainingArguments(logging_steps=...)
SAVE_STEPS = 10240  # reduce it to a smaller value like 512 if you want to save checkpoints
SAVE_TOTAL_LIMIT = 2      # passed to TrainingArguments(save_total_limit=...)
SEED = 123                # single named seed instead of a bare literal

# Seed the random number generators before the model is instantiated.
set_seed(SEED)

#### Load BERT base model

In [6]:
# Load the pretrained `bert-base-uncased` checkpoint with a two-label
# sequence-classification head on top.
# NOTE: force_download=True asks for the files to be fetched again rather than
# reused from the local cache.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    force_download=True,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Load OOB BERT tokenizer 

In [7]:
# Load the fast tokenizer that matches the `bert-base-uncased` checkpoint and
# log its configuration for reference.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)
logger.info(f'Tokenizer: {tokenizer}')

Tokenizer: PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


#### Load classification dataset 

In [8]:
# Read the labelled CSV as a single `train` split; the Arrow cache is kept
# under /tmp/cache.
csv_load_options = {
    'data_files': './data/clf_dataset.csv',
    'delimiter': ',',
    'split': 'train',
    'cache_dir': '/tmp/cache',
}
data = load_dataset('csv', **csv_load_options)
logger.info(f'Loaded data: {data}')

Using custom data configuration default-39044f29c104f4ff
Found cached dataset csv (/tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Loaded data: Dataset({
    features: ['response', 'label'],
    num_rows: 1343
})


##### Create data splits 

In [9]:
# Hold out 10% of the shuffled rows; the held-out `test` part is exposed as the
# `validation` split (there is no separate test split).
train_validation_test = data.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=123,
)
data_splits = DatasetDict(
    train=train_validation_test['train'],
    validation=train_validation_test['test'],
)
logger.info(f'Data splits: {data_splits}')

Loading cached split indices for dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-ed98a41f56082d07.arrow and /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-460959e44de30d4d.arrow
Data splits: DatasetDict({
    train: Dataset({
        features: ['response', 'label'],
        num_rows: 1208
    })
    validation: Dataset({
        features: ['response', 'label'],
        num_rows: 135
    })
})


In [10]:
def preprocess_function(examples):
    """Tokenize a batch of examples for the BERT classifier.

    Args:
        examples: A batch of dataset rows (as supplied by a batched
            `Dataset.map`) holding the raw text under the 'response' key.

    Returns:
        The tokenizer output for the batch, truncated to at most `MAX_LEN`
        tokens per example and padded to the longest example in the batch.
    """
    # Apply the configured MAX_LEN; without max_length, truncation falls back
    # to the tokenizer's model maximum and the MAX_LEN setting is ignored.
    return tokenizer(examples['response'],
                     truncation=True,
                     max_length=MAX_LEN,
                     padding=True)

##### Tokenize 

In [11]:
# Tokenize datasets
# Size the worker pool from the CPU count divided by the number of GPUs.
# os.cpu_count() may return None, and the quotient must stay >= 1 because
# Dataset.map rejects a non-positive num_proc.
num_proc = max(1, (os.cpu_count() or 1) // max(N_GPUS, 1))
logger.info(f'Total number of processes = {num_proc}')
# Run preprocess_function over every split in batches, in parallel.
tokenized_data = data_splits.map(preprocess_function, batched=True, num_proc=num_proc)
logger.info(f'Tokenized data: {tokenized_data}')

Total number of processes = 8


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-8108d45f5a64ab26.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a0badf16af05891e.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f41e83dce88555db.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-1824e501e57253df.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0dff67b18a3d0365.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-168c7699a0fd2db9.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-64aee7158e98e85e.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f6cb8ad7392cd6ca.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f8c1bf4a34e15e82.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-cef96a60e04705a5.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-557486df23a29dd1.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4698d922a96e6687.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-75aaff94f57aebcb.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-55e1f87166b1c73e.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4d23b9c44e5619ab.arrow


 

Loading cached processed dataset at /tmp/cache/csv/default-39044f29c104f4ff/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f934398fed8bf0d7.arrow
Tokenized data: DatasetDict({
    train: Dataset({
        features: ['response', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1208
    })
    validation: Dataset({
        features: ['response', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 135
    })
})


#### Finetune 

##### Define training hyperparameters 

In [12]:
# Hyperparameters for the finetuning run, collected in one mapping and then
# handed to TrainingArguments.
training_kwargs = dict(
    # Where checkpoints and logs go.
    output_dir='./model',
    overwrite_output_dir=True,
    logging_dir='logs',
    # Schedule and optimizer.
    num_train_epochs=TRAIN_EPOCHS,
    optim='adamw_torch',
    warmup_steps=10,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    # Evaluate once per epoch; checkpoint and log on a step interval.
    evaluation_strategy='epoch',
    save_strategy='steps',
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    logging_steps=LOGGING_STEPS,
)
training_args = TrainingArguments(**training_kwargs)

##### Kick-off training
Note: Because we are finetuning on a small dataset, the model overfits.

In [13]:
# Wire the model, hyperparameters, tokenizer and the two tokenized splits into
# a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
)

# Run finetuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response. If response are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1208
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 302


[2023-02-08 00:06:03.763: W smdistributed/modelparallel/torch/nn/predefined_hooks.py:47] Found unsupported HuggingFace version 4.18.0 for automated tensor parallelism. HuggingFace modules will not be automatically distributed. You can use smp.tp_register_with_module API to register desired modules for tensor parallelism, or directly instantiate an smp.nn.DistributedModule. Supported HuggingFace transformers versions for automated tensor parallelism: ['4.16.2']
[2023-02-08 00:06:03.806 pytorch-1-10-gpu-p-ml-g4dn-2xlarge-0431c88e252693110a51644c6a08:16617 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2023-02-08 00:06:03.938 pytorch-1-10-gpu-p-ml-g4dn-2xlarge-0431c88e252693110a51644c6a08:16617 INFO profiler_config_parser.py:111] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.




Epoch,Training Loss,Validation Loss
1,0.6958,0.672331
2,0.659,0.698973


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response. If response are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response. If response are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 135
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=302, training_loss=0.6825174053773185, metrics={'train_runtime': 67.1975, 'train_samples_per_second': 35.954, 'train_steps_per_second': 4.494, 'total_flos': 164982965041440.0, 'train_loss': 0.6825174053773185, 'epoch': 2.0})

#### Save model

In [14]:
# Persist the finetuned model (and the tokenizer given to the Trainer) to ./model.
trainer.save_model(output_dir='./model')

Saving model checkpoint to ./model
Configuration saved in ./model/config.json
Model weights saved in ./model/pytorch_model.bin
tokenizer config file saved in ./model/tokenizer_config.json
Special tokens file saved in ./model/special_tokens_map.json


#### Create a classification pipeline using the saved model

In [15]:
# Build an inference pipeline from the directory the model was saved to.
# return_all_scores=True yields a score for every label, not just the top one.
clf = pipeline(
    task='sentiment-analysis',
    model='./model',
    return_all_scores=True,
)

loading configuration file ./model/config.json
Model config BertConfig {
  "_name_or_path": "./model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file ./model/config.json
Model config BertConfig {
  "_name_or_path": "./model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "clas

In [16]:
# Score a full-sentence example with the finetuned classifier.
sample_text = 'yes. however if you have a medical condition that precludes your ability to work from an office setting or facility (such as asthma), please consult with local health officials'
prediction = clf(sample_text)
logger.info(prediction)

[[{'label': 'LABEL_0', 'score': 0.4401438236236572}, {'label': 'LABEL_1', 'score': 0.5598561763763428}]]
INFO:sagemaker:[[{'label': 'LABEL_0', 'score': 0.4401438236236572}, {'label': 'LABEL_1', 'score': 0.5598561763763428}]]


In [17]:
# Score a short, cut-off example with the finetuned classifier.
sample_text = 'the state of emergency declared in s'
prediction = clf(sample_text)
logger.info(prediction)

[[{'label': 'LABEL_0', 'score': 0.8407505750656128}, {'label': 'LABEL_1', 'score': 0.1592494398355484}]]
INFO:sagemaker:[[{'label': 'LABEL_0', 'score': 0.8407505750656128}, {'label': 'LABEL_1', 'score': 0.1592494398355484}]]
