# Multiclass Classification With Hugging Face Transformers

- Split pandas dataframe into train and test
- Convert to HF datasets
- Load a pretrained model
- Defining the performance metrics
- Define training hyperparameters
- Create Trainer
- Get the predictions on the test dataset
- Save trained model
- Load a saved model in local for prediction
- Plot confusion matrix for model performance evaluation
- Error Analysis: sort the validation samples by the model loss: highest and lowest loss

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

path = '/content/drive/MyDrive/career/data scientist/recommender system/datasets/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd


In [4]:
df = pd.read_csv(path+"bbc-text-NLP-multiclass.csv")

In [5]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


# Convert category into numerical label

In [6]:
df.category = pd.Categorical(df.category)
df['label'] = df.category.cat.codes
df.head()

Unnamed: 0,category,text,label
0,tech,tv future in the hands of viewers with home th...,4
1,business,worldcom boss left books alone former worldc...,0
2,sport,tigers wary of farrell gamble leicester say ...,3
3,sport,yeading face newcastle in fa cup premiership s...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   category  2225 non-null   category
 1   text      2225 non-null   object  
 2   label     2225 non-null   int8    
dtypes: category(1), int8(1), object(1)
memory usage: 22.1+ KB


In [8]:
df = df[['text', 'label']]

# Split into pandas train and test dataframes

In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [10]:
train = train.reset_index(drop=True)
train

Unnamed: 0,text,label
0,wilkinson fit to face edinburgh england captai...,3
1,bush website blocked outside us surfers outsid...,4
2,china aviation seeks rescue deal scandal-hit j...,0
3,fiat chief takes steering wheel the chief exec...,0
4,brown proud of economy record gordon brown h...,2
...,...,...
1775,csi shows give unrealistic view people have ...,1
1776,telewest to challenge sky plus cable firm tele...,4
1777,us to raise tv indecency fines us politician...,1
1778,venezuela and china sign oil deal venezuelan p...,0


In [11]:
test = test.reset_index(drop=True)
test

Unnamed: 0,text,label
0,duran duran show set for us tv chart stars dur...,1
1,hague s six-figure earnings shown the rewards ...,2
2,dame julie pops in to see poppins mary poppins...,1
3,robotic pods take on car design a new breed of...,4
4,vera drake s bafta triumph hope at the bafta f...,1
...,...,...
440,glaxo aims high after profit fall glaxosmithkl...,0
441,latin america sees strong growth latin america...,0
442,steel firm to cut 45 000 jobs mittal steel ...,0
443,last star wars not for children the sixth an...,1


# Convert from pandas dataframe to HF dataset

In [12]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train)
test_ds = Dataset.from_pandas(test)

In [13]:
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 1780
})

In [14]:
train_ds[:1]

{'label': [3],
 'text': ['wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday.  wilkinson  who has not played since injuring his bicep on 17 october  took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said:  he s fine and we hope to get him into the game at some stage.  the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada  south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world c

In [15]:
print(train_ds.features)

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int8', id=None)}


# Tokenize the Whole Dataset: Train dataset and test dataset to have 2 more cols: ['category', 'text', 'input_ids', 'attention_mask']

In [16]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [17]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [18]:
print((tokenize(train_ds[:2])))

{'input_ids': [[101, 16237, 4906, 2000, 2227, 5928, 2563, 2952, 26937, 16237, 2097, 2191, 2010, 2146, 1011, 19605, 2709, 2013, 4544, 2114, 5928, 2006, 5095, 1012, 16237, 2040, 2038, 2025, 2209, 2144, 22736, 2010, 12170, 3401, 2361, 2006, 2459, 2255, 2165, 2112, 1999, 2440, 1011, 3967, 2731, 2007, 8142, 14929, 2006, 9317, 1012, 1998, 1996, 2423, 1011, 2095, 1011, 2214, 4875, 1011, 2431, 2097, 2707, 5095, 1055, 2002, 3170, 7520, 2452, 2674, 2012, 6264, 3790, 2006, 1996, 6847, 1012, 2021, 8142, 2472, 1997, 4043, 6487, 4080, 2056, 1024, 2002, 1055, 2986, 1998, 2057, 3246, 2000, 2131, 2032, 2046, 1996, 2208, 2012, 2070, 2754, 1012, 1996, 2423, 1011, 2095, 1011, 2214, 4771, 2563, 1055, 7114, 27340, 2044, 12943, 17643, 26477, 1996, 5292, 14545, 20389, 2050, 1999, 2010, 3356, 2157, 2849, 2114, 7354, 19023, 1012, 2002, 2001, 3525, 2999, 2004, 2563, 2952, 2011, 2440, 1011, 2067, 4463, 6157, 1012, 5096, 1055, 4918, 26107, 2165, 2058, 1996, 2193, 2184, 3797, 1999, 1996, 27340, 2114, 2710, 2148, 30

In [19]:
train_encoded = train_ds.map(tokenize, batched=True, batch_size=None)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
print(train_encoded.column_names)

['text', 'label', 'input_ids', 'attention_mask']


In [21]:
test_encoded = test_ds.map(tokenize, batched=True, batch_size=None)
print(test_encoded.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

['text', 'label', 'input_ids', 'attention_mask']


# Fine-Tuning Transformers
* Load a pretrained model
* Defining the performance metrics
* Define training hyperparameters
* Create Trainer
* Get the predictions on the test dataset
* Save trained model
* Load a saved model in local for prediction
* Plot confusion matrix for model performance evaluation
* Error Analysis: sort the validation samples by the model loss: highest and lowest loss

## Load a pretrained model

In [22]:
from transformers import AutoModelForSequenceClassification

num_labels = 5
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

## Defining the performance metrics

In [23]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

## Define training hyperparameters

In [24]:
from transformers import Trainer, TrainingArguments

batch_size = 32 # 64
num_epochs = 1
logging_steps = len(train_encoded) 

model_name = f"{model_ckpt}-finetuned-classification"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=num_epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  log_level="error")

## Create Trainer

In [25]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=train_encoded,
                  eval_dataset=test_encoded,
                  tokenizer=tokenizer)

## Empty cuda memory if necessary

In [26]:
# import torch
# torch.cuda.empty_cache()

In [27]:
# !pip install GPUtil

# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# free_gpu_cache() 

In [28]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33malinaz[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.707845,0.946067,0.946144


TrainOutput(global_step=56, training_loss=1.0569466182163783, metrics={'train_runtime': 101.7821, 'train_samples_per_second': 17.488, 'train_steps_per_second': 0.55, 'total_flos': 235804584652800.0, 'train_loss': 1.0569466182163783, 'epoch': 1.0})

## Get the predictions on the validation set

In [29]:
preds_output = trainer.predict(test_encoded)

In [30]:
preds_output.metrics

{'test_accuracy': 0.946067415730337,
 'test_f1': 0.9461443547662537,
 'test_loss': 0.7078450322151184,
 'test_runtime': 8.0431,
 'test_samples_per_second': 55.327,
 'test_steps_per_second': 1.741}

## Grid Search


## customer callback for training performance metrics

In [31]:
from transformers import TrainerCallback
import copy

class CustomCallback(TrainerCallback):
    
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = copy.deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            print("training performance metrics: ", control_copy)

            return control_copy

In [32]:
class PrinterCallback(TrainerCallback):
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            print(logs)

        return logs

In [33]:
## hyper-parameters
## top 3 feature importance are: learning_rate > weight_decay > per_gpu_batch_size
# {
#   "per_gpu_batch_size": (16, 64),
#   "weight_decay": (0, 0.3),
#   "learning_rate": (1e-5, 5e-5),
#   "warmup_steps": (0, 500),
#   "num_epochs": (2, 5)
#   "batch_size": (8, 16, 32, 64)
# }

grid = {
  "weight_decay": (0.01, 0.3),
  "learning_rate": (1e-5, 5e-5),
  "batch_size": (8, 16)
}

In [34]:
grid_performance_list = []

for wd in grid["weight_decay"]:
  for lr in grid["learning_rate"]:
    for bs in grid["batch_size"]:
      print("weight_decay: ",wd, "\n learning_rate: ",lr,"\n batch_size: ",bs)
      training_args = TrainingArguments(output_dir=model_name,
                                        num_train_epochs=2,
                                        learning_rate=lr,
                                        per_device_train_batch_size=bs,
                                        per_device_eval_batch_size=bs,
                                        weight_decay=wd,
                                        evaluation_strategy="epoch",
                                        disable_tqdm=False,
                                        log_level="error")
      
      trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=train_encoded,
                  eval_dataset=test_encoded,
                  tokenizer=tokenizer)
      
      #trainer.add_callback(PrinterCallback(trainer)) 

      trainer.train()


      preds_output = trainer.predict(test_encoded)
      grid_performance = preds_output.metrics
      grid_performance["weight_decay"] = wd
      grid_performance["learning_rate"] = lr
      grid_performance["batch_size"] = bs
      print(grid_performance)

      grid_performance_list.append(grid_performance)

print(grid_performance_list)
    

    

weight_decay:  0.01 
 learning_rate:  1e-05 
 batch_size:  8




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.131855,0.968539,0.968477
2,No log,0.100989,0.970787,0.970692


{'test_loss': 0.10098914802074432, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.9706920472312108, 'test_runtime': 8.0817, 'test_samples_per_second': 55.062, 'test_steps_per_second': 6.929, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 8}
weight_decay:  0.01 
 learning_rate:  1e-05 
 batch_size:  16




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.076817,0.979775,0.979692
2,No log,0.082891,0.977528,0.977452


{'test_loss': 0.08289101719856262, 'test_accuracy': 0.9775280898876404, 'test_f1': 0.9774516559900344, 'test_runtime': 8.0565, 'test_samples_per_second': 55.235, 'test_steps_per_second': 3.475, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 16}
weight_decay:  0.01 
 learning_rate:  5e-05 
 batch_size:  8




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.186033,0.964045,0.964088
2,No log,0.188054,0.970787,0.970773


{'test_loss': 0.18805433809757233, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.970773379330725, 'test_runtime': 8.037, 'test_samples_per_second': 55.369, 'test_steps_per_second': 6.968, 'weight_decay': 0.01, 'learning_rate': 5e-05, 'batch_size': 8}
weight_decay:  0.01 
 learning_rate:  5e-05 
 batch_size:  16




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.251433,0.961798,0.961836
2,No log,0.186674,0.966292,0.966211


{'test_loss': 0.1866736263036728, 'test_accuracy': 0.9662921348314607, 'test_f1': 0.9662106759805906, 'test_runtime': 8.0646, 'test_samples_per_second': 55.179, 'test_steps_per_second': 3.472, 'weight_decay': 0.01, 'learning_rate': 5e-05, 'batch_size': 16}
weight_decay:  0.3 
 learning_rate:  1e-05 
 batch_size:  8




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.199953,0.968539,0.968361
2,No log,0.223605,0.975281,0.975165


{'test_loss': 0.22360503673553467, 'test_accuracy': 0.9752808988764045, 'test_f1': 0.9751650778687984, 'test_runtime': 8.0485, 'test_samples_per_second': 55.29, 'test_steps_per_second': 6.958, 'weight_decay': 0.3, 'learning_rate': 1e-05, 'batch_size': 8}
weight_decay:  0.3 
 learning_rate:  1e-05 
 batch_size:  16




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.281537,0.968539,0.968357
2,No log,0.266782,0.975281,0.975165


{'test_loss': 0.26678231358528137, 'test_accuracy': 0.9752808988764045, 'test_f1': 0.9751650778687984, 'test_runtime': 8.0372, 'test_samples_per_second': 55.368, 'test_steps_per_second': 3.484, 'weight_decay': 0.3, 'learning_rate': 1e-05, 'batch_size': 16}
weight_decay:  0.3 
 learning_rate:  5e-05 
 batch_size:  8




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.428569,0.957303,0.957325
2,No log,0.267938,0.968539,0.968523


{'test_loss': 0.267937570810318, 'test_accuracy': 0.9685393258426966, 'test_f1': 0.968522736505308, 'test_runtime': 8.0472, 'test_samples_per_second': 55.299, 'test_steps_per_second': 6.959, 'weight_decay': 0.3, 'learning_rate': 5e-05, 'batch_size': 8}
weight_decay:  0.3 
 learning_rate:  5e-05 
 batch_size:  16




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.282393,0.955056,0.955053
2,No log,0.28662,0.966292,0.966232


{'test_loss': 0.28661954402923584, 'test_accuracy': 0.9662921348314607, 'test_f1': 0.9662319815307369, 'test_runtime': 8.04, 'test_samples_per_second': 55.349, 'test_steps_per_second': 3.483, 'weight_decay': 0.3, 'learning_rate': 5e-05, 'batch_size': 16}
[{'test_loss': 0.10098914802074432, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.9706920472312108, 'test_runtime': 8.0817, 'test_samples_per_second': 55.062, 'test_steps_per_second': 6.929, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 8}, {'test_loss': 0.08289101719856262, 'test_accuracy': 0.9775280898876404, 'test_f1': 0.9774516559900344, 'test_runtime': 8.0565, 'test_samples_per_second': 55.235, 'test_steps_per_second': 3.475, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 16}, {'test_loss': 0.18805433809757233, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.970773379330725, 'test_runtime': 8.037, 'test_samples_per_second': 55.369, 'test_steps_per_second': 6.968, 'weight_decay': 0.01, 'learning_rat

In [35]:
print(grid_performance_list)

[{'test_loss': 0.10098914802074432, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.9706920472312108, 'test_runtime': 8.0817, 'test_samples_per_second': 55.062, 'test_steps_per_second': 6.929, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 8}, {'test_loss': 0.08289101719856262, 'test_accuracy': 0.9775280898876404, 'test_f1': 0.9774516559900344, 'test_runtime': 8.0565, 'test_samples_per_second': 55.235, 'test_steps_per_second': 3.475, 'weight_decay': 0.01, 'learning_rate': 1e-05, 'batch_size': 16}, {'test_loss': 0.18805433809757233, 'test_accuracy': 0.9707865168539326, 'test_f1': 0.970773379330725, 'test_runtime': 8.037, 'test_samples_per_second': 55.369, 'test_steps_per_second': 6.968, 'weight_decay': 0.01, 'learning_rate': 5e-05, 'batch_size': 8}, {'test_loss': 0.1866736263036728, 'test_accuracy': 0.9662921348314607, 'test_f1': 0.9662106759805906, 'test_runtime': 8.0646, 'test_samples_per_second': 55.179, 'test_steps_per_second': 3.472, 'weight_decay': 0.01, 'learning_r

In [36]:
grid_df = pd.DataFrame(grid_performance_list)
grid_df

Unnamed: 0,test_loss,test_accuracy,test_f1,test_runtime,test_samples_per_second,test_steps_per_second,weight_decay,learning_rate,batch_size
0,0.100989,0.970787,0.970692,8.0817,55.062,6.929,0.01,1e-05,8
1,0.082891,0.977528,0.977452,8.0565,55.235,3.475,0.01,1e-05,16
2,0.188054,0.970787,0.970773,8.037,55.369,6.968,0.01,5e-05,8
3,0.186674,0.966292,0.966211,8.0646,55.179,3.472,0.01,5e-05,16
4,0.223605,0.975281,0.975165,8.0485,55.29,6.958,0.3,1e-05,8
5,0.266782,0.975281,0.975165,8.0372,55.368,3.484,0.3,1e-05,16
6,0.267938,0.968539,0.968523,8.0472,55.299,6.959,0.3,5e-05,8
7,0.28662,0.966292,0.966232,8.04,55.349,3.483,0.3,5e-05,16


## Check raw predictions

In [37]:
import numpy as np
import matplotlib.pyplot as plt

y_preds = np.argmax(preds_output.predictions, axis=1)

In [38]:
y_preds

array([1, 2, 1, 4, 1, 4, 1, 3, 2, 1, 4, 4, 0, 3, 3, 3, 2, 2, 0, 3, 3, 1,
       0, 4, 3, 3, 1, 3, 0, 3, 1, 4, 3, 4, 3, 4, 0, 0, 2, 3, 4, 4, 2, 3,
       4, 4, 0, 4, 3, 0, 0, 0, 1, 0, 3, 2, 1, 0, 3, 0, 4, 0, 1, 3, 1, 2,
       0, 2, 1, 4, 3, 4, 1, 3, 0, 4, 0, 1, 0, 3, 1, 1, 0, 4, 0, 2, 3, 1,
       2, 1, 0, 2, 2, 3, 1, 4, 0, 3, 2, 4, 2, 2, 1, 4, 1, 2, 4, 2, 3, 3,
       4, 1, 0, 2, 1, 1, 4, 2, 1, 0, 4, 3, 1, 3, 2, 4, 0, 2, 1, 1, 2, 0,
       3, 1, 0, 1, 2, 1, 0, 2, 0, 0, 1, 0, 4, 0, 3, 2, 3, 1, 0, 1, 3, 2,
       3, 0, 3, 2, 0, 0, 3, 3, 0, 4, 1, 3, 4, 0, 3, 1, 3, 0, 3, 0, 2, 2,
       2, 0, 0, 3, 2, 3, 0, 0, 2, 3, 3, 4, 4, 1, 0, 2, 0, 4, 1, 3, 2, 3,
       3, 2, 2, 1, 2, 3, 2, 0, 4, 1, 1, 0, 1, 1, 1, 2, 2, 3, 2, 4, 0, 1,
       3, 2, 4, 4, 3, 4, 1, 1, 2, 1, 2, 4, 2, 3, 4, 2, 0, 1, 4, 2, 2, 0,
       4, 0, 0, 2, 3, 4, 4, 2, 3, 0, 2, 3, 4, 4, 0, 3, 1, 4, 2, 4, 3, 0,
       2, 1, 0, 1, 3, 0, 4, 0, 2, 0, 0, 3, 1, 4, 3, 1, 2, 2, 1, 2, 4, 3,
       2, 1, 3, 0, 3, 0, 0, 4, 0, 3, 3, 1, 0, 2, 4,

## Save trained model

In [39]:
model_name = "trained_model_bbc_classification"
trainer.save_model(path+model_name)

# Load a saved model in local for prediction

# Using Trainer at inference time: a string or a list of strings


In [40]:
pd.set_option('display.max_colwidth', None)

train[:2]

Unnamed: 0,text,label
0,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday. wilkinson who has not played since injuring his bicep on 17 october took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said: he s fine and we hope to get him into the game at some stage. the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.,3
1,bush website blocked outside us surfers outside the us have been unable to visit the official re-election site of president george w bush. the blocking of browsers sited outside the us began in the early hours of monday morning. since then people outside the us trying to browse the site get a message saying they are not authorised to view it. the blocking does not appear to be due to an attack by vandals or malicious hackers but as a result of a policy decision by the bush camp. the international exclusion zone around georgewbush.com was spotted by net monitoring firm netcraft which keeps an eye on traffic patterns across many different sites. netcraft said that since the early hours of 25 october attempts to view the site through its monitoring stations in london amsterdam and sydney failed. by contrast netcraft s four monitoring stations in the us managed to view the site with no problems. the site can still be seen using anonymous proxy services that are based in the us. some web users in canada also report that they can browse the site. the pattern of traffic to the website suggests that the blocking was not due to an attack by vandals or politically motivated hackers. geographic blocking works because the numerical addresses that the net uses to organise itself are handed out on a regional basis. on 21 october the george w bush website began using the services of a company called akamai to ensure that the pages videos and other content on its site reaches visitors. mike prettejohn president of netcraft speculated that the blocking decision might have been taken to cut costs and traffic in the run-up to the election on 2 november. he said the site may see no reason to distribute content to people who will not be voting next week. managing traffic could also be a good way to ensure that the site stays working in the closing days of the election campaign. however simply blocking non-us visitors also means that americans overseas are barred too. akamai declined to comment saying it could not talk about customer websites.,4


In [41]:
train.iloc[0].text

'wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday.  wilkinson  who has not played since injuring his bicep on 17 october  took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said:  he s fine and we hope to get him into the game at some stage.  the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada  south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.'

In [42]:
text = [train.iloc[0].text, train.iloc[1].text]
encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

model = AutoModelForSequenceClassification.from_pretrained(path+model_name, local_files_only=True)
# forward pass
outputs = model(**encoding)
predictions = outputs.logits.argmax(-1)

In [43]:
predictions

tensor([3, 4])

# Using Trainer at inference time: batch testing


In [44]:
batch_test = train[:2][['text']]
batch_test

Unnamed: 0,text
0,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday. wilkinson who has not played since injuring his bicep on 17 october took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said: he s fine and we hope to get him into the game at some stage. the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.
1,bush website blocked outside us surfers outside the us have been unable to visit the official re-election site of president george w bush. the blocking of browsers sited outside the us began in the early hours of monday morning. since then people outside the us trying to browse the site get a message saying they are not authorised to view it. the blocking does not appear to be due to an attack by vandals or malicious hackers but as a result of a policy decision by the bush camp. the international exclusion zone around georgewbush.com was spotted by net monitoring firm netcraft which keeps an eye on traffic patterns across many different sites. netcraft said that since the early hours of 25 october attempts to view the site through its monitoring stations in london amsterdam and sydney failed. by contrast netcraft s four monitoring stations in the us managed to view the site with no problems. the site can still be seen using anonymous proxy services that are based in the us. some web users in canada also report that they can browse the site. the pattern of traffic to the website suggests that the blocking was not due to an attack by vandals or politically motivated hackers. geographic blocking works because the numerical addresses that the net uses to organise itself are handed out on a regional basis. on 21 october the george w bush website began using the services of a company called akamai to ensure that the pages videos and other content on its site reaches visitors. mike prettejohn president of netcraft speculated that the blocking decision might have been taken to cut costs and traffic in the run-up to the election on 2 november. he said the site may see no reason to distribute content to people who will not be voting next week. managing traffic could also be a good way to ensure that the site stays working in the closing days of the election campaign. however simply blocking non-us visitors also means that americans overseas are barred too. akamai declined to comment saying it could not talk about customer websites.


In [45]:
batch_test_dataset = Dataset.from_pandas(batch_test)

In [46]:
batch_test

Unnamed: 0,text
0,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday. wilkinson who has not played since injuring his bicep on 17 october took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said: he s fine and we hope to get him into the game at some stage. the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.
1,bush website blocked outside us surfers outside the us have been unable to visit the official re-election site of president george w bush. the blocking of browsers sited outside the us began in the early hours of monday morning. since then people outside the us trying to browse the site get a message saying they are not authorised to view it. the blocking does not appear to be due to an attack by vandals or malicious hackers but as a result of a policy decision by the bush camp. the international exclusion zone around georgewbush.com was spotted by net monitoring firm netcraft which keeps an eye on traffic patterns across many different sites. netcraft said that since the early hours of 25 october attempts to view the site through its monitoring stations in london amsterdam and sydney failed. by contrast netcraft s four monitoring stations in the us managed to view the site with no problems. the site can still be seen using anonymous proxy services that are based in the us. some web users in canada also report that they can browse the site. the pattern of traffic to the website suggests that the blocking was not due to an attack by vandals or politically motivated hackers. geographic blocking works because the numerical addresses that the net uses to organise itself are handed out on a regional basis. on 21 october the george w bush website began using the services of a company called akamai to ensure that the pages videos and other content on its site reaches visitors. mike prettejohn president of netcraft speculated that the blocking decision might have been taken to cut costs and traffic in the run-up to the election on 2 november. he said the site may see no reason to distribute content to people who will not be voting next week. managing traffic could also be a good way to ensure that the site stays working in the closing days of the election campaign. however simply blocking non-us visitors also means that americans overseas are barred too. akamai declined to comment saying it could not talk about customer websites.


In [47]:
batch_test_encoded = batch_test_dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [48]:
# loading the model you previously trained
trained_model = AutoModelForSequenceClassification.from_pretrained(path+model_name, local_files_only=True)

# arguments for Trainer
test_args = TrainingArguments(
    output_dir = path,
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = batch_size,   
    dataloader_drop_last = False    
)

# init trainer
trainer = Trainer(
              model = trained_model, tokenizer=tokenizer,
              args = test_args, 
              compute_metrics = compute_metrics)

test_results = trainer.predict(batch_test_encoded)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2
  Batch size = 32


In [49]:
test_results

PredictionOutput(predictions=array([[-5.7508054, -5.3646507, -6.2657046, 12.270015 , -6.5854034],
       [-4.7953615, -4.1564517, -3.904071 , -6.9880385,  9.376428 ]],
      dtype=float32), label_ids=None, metrics={'test_runtime': 0.0528, 'test_samples_per_second': 37.883, 'test_steps_per_second': 18.941})

In [50]:
print(test_results.predictions)

[[-5.7508054 -5.3646507 -6.2657046 12.270015  -6.5854034]
 [-4.7953615 -4.1564517 -3.904071  -6.9880385  9.376428 ]]


In [51]:
test_y_preds = np.argmax(test_results.predictions, axis=1)

In [52]:
test_y_preds

array([3, 4])

In [53]:
batch_test['predicted'] = test_y_preds

batch_test

Unnamed: 0,text,predicted
0,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday. wilkinson who has not played since injuring his bicep on 17 october took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said: he s fine and we hope to get him into the game at some stage. the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.,3
1,bush website blocked outside us surfers outside the us have been unable to visit the official re-election site of president george w bush. the blocking of browsers sited outside the us began in the early hours of monday morning. since then people outside the us trying to browse the site get a message saying they are not authorised to view it. the blocking does not appear to be due to an attack by vandals or malicious hackers but as a result of a policy decision by the bush camp. the international exclusion zone around georgewbush.com was spotted by net monitoring firm netcraft which keeps an eye on traffic patterns across many different sites. netcraft said that since the early hours of 25 october attempts to view the site through its monitoring stations in london amsterdam and sydney failed. by contrast netcraft s four monitoring stations in the us managed to view the site with no problems. the site can still be seen using anonymous proxy services that are based in the us. some web users in canada also report that they can browse the site. the pattern of traffic to the website suggests that the blocking was not due to an attack by vandals or politically motivated hackers. geographic blocking works because the numerical addresses that the net uses to organise itself are handed out on a regional basis. on 21 october the george w bush website began using the services of a company called akamai to ensure that the pages videos and other content on its site reaches visitors. mike prettejohn president of netcraft speculated that the blocking decision might have been taken to cut costs and traffic in the run-up to the election on 2 november. he said the site may see no reason to distribute content to people who will not be voting next week. managing traffic could also be a good way to ensure that the site stays working in the closing days of the election campaign. however simply blocking non-us visitors also means that americans overseas are barred too. akamai declined to comment saying it could not talk about customer websites.,4


In [54]:
train[:2]

Unnamed: 0,text,label
0,wilkinson fit to face edinburgh england captain jonny wilkinson will make his long-awaited return from injury against edinburgh on saturday. wilkinson who has not played since injuring his bicep on 17 october took part in full-contact training with newcastle falcons on wednesday. and the 25-year-old fly-half will start saturday s heineken cup match at murrayfield on the bench. but newcastle director of rugby rob andrew said: he s fine and we hope to get him into the game at some stage. the 25-year-old missed england s autumn internationals after aggravating the haematoma in his upper right arm against saracens. he was subsequently replaced as england captain by full-back jason robinson. sale s charlie hodgson took over the number 10 shirt in the internationals against canada south africa and australia. wilkinson s year has been disrupted by injury as his muscle problem followed eight months on the sidelines with a shoulder injury sustained in the world cup final.,3
1,bush website blocked outside us surfers outside the us have been unable to visit the official re-election site of president george w bush. the blocking of browsers sited outside the us began in the early hours of monday morning. since then people outside the us trying to browse the site get a message saying they are not authorised to view it. the blocking does not appear to be due to an attack by vandals or malicious hackers but as a result of a policy decision by the bush camp. the international exclusion zone around georgewbush.com was spotted by net monitoring firm netcraft which keeps an eye on traffic patterns across many different sites. netcraft said that since the early hours of 25 october attempts to view the site through its monitoring stations in london amsterdam and sydney failed. by contrast netcraft s four monitoring stations in the us managed to view the site with no problems. the site can still be seen using anonymous proxy services that are based in the us. some web users in canada also report that they can browse the site. the pattern of traffic to the website suggests that the blocking was not due to an attack by vandals or politically motivated hackers. geographic blocking works because the numerical addresses that the net uses to organise itself are handed out on a regional basis. on 21 october the george w bush website began using the services of a company called akamai to ensure that the pages videos and other content on its site reaches visitors. mike prettejohn president of netcraft speculated that the blocking decision might have been taken to cut costs and traffic in the run-up to the election on 2 november. he said the site may see no reason to distribute content to people who will not be voting next week. managing traffic could also be a good way to ensure that the site stays working in the closing days of the election campaign. however simply blocking non-us visitors also means that americans overseas are barred too. akamai declined to comment saying it could not talk about customer websites.,4


# Hyper-parameter Optimization

Bayesian Optimization and Population Based Training


Instead of grid search, we fine-tune BERT using more advanced search algorithms like Bayesian Optimization and Population Based Training. As a result, we can

- gain a better understanding of our hyperparameters and
- train a model with 5% better accuracy in the same amount of time.d

## hyper-parameters
## top 3 feature importance are: learning_rate > weight_decay > per_gpu_batch_size
{
  "per_gpu_batch_size": (16, 64),
  "weight_decay": (0, 0.3),
  "learning_rate": (1e-5, 5e-5),
  "warmup_steps": (0, 500),
  "num_epochs": (2, 5)
}

!pip install ray==0.8.7
!pip install ray[tune]

!pip install wandb

import ray

# If running on a cluster uncomment use the line below instead 
# ray.init(address="auto", log_to_driver=False)

ray.shutdown()
ray.init(log_to_driver=True, ignore_reinit_error=True)

from ray.tune.examples.pbt_transformers import utils


from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, local_files_only = True)

from transformers import AutoModelForSequenceClassification

num_labels = 5
model_name = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)

from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import GlueDataset

def get_datasets(config):
  data_args = DataTrainingArguments(
        task_name=config["task_name"], data_dir=config["data_dir"])
  tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
  # train_dataset = GlueDataset(
  #     data_args,
  #     tokenizer=tokenizer,
  #     mode="train",
  #     cache_dir=config["data_dir"])
  # eval_dataset = GlueDataset(
  #     data_args,
  #     tokenizer=tokenizer,
  #     mode="dev",
  #     cache_dir=config["data_dir"])
  # # Only use the first half for validation
  # eval_dataset = eval_dataset[:len(eval_dataset) // 2]
  # return train_dataset, eval_dataset

import logging
import os
from typing import Dict, Optional, Tuple

from ray import tune

import transformers
from transformers.file_utils import is_torch_tpu_available
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR #, is_wandb_available
import wandb

import torch
from torch.utils.data import Dataset



class TuneTransformerTrainer(transformers.Trainer):
    def get_optimizers(
            self, num_training_steps
    ):
        self.current_optimizer, self.current_scheduler = super(
        ).get_optimizers(num_training_steps)
        return (self.current_optimizer, self.current_scheduler)

    def evaluate(self,
                 eval_dataset= None):
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        output = self._prediction_loop(
            eval_dataloader, description="Evaluation")
        self._log(output.metrics)

        self.save_state()

        tune.report(**output.metrics)

        return output.metrics

    def save_state(self):
        with tune.checkpoint_dir(step=self.global_step) as checkpoint_dir:
            self.args.output_dir = checkpoint_dir
            # This is the directory name that Huggingface requires.
            output_dir = os.path.join(
                self.args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}")
            self.save_model(output_dir)
            if self.is_world_master():
                torch.save(self.current_optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(self.current_scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))

def recover_checkpoint(tune_checkpoint_dir, model_name=None):
    if tune_checkpoint_dir is None or len(tune_checkpoint_dir) == 0:
        return model_name
    # Get subdirectory used for Huggingface.
    subdirs = [
        os.path.join(tune_checkpoint_dir, name)
        for name in os.listdir(tune_checkpoint_dir)
        if os.path.isdir(os.path.join(tune_checkpoint_dir, name))
    ]
    # There should only be 1 subdir.
    assert len(subdirs) == 1, subdirs
    return subdirs[0]

from transformers import AutoConfig, TrainingArguments, glue_tasks_num_labels
from ray.tune.integration.wandb import wandb_mixin

@wandb_mixin
def train_transformer(config, checkpoint_dir=None):
  #train_dataset, eval_dataset = get_datasets(config)
  train_dataset=train_encoded
  eval_dataset=test_encoded

  get_datasets(config)

  training_args = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        #evaluate_during_training=True,
        evaluation_strategy='epoch',
        # Run eval after every epoch.
        eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) +
        1,
        # We explicitly set save to 0, and do checkpointing in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

  model_name_or_path = recover_checkpoint(checkpoint_dir, config["model_name"])
  num_labels = 5 #glue_tasks_num_labels[config["task_name"]]

  config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        finetuning_task=task_name,
    )
  model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        config=config,
    )
  
  
   
  # Use our modified TuneTransformerTrainer
  tune_trainer = TuneTransformerTrainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      compute_metrics=utils.build_compute_metrics_fn(task_name),
  )
  tune_trainer.train(model_name_or_path)

task_name = "rte"

task_data_dir = os.path.join(path, task_name.upper())

config = {
        # These 3 configs below were defined earlier
        "model_name": "distilbert-base-uncased",
        "task_name": task_name,
        "data_dir": task_data_dir,
        "per_gpu_val_batch_size": 32,
        "per_gpu_train_batch_size": tune.choice([16, 32, 64]),
        "learning_rate": tune.uniform(1e-5, 5e-5),
        "weight_decay": tune.uniform(0.0, 0.3),
        "num_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": -1,  # We use num_epochs instead.
        "wandb": {
            "project": "pbt_transformers",
            "reinit": True,
            "allow_val_change": True,
            "api_key": "6184578f75a3c29f799c66136f1550c90e397078"
        }
    }

from ray.tune.schedulers import PopulationBasedTraining

scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_acc",
        mode="max",
        perturbation_interval=2,
        hyperparam_mutations={
            "weight_decay": lambda: tune.uniform(0.0, 0.3).func(None),
            "learning_rate": lambda: tune.uniform(1e-5, 5e-5).func(None),
            "per_gpu_train_batch_size": [16, 32, 64],
        })

from ray.tune import CLIReporter

reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_gpu_train_batch_size": "train_bs/gpu",
            "num_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_acc", "eval_loss", "epoch", "training_iteration"
        ])

analysis = tune.run(
        train_transformer,
        resources_per_trial={
            "cpu": 1,
            "gpu": 1
        },
        config=config,
        num_samples=3,
        scheduler=scheduler,
        keep_checkpoints_num=3,
        checkpoint_score_attr="training_iteration",
        progress_reporter=reporter,
        local_dir="./ray_results/",
        name="tune_transformer_pbt")