In [1]:
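# Uncomment to install the dependencies into the kernel's environment.
# (scikit-learn is published on PyPI as `scikit-learn`; `sklearn` is only a deprecated alias.)
#%pip install nlp
#%pip install scikit-learn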

In [1]:
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Training and fine-tuning

Model classes in 🤗 Transformers are designed to be compatible with native PyTorch and TensorFlow 2 and can be used seamlessly with either. We will show how to use our included `Trainer()` class, which handles much of the complexity of training for you.

What's included:

* How to use `Trainer()` class which handles much of the complexity of training for you. Based on [this link](https://huggingface.co/transformers/training.html#trainer)


What's NOT included:

* Training using native TensorFlow or PyTorch.
* How to use fast.ai to fine-tune transformers. Based on [this tutorial](https://docs.fast.ai/tutorial.transformers). I initially wanted to give it a try, but it seems a bit ad hoc.

## 1. HuggingFace Trainer
We provide a simple but feature-complete training and evaluation interface through `Trainer()` and `TFTrainer()`. You can train, fine-tune, and evaluate any 🤗 Transformers model with a wide range of training options and with built-in features like logging, gradient accumulation, and mixed precision.

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

# Instantiate the tokenizer from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [3]:
# Load the pretrained "bert-base-uncased" checkpoint into a sequence-classification model.
# The library reports (see the output below) that some checkpoint weights are unused and
# that some weights of BertForSequenceClassification were not initialized from the checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Load the IMDB dataset and unpack the requested 'train' and 'test' splits.
train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

### Option 1: Preprocess with nlp library

There are two main advantages:

1. Using the map function is fast
2. The preprocessed dataset is cached, so it will be lightning fast when `map` is called a second time.

In [7]:
# The dataset is nothing magical: each record is a dict with an integer 'label'
# and the raw review 'text' (see the output below).
test_dataset[0]

{'label': 1,
 'text': "I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge."}

In [8]:
def tokenize(batch):
    """Tokenize one batch of reviews; intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping whose ``'text'`` entry holds the raw review strings.

    Returns:
        The tokenizer output for ``batch['text']``, truncated and padded to the
        model's maximum input length.
    """
    # padding=True pads only to the longest sequence within a single call, so
    # separate map() batches could end up with different widths and fail to
    # stack into one training batch. 'max_length' (with no explicit max_length)
    # pads every example to the model's maximum input length instead.
    return tokenizer(batch['text'], padding='max_length', truncation=True)

# Apply tokenize() to both splits, 1024 examples per call. The names are rebound
# to the datasets returned by map().
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1024)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1024)

In [9]:
# Switch both datasets to the 'torch' output format, restricted to the three
# columns used for training/evaluation.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

### Option 2: Preprocess manually

In [5]:
# Split each dataset into two parallel tuples: the review texts and their labels.
# NOTE(review): this reads doc['text'], so it presumably has to run on the freshly
# loaded datasets rather than after Option 1's set_format(), which restricts the
# returned columns -- confirm before running both options in one session.
train_texts, train_labels = zip(*[(doc['text'], doc['label']) for doc in train_dataset])
test_texts, test_labels = zip(*[(doc['text'], doc['label']) for doc in test_dataset])

In [6]:
%%time
# Tokenize every review of both splits in one call each, with truncation and
# padding enabled. return_tensors="pt" asks for PyTorch tensors.
# The recorded timing below shows this cell took about four minutes.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")

CPU times: user 4min 11s, sys: 663 ms, total: 4min 11s
Wall time: 4min 12s


In [7]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container. Each container may be a tensor (tokenizer called with
            ``return_tensors="pt"``) or a nested list.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that owns its own copy of the data."""
        # Calling torch.tensor() on an existing tensor emits a UserWarning that
        # recommends clone().detach(); use that form for tensor inputs and keep
        # torch.tensor() for plain Python values.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one entry per encoding feature plus a ``'labels'`` entry.
        """
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

# Wrap the pre-computed encodings and labels in IMDbDataset instances.
# Note: this rebinds train_dataset/test_dataset, replacing the datasets they
# previously referred to.
train_dataset = IMDbDataset(train_encodings, train_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

### Training

In [10]:
def compute_metrics(pred):
    """Score a set of predictions as a binary classification task.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (an array whose argmax over the last axis is
            taken as the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element of the result is not used here.
    prec, rec, f_score, _unused = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    accuracy = accuracy_score(y_true, y_pred)

    return dict(accuracy=accuracy, f1=f_score, precision=prec, recall=rec)

# Training configuration shared by the Trainer instances below.
training_args = TrainingArguments(
    # Where checkpoints/results and logs are written.
    output_dir='/storage/yiping-transformer/results',
    logging_dir='/storage/yiping-transformer/logs',
    # Batch sizes, per device.
    per_device_train_batch_size=21,
    per_device_eval_batch_size=64,
    # Schedule and regularisation: one epoch, 500 learning-rate warmup steps.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed-precision training.
    fp16=True,
    # NOTE(review): newer transformers releases may have replaced this flag
    # (e.g. with `evaluation_strategy`) -- confirm against the installed version.
    evaluate_during_training=True,
)

# Trainer used for fine-tuning: it gets both the training and the evaluation
# data, and reports the metrics computed by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Persist the model to disk; the evaluation section reloads it from this directory.
model.save_pretrained('/storage/yiping-transformer/bert-base-imdb-finetuned')

### Evaluate the model

In [8]:
# Load the fine-tuned model back from the directory it was saved to, rebinding
# `model` so the evaluation below uses the saved weights.
model = BertForSequenceClassification.from_pretrained("/storage/yiping-transformer/bert-base-imdb-finetuned/")

In [13]:
# Evaluation-only Trainer: it wraps the reloaded model and is given no
# training dataset, only the data to evaluate on.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [14]:
# Evaluate on eval_dataset. The output below shows the loss and the
# compute_metrics entries, each under an 'eval_' prefix.
trainer.evaluate()

  import sys


{'eval_loss': 0.17838019132614136,
 'eval_accuracy': 0.93816,
 'eval_f1': 0.938347423831552,
 'eval_precision': 0.9355120865139949,
 'eval_recall': 0.9412}