Summary:

1. prepare datasets:
	- transformers datasets
	- load_dataset
2. tokenize raw_dataset
	- transformers AutoTokenizer.from_pretrained (bert-base-cased)
	- define tokenize map function
	- declare train, eval data
3. model
	- AutoModelForSequenceClassification (bert-base-cased)
	- training arguments using TrainingArguments from transformers
	- using numpy define compute_metrics function for evaluation of our model
	- create a "trainer" = Trainer with specific model, arguments, train & eval data


Finally run trainer.train() to train/fine-tune our model


In [None]:
#preparing datasets
!pip install transformers datasets


from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

raw_datasets = load_dataset("imdb")
#The raw_datasets object is a dictionary with three keys:
# "train", "test" and "unsupervised"


Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 29.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 37.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 49.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


In [None]:
# For our purposes we will use BERT: load the tokenizer published with the
# "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tk_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

    Padding is set to "max_length" and truncation is enabled. The function is
    meant to be passed to ``map`` (with ``batched=True``) on the raw datasets.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Run the tokenizer over the raw datasets in batches.
tokenized_datasets = raw_datasets.map(tk_function, batched=True)

# Full splits, kept around for training/evaluating on all of the data.
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

# 1000-example subsets, shuffled with a fixed seed so the selection is
# repeatable; these keep fine-tuning quick.
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))



Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-f5a6b3beb96b51f2.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-36d65c52ae052ea6.arrow
Loading cached shuffled indices for dataset at /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a/cache-c874cffb433197ef.arrow


In [None]:
#Fine-tuning in PyTorch with the Trainer API

#define our model from pretrained Transformer models
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels = 2)

#instantiate TrainingArguments/hyperparameters
#evaluation_strategy="epoch" enables regular evaluations of 'trainer'
t_args = TrainingArguments("test_trainer", evaluation_strategy="epoch")

# since there is not evaluation of how our model preformed by default in our "Trainer",
# we are going to manually compute these matrics

!pip install np
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook's ``metric`` object.

    ``eval_pred`` is unpacked into a (logits, labels) pair. The prediction for
    each example is the index of the largest logit along the last axis, and
    the result of ``metric.compute`` on predictions vs. labels is returned.
    """
    scores, gold_labels = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

# Create the trainer from the model, the training arguments, the small
# train/eval subsets and the metric callback.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
    args=t_args,
    model=model,
)

# Fine-tune the model on the small training subset...
trainer.train()
# ...then evaluate it on the small evaluation subset.
trainer.evaluate()


NameError: ignored