# Import

In [1]:
!pip install -q datasets
!pip install -q transformers

[K     |████████████████████████████████| 290 kB 11.1 MB/s 
[K     |████████████████████████████████| 125 kB 51.6 MB/s 
[K     |████████████████████████████████| 243 kB 49.1 MB/s 
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
[K     |████████████████████████████████| 1.3 MB 45.4 MB/s 
[K     |████████████████████████████████| 160 kB 45.0 MB/s 
[K     |████████████████████████████████| 271 kB 54.8 MB/s 
[K     |████████████████████████████████| 3.1 MB 12.7 MB/s 
[K     |████████████████████████████████| 596 kB 35.8 MB/s 
[K     |████████████████████████████████| 895 kB 30.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 35.6 MB/s 
[?25h

In [2]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric

In [3]:

from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer
import json
import numpy as np

# Tutorial

In [None]:
# The full listing can be very long; report how many entries there are and
# preview only the first few instead of dumping everything into the output.
available_datasets = list_datasets()
print(f"{len(available_datasets)} datasets available")
available_datasets[:10]

In [None]:
# Load IMDB, then pretty-print the first training record as indented JSON.
raw_datasets = load_dataset("imdb")
first_train_example = raw_datasets['train'][0]
print(json.dumps(first_train_example, indent=2))

In [None]:
# Name the checkpoint explicitly instead of relying on the task's implicit
# default, so the result does not silently change if the default changes.
# NOTE(review): this id is believed to be the library default for
# "sentiment-analysis" — confirm against the installed transformers version.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

result = classifier("I love Hugging Face")
result

In [None]:
# Tutorial: split a sample sentence into tokens with the "bert-base-cased"
# tokenizer and display them.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sample_sentence = "NLP's very interesting"
token = tokenizer.tokenize(sample_sentence)
token

In [None]:
# Convert the token strings produced above into their vocabulary ids.
ids = tokenizer.convert_tokens_to_ids(token)
ids

In [None]:
# Decode the ids back into text to inspect the round trip.
decoded = tokenizer.decode(ids)
decoded

# Dataset

In [4]:
# Load the IMDB dataset used for fine-tuning. The tokenized result shown
# further down lists `train`, `test` and `unsupervised` splits.
raw_datasets = load_dataset("imdb")

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenizer

In [5]:
# Tokenizer for "bert-base-cased", the same checkpoint name the model is
# loaded from in the Model section.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The notebook-level ``tokenizer`` is called with ``padding="max_length"``
    and ``truncation=True``, so it is intended for use with
    ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer call returns for those strings.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [7]:
# Apply `tokenize_function` to every split in batches; the trailing bare
# expression displays the resulting DatasetDict.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 50000
    })
})

In [8]:
# Carve out small, reproducibly shuffled subsets so training and evaluation
# finish quickly. For a full run, use tokenized_datasets["train"] and
# tokenized_datasets["test"] directly instead.
SUBSET_SEED = 42
SUBSET_SIZE = 100
subset_indices = range(SUBSET_SIZE)

small_train_dataset = (
    tokenized_datasets["train"].shuffle(seed=SUBSET_SEED).select(subset_indices)
)
small_eval_dataset = (
    tokenized_datasets["test"].shuffle(seed=SUBSET_SEED).select(subset_indices)
)

# Model

In [9]:
# Load "bert-base-cased" for sequence classification with two labels.
# The warning printed below reports that some checkpoint weights are unused
# and that some model weights were not initialized from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

# Train

In [10]:
# Keep every training setting at its default; only the output directory is set.
training_args = TrainingArguments(output_dir="test_trainer")

In [11]:
# Trainer used for fine-tuning; no metric callback is attached here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

In [12]:
# Fine-tune on the small training subset. The log below reports 3 epochs at
# batch size 8, for 39 optimization steps.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=39, training_loss=0.469545657818134, metrics={'train_runtime': 53.7579, 'train_samples_per_second': 5.581, 'train_steps_per_second': 0.725, 'total_flos': 78933316608000.0, 'train_loss': 0.469545657818134, 'epoch': 3.0})

# Test

In [13]:
# Accuracy metric object shared by every call to `compute_metrics`.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the accuracy metric.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class along the last
        axis of the logits, compared against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [14]:
# Rebuild the Trainer around the same model object, this time with the metric
# callback attached, and evaluate it on the small evaluation subset.
evaluation_setup = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**evaluation_setup)
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


{'eval_accuracy': 0.67,
 'eval_loss': 0.6092690825462341,
 'eval_runtime': 6.4787,
 'eval_samples_per_second': 15.435,
 'eval_steps_per_second': 2.007}