In [2]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")
raw_datasets

Found cached dataset glue (C:/Users/Admin/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 1000.71it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [3]:
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [4]:
from transformers import AutoTokenizer

In [5]:
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(
        example["sentence1"], example["sentence2"], padding="max_length", truncation=True, max_length=128
    )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)

Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 9.67kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 143kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 759kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 1.17MB/s]
                                                                  

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}




In [6]:
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")
tokenized_datasets["train"]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})

In [7]:
small_train_dataset = tokenized_datasets["train"].select(range(100))

In [8]:
#### TRAINING
from transformers import DataCollatorWithPadding

In [9]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading (…)"pytorch_model.bin";: 100%|██████████| 436M/436M [00:03<00:00, 117MB/s] 
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weig

In [12]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    )

In [13]:
from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

***** Running training *****
  Num examples = 3668
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1150
  Number of trainable parameters = 108311810
  0%|          | 0/1150 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 43%|████▎     | 500/1150 [01:52<02:16,  4.77it/s]Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json


{'loss': 0.4481, 'learning_rate': 1.1304347826086957e-05, 'epoch': 2.17}


Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
 87%|████████▋ | 1000/1150 [03:42<00:31,  4.72it/s]Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json


{'loss': 0.1418, 'learning_rate': 2.6086956521739132e-06, 'epoch': 4.35}


Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-1000\special_tokens_map.json
100%|██████████| 1150/1150 [04:16<00:00,  5.40it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1150/1150 [04:16<00:00,  4.49it/s]

{'train_runtime': 256.2824, 'train_samples_per_second': 71.562, 'train_steps_per_second': 4.487, 'train_loss': 0.2661052131652832, 'epoch': 5.0}





TrainOutput(global_step=1150, training_loss=0.2661052131652832, metrics={'train_runtime': 256.2824, 'train_samples_per_second': 71.562, 'train_steps_per_second': 4.487, 'train_loss': 0.2661052131652832, 'epoch': 5.0})

In [14]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

***** Running Prediction *****
  Num examples = 408
  Batch size = 16
100%|██████████| 26/26 [00:01<00:00, 16.39it/s]

(408, 2) (408,)





In [15]:
import numpy as np
from datasets import load_metric

metric = load_metric("glue", "mrpc")
preds = np.argmax(predictions.predictions, axis=1)
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = load_metric("glue", "mrpc")
Downloading builder script: 5.76kB [00:00, 5.77MB/s]                   


{'accuracy': 0.8333333333333334, 'f1': 0.8851351351351352}

In [16]:
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at C:\Users\Admin/.cache\huggingface\hub\models--bert-base-cased\snapshots\5532cc56f74641d4bb33641f5c76a55d11f846e0\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_em

In [18]:
trainer.train()

***** Running training *****
  Num examples = 3668
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1377
  Number of trainable parameters = 108311810
 33%|███▎      | 458/1377 [01:03<01:54,  8.01it/s]***** Running Evaluation *****
  Num examples = 408
  Batch size = 8

 33%|███▎      | 460/1377 [01:05<07:28,  2.04it/s]

{'eval_loss': 0.4195807874202728, 'eval_accuracy': 0.8161764705882353, 'eval_f1': 0.853228962818004, 'eval_runtime': 1.5828, 'eval_samples_per_second': 257.766, 'eval_steps_per_second': 32.221, 'epoch': 1.0}


 36%|███▋      | 500/1377 [01:10<01:56,  7.52it/s]Saving model checkpoint to test-trainer\checkpoint-500
Configuration saved in test-trainer\checkpoint-500\config.json


{'loss': 0.5266, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


Model weights saved in test-trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-500\special_tokens_map.json
 67%|██████▋   | 918/1377 [02:09<00:57,  7.98it/s]***** Running Evaluation *****
  Num examples = 408
  Batch size = 8
                                                  
 67%|██████▋   | 919/1377 [02:11<04:46,  1.60it/s]

{'eval_loss': 0.5583145022392273, 'eval_accuracy': 0.8284313725490197, 'eval_f1': 0.8825503355704698, 'eval_runtime': 1.6135, 'eval_samples_per_second': 252.867, 'eval_steps_per_second': 31.608, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [02:21<00:47,  7.94it/s]Saving model checkpoint to test-trainer\checkpoint-1000
Configuration saved in test-trainer\checkpoint-1000\config.json


{'loss': 0.2802, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


Model weights saved in test-trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in test-trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in test-trainer\checkpoint-1000\special_tokens_map.json
100%|█████████▉| 1376/1377 [03:15<00:00,  7.65it/s]***** Running Evaluation *****
  Num examples = 408
  Batch size = 8
                                                   
100%|██████████| 1377/1377 [03:17<00:00,  7.65it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 1377/1377 [03:17<00:00,  6.98it/s]

{'eval_loss': 0.7708588242530823, 'eval_accuracy': 0.8480392156862745, 'eval_f1': 0.89419795221843, 'eval_runtime': 1.6392, 'eval_samples_per_second': 248.895, 'eval_steps_per_second': 31.112, 'epoch': 3.0}
{'train_runtime': 197.2031, 'train_samples_per_second': 55.8, 'train_steps_per_second': 6.983, 'train_loss': 0.31994408944766417, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.31994408944766417, metrics={'train_runtime': 197.2031, 'train_samples_per_second': 55.8, 'train_steps_per_second': 6.983, 'train_loss': 0.31994408944766417, 'epoch': 3.0})