In [None]:
# install datasets
!pip install folium==0.2.1
!pip install datasets

# Make sure that we have a recent version of pyarrow in the session before we continue - otherwise reboot Colab to activate it
import pyarrow
if int(pyarrow.__version__.split('.')[1]) < 16 and int(pyarrow.__version__.split('.')[0]) == 0:
    import os
    os.kill(os.getpid(), 9)

In [None]:
!pip install transformers

In [None]:
import torch

from datasets import load_dataset, list_datasets, list_metrics
from pprint import pprint
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Fetch the list of dataset names, then show how many there are
# together with the first ten entries.
datasets = list_datasets()
num_datasets = len(datasets)
print(num_datasets, datasets[:10])

6818 ['acronym_identification', 'ade_corpus_v2', 'adversarial_qa', 'aeslc', 'afrikaans_ner_corpus', 'ag_news', 'ai2_arc', 'air_dialogue', 'ajgt_twitter_ar', 'allegro_reviews']


In [None]:
# Pick the detailed entry by its id rather than by the position of 'glue' in a
# separately fetched name list: two independent listings are not guaranteed to
# share the same order, and this lookup no longer needs the `datasets` variable.
glue_dataset = next(info for info in list_datasets(with_details=True) if info.id == 'glue')

In [None]:
# Pretty-print the detailed entry selected for the 'glue' dataset.
pprint(glue_dataset)

DatasetInfo: {
	id: glue
	sha: 9338f7b671827df886678df2bdd7cc7b4f36dffd
	lastModified: 2022-07-01T11:52:07.000Z
	tags: ['annotations_creators:unknown', 'language_creators:unknown', 'language:en', 'license:cc-by-4.0', 'multilinguality:monolingual', 'size_categories:10K<n<100K', 'source_datasets:unknown', 'task_categories:text-classification', 'task_ids:acceptability-classification', 'task_ids:natural-language-inference', 'task_ids:semantic-similarity-scoring', 'task_ids:sentiment-classification', 'task_ids:text-classification-other-coreference-nli', 'task_ids:text-classification-other-paraphrase-identification', 'task_ids:text-classification-other-qa-nli', 'task_ids:text-scoring', 'pretty_name:GLUE (General Language Understanding Evaluation benchmark)', 'configs:ax', 'configs:cola', 'configs:mnli', 'configs:mnli_matched', 'configs:mnli_mismatched', 'configs:mrpc', 'configs:qnli', 'configs:qqp', 'configs:rte', 'configs:sst2', 'configs:stsb', 'configs:wnli']
	private: False
	author: None


## GLUE: A MULTI-TASK BENCHMARK AND ANALYSIS PLATFORM FOR NATURAL LANGUAGE UNDERSTANDING

**GLUE** contains 11 tasks, including MRPC, STS-B, QQP, and several NLI tasks. More details are available at https://gluebenchmark.com/tasks.

**MRPC (Microsoft Research Paraphrase Corpus):** https://www.microsoft.com/en-us/download/details.aspx?id=52398

5800 pairs of sentences have been extracted from news sources on the web, along with human annotations indicating whether each pair captures a paraphrase/semantic equivalence relationship.

In [None]:
# Load the train and test splits of the MRPC configuration of GLUE.
train_dataset, test_dataset = (
    load_dataset('glue', 'mrpc', split=split_name)
    for split_name in ('train', 'test')
)

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
# Load the tokenizer of the 'bert-base-cased' checkpoint; it is used by `encode` below.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
def encode(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: a batch with 'sentence1' and 'sentence2' entries.

    Returns:
        The tokenizer output for the pairs, truncated and padded with
        padding='max_length'.
    """
    first_sentences = examples['sentence1']
    second_sentences = examples['sentence2']
    return tokenizer(
        first_sentences,
        second_sentences,
        truncation=True,
        padding='max_length',
    )


# Tokenize both splits batch-wise, then show the first training example.
train_dataset = train_dataset.map(encode, batched=True)
test_dataset = test_dataset.map(encode, batched=True)
print(train_dataset[0])

  0%|          | 0/2 [00:00<?, ?ba/s]

{'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
         1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
        21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
         1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
         4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [None]:
# Show the available fields of the first training example and its label.
print(train_dataset[0].keys(), train_dataset[0]['label'])

dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'labels', 'input_ids', 'token_type_ids', 'attention_mask']) 1


Let's use a BERT model for [classification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification).

BERT model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output).


In [None]:
# Build a sequence-classification model from the 'bert-base-cased' checkpoint.
# The load log reports that some checkpoint weights are unused and that some
# model weights are not initialized from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
# The model expects the target under 'labels', so copy the 'label' column.
train_dataset = train_dataset.map(lambda examples: {'labels': examples['label']}, batched=True)
# Return only the model inputs, as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
# Shuffle the training data so every epoch does not see the same batches in the same order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
!mkdir checkpoints

In [None]:
# Fine-tune the model on the training dataloader and checkpoint it periodically.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)

num_epochs = 10
for epoch in range(num_epochs):
    for i, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # The batch contains 'labels', so the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss.item()}")

    # Save every 5th epoch AND after the last epoch: with `epoch % 5 == 0` alone
    # only epochs 0 and 5 are written, so the checkpoint would miss the final weights.
    if epoch % 5 == 0 or epoch == num_epochs - 1:
        torch.save(model, './checkpoints/model.pt')

In [None]:
# Alternative checkpoint format (disabled): store a dict with the epoch number,
# the model state_dict and the optimizer state_dict instead of the whole model object.
# state = {
#         'epoch': epoch,
#         'state_dict': model.state_dict(),
#         'optimizer': optimizer.state_dict()}
# torch.save(state, './checkpoints/model.pt')

In [None]:
# Resume training from the saved checkpoint for one more epoch.
# NOTE: torch.load unpickles a full model object here; only load checkpoint
# files that you created yourself.
model = torch.load('./checkpoints/model.pt')
model.train().to(device)
# The previously created optimizer still holds the parameters of the old model
# object, so its step()/zero_grad() would never touch the freshly loaded model.
# Build a new optimizer bound to the loaded parameters (its moment estimates
# start from scratch).
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
for epoch in range(1):
    for i, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # The batch contains 'labels', so the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 10 == 0:
            print(f"loss: {loss.item()}")
    torch.save(model, './checkpoints/model.pt')

  0%|          | 1/230 [00:01<05:21,  1.41s/it]

loss: 0.16851364076137543


  5%|▍         | 11/230 [00:15<05:02,  1.38s/it]

loss: 0.17845070362091064


  9%|▉         | 21/230 [00:29<04:53,  1.40s/it]

loss: 0.20905649662017822


 13%|█▎        | 31/230 [00:43<04:44,  1.43s/it]

loss: 0.4687296450138092


 18%|█▊        | 41/230 [00:57<04:33,  1.45s/it]

loss: 0.2169315367937088


 22%|██▏       | 51/230 [01:12<04:22,  1.47s/it]

loss: 0.3892258405685425


 27%|██▋       | 61/230 [01:26<04:11,  1.49s/it]

loss: 0.34391239285469055


 31%|███       | 71/230 [01:41<03:59,  1.51s/it]

loss: 0.5090816617012024


 35%|███▌      | 81/230 [01:57<03:45,  1.51s/it]

loss: 0.20200307667255402


 40%|███▉      | 91/230 [02:12<03:28,  1.50s/it]

loss: 0.38856402039527893


 44%|████▍     | 101/230 [02:26<03:12,  1.49s/it]

loss: 0.19887922704219818


 48%|████▊     | 111/230 [02:41<02:57,  1.49s/it]

loss: 0.14597256481647491


 53%|█████▎    | 121/230 [02:56<02:43,  1.50s/it]

loss: 0.526811957359314


 57%|█████▋    | 131/230 [03:11<02:28,  1.50s/it]

loss: 0.09427766501903534


 61%|██████▏   | 141/230 [03:26<02:13,  1.50s/it]

loss: 0.13140855729579926


 66%|██████▌   | 151/230 [03:41<01:58,  1.50s/it]

loss: 0.25751999020576477


 70%|███████   | 161/230 [03:56<01:43,  1.50s/it]

loss: 0.1658208966255188


 74%|███████▍  | 171/230 [04:11<01:28,  1.50s/it]

loss: 0.3904995918273926


 79%|███████▊  | 181/230 [04:26<01:13,  1.50s/it]

loss: 0.17683570086956024


 83%|████████▎ | 191/230 [04:41<00:58,  1.50s/it]

loss: 0.19563443958759308


 87%|████████▋ | 201/230 [04:56<00:43,  1.50s/it]

loss: 0.3323359489440918


 92%|█████████▏| 211/230 [05:11<00:28,  1.50s/it]

loss: 0.13525281846523285


 96%|█████████▌| 221/230 [05:26<00:13,  1.50s/it]

loss: 0.1264498084783554


100%|██████████| 230/230 [05:38<00:00,  1.47s/it]


In [None]:
# Prepare the test split the same way as the training split: add a 'labels'
# column, expose only the model inputs as torch tensors, and batch them.
model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
test_dataset = test_dataset.map(
    lambda batch: {'labels': batch['label']},
    batched=True,
)
test_dataset.set_format(type='torch', columns=model_columns)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Run the model over the test dataloader without gradient tracking and collect
# the reference labels and the argmax predictions as plain Python lists.
model.eval()

all_ground_truth, all_predictions = [], []

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        # 'labels' are part of the inputs, so index 1 of the output holds the logits.
        logits = model(**inputs)[1]
        all_ground_truth.extend(inputs['labels'].tolist())
        all_predictions.extend(logits.argmax(dim=-1).tolist())

100%|██████████| 108/108 [00:57<00:00,  1.89it/s]


# New Hugging Face library: **evaluate**
https://github.com/huggingface/evaluate

In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.1.2-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.2 MB/s 
Installing collected packages: evaluate
Successfully installed evaluate-0.1.2


In [None]:
import evaluate

In [None]:
# Score the collected predictions against the reference labels with the
# "accuracy" metric and print the resulting dict.
accuracy_metric = evaluate.load("accuracy")
results = accuracy_metric.compute(
    predictions=all_predictions,
    references=all_ground_truth,
)
print(results)

{'accuracy': 0.8179710144927537}


References:
1. GLUE https://openreview.net/pdf?id=rJ4km2R5t7
2. Hugging Face Datasets quickstart: https://huggingface.co/docs/datasets/quickstart