# Fine-tuning BERT on IMPLI dataset and Exploring the dependence on the amount of data used

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
! pip install datasets
! pip install -U accelerate
! pip install -U transformers



In [30]:
import torch
import numpy as np
import os
import copy
import datetime

from transformers import (BertTokenizer,
                          AutoModelForSequenceClassification,
                          Trainer,
                          TrainingArguments)
from datasets import (Dataset,
                      load_dataset,
                      concatenate_datasets,
                      load_metric,
                      ClassLabel,
                      Features)

In [28]:
BATCH_SIZE = 32

In [5]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


## Model

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
def tokenize_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], padding="max_length", truncation=True)

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("an-eve/bert-base-uncased-mnli-2-labels")

In [9]:
metric = load_metric('glue', "mnli")
metric_name = "accuracy"

  metric = load_metric('glue', "mnli")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

## Uploading and Arranging IMPLI data

Preprocessing

In [11]:
url_train_ne = "https://github.com/an-eve/nlp-nli-idioms/raw/main/dataset/train_ne.tsv"
url_train_e = "https://github.com/an-eve/nlp-nli-idioms/raw/main/dataset/train_e.tsv"
url_test_ne = "https://github.com/an-eve/nlp-nli-idioms/raw/main/dataset/test_ne.tsv"
url_test_e = "https://github.com/an-eve/nlp-nli-idioms/raw/main/dataset/test_e.tsv"

In [12]:
train_ne_data = load_dataset('csv', data_files=url_train_ne, delimiter='\t', column_names = ['premise', 'hypothesis', 'label'], split='train')
train_e_data = load_dataset('csv', data_files=url_train_e, delimiter='\t', column_names = ['premise', 'hypothesis', 'label'], split='train')
test_ne_data = load_dataset('csv', data_files=url_test_ne, delimiter='\t', column_names = ['idiom', 'premise', 'hypothesis', 'label'], split='train')
test_e_data = load_dataset('csv', data_files=url_test_e, delimiter='\t', column_names = ['idiom', 'premise', 'hypothesis', 'label'], split='train')

Downloading data:   0%|          | 0.00/472k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/806k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/48.1k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading data:   0%|          | 0.00/102k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
test_ne_data = test_ne_data.remove_columns("idiom")
test_e_data = test_e_data.remove_columns("idiom")

In [None]:
def modify_label_ne(example):
    example['label'] = 1
    return example

def modify_label_e(example):
    example['label'] = 0
    return example

train_ne_data = train_ne_data.map(modify_label_ne)
train_e_data = train_e_data.map(modify_label_e)
test_ne_data = test_ne_data.map(modify_label_ne)
test_e_data = test_e_data.map(modify_label_e)

Map:   0%|          | 0/6787 [00:00<?, ? examples/s]

Map:   0%|          | 0/14043 [00:00<?, ? examples/s]

Map:   0%|          | 0/760 [00:00<?, ? examples/s]

Map:   0%|          | 0/2129 [00:00<?, ? examples/s]

In [None]:
new_features = train_ne_data.features.copy()
new_features['label'] = ClassLabel(num_classes = 2, names=["entailment", "non-entailment"])

train_ne_data = train_ne_data.cast(new_features)
train_e_data = train_e_data.cast(new_features)
test_ne_data = test_ne_data.cast(new_features)
test_e_data = test_e_data.cast(new_features)

Casting the dataset:   0%|          | 0/6787 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14043 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/760 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2129 [00:00<?, ? examples/s]

In [None]:
print(train_ne_data)
print(train_e_data)
print(test_ne_data)
print(test_e_data)

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 6787
})
Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 14043
})
Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 760
})
Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 2129
})


In [None]:
print(train_ne_data[:2], '\n')
print(train_e_data[:2], '\n')
print(test_ne_data[:2], '\n')
print(test_e_data[:2])

{'premise': ['13 , 1991, in the fourth week of the U.S.-led air war against Iraq, a British Tornado warplane dropped a bomb that was intended to take out a key river bridge at Fallujah.', "16) of the evacuation of Attica that the Athenians took it so badly because it was like leaving one's polis; this is on the face of it a paradox because they were going from their country demes to the polis."], 'hypothesis': ['13 , 1991, in the fourth week of the U.S.-led air war against Iraq, a British Tornado warplane gave decisive news that was intended to take out a key river bridge at Fallujah.', "16) of the evacuation of Attica that the Athenians took it so badly because it was like leaving one's polis; this is Encountering a paradox because they were going from their country demes to the polis."], 'label': [1, 1]} 

{'premise': ["( 11–12 February 1778) as if to add insult to injury, Leopold received Mozart's letter telling him that he had not yet finished his commissions for the Dutchman :", '

Combining entailed and non-entailed data

In [None]:
train_data = concatenate_datasets([train_ne_data, train_e_data])
train_data = train_data.shuffle(seed=128)

In [None]:
print(train_data, '\n')
train_data[:4]

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 20830
}) 



{'premise': ["‘ In 1987, when I was chosen to fight my first election at Birmingham's Perry Barr, I was on cloud nine, even though it was a safe Labour seat.",
  'The Ohio bank does not leave its newcomers to sink or swim by themselves.',
  'Going back to the railway station was for the moment out of the question, just in case I bumped into someone who recognized me.',
  'Richard Dorment of the Daily Telegraph said: ‘ What a pity a dealer did not take him aside and tell him the work he proposed to exhibit was unexhibitable … a visual boredom so total that no amount of metaphor or allusion can give it the kiss of life’.'],
 'hypothesis': ["‘ In 1987, when I was chosen to fight my first election at Birmingham's Perry Barr, I was very happy, even though it was a safe Labour seat.",
  'The Ohio bank does not leave its newcomers to fail or succeed in their own effort by themselves.',
  'Going back to the railway station was for the moment not remotely possible, just in case I bumped into so

In [None]:
test_data = concatenate_datasets([test_ne_data, test_e_data])
test_data = test_data.shuffle(seed=128)

In [None]:
print(test_data, '\n')
test_data[:4]

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 2889
}) 



{'premise': ["Mr Wildish said: ‘ I wouldn't have thought she had been sleeping rough.’",
  'When Coleridge got on one and let his imagination run riot, he came up with Kubla Khan.',
  'I shall be in hot water with some gay acquaintances for even listing homosexuality among deviations.',
  'The BMW 7-series has come in from the cold, no doubt about that, but for me the Jaguar still rules the class.'],
 'hypothesis': ["Mr Wildish said: ‘ I wouldn't have thought she had been sleep outdoors",
  'When Coleridge got on one and let his imagination riot, he came up with Kubla Khan.',
  'I shall be in a difficult situation with some gay acquaintances for even listing homosexuality among deviations.',
  'The BMW 7-series has gain widespread acceptance in a group, no doubt about that, but for me the Jaguar still rules the class.'],
 'label': [0, 1, 0, 0]}

Dividing the training set into several folds

In [None]:
num_shards = 9

train_sets = [train_data.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)]

In [None]:
print(train_sets[0])

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 2315
})


Uploading on Hub

In [None]:
test_data.push_to_hub("an-eve/test_idioms", private=True)
test_ne_data.push_to_hub("an-eve/test_non_entailment_idioms", private=True)
test_e_data.push_to_hub("an-eve/test_entailment_idioms", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/an-eve/test_entailment_idioms/commit/a79eb172ab52295537b72631a51628455d7a25ef', commit_message='Upload dataset', commit_description='', oid='a79eb172ab52295537b72631a51628455d7a25ef', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
for i in range(num_shards):
    concatenated_data = concatenate_datasets([train_sets[j] for j in range(i+1)])
    #print(concatenated_data, '\n')
    concatenated_data.push_to_hub(f"an-eve/train_sets_{i+1}_idioms", private=True)


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

## Uploading datsets and Tokenization

In [12]:
test_data = load_dataset("an-eve/test_idioms")

In [14]:
tokenized_test_dataset = test_data.map(tokenize_function, batched=True)

In [13]:
tokenized_test_ne_dataset = test_ne_data.map(tokenize_function, batched=True)
tokenized_test_e_dataset = test_e_data.map(tokenize_function, batched=True)
tokenized_test_dataset = test_data.map(tokenize_function, batched=True)

NameError: name 'test_ne_data' is not defined

In [None]:
tokenized_train_sets = []

for i in range(num_shards):
    tokenized_data = concatenate_datasets([train_sets[j] for j in range(i+1)]).map(tokenize_function, batched=True)
    tokenized_train_sets.append(tokenized_data)

Map:   0%|          | 0/2315 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/4630 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/6945 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/9260 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/11574 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Map:   0%|          | 0/13888 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/16202 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/18516 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/20830 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
tokenized_train_sets[2]

Dataset({
    features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6945
})

In [15]:
train_set_1 = load_dataset("an-eve/train_sets_1_idioms")
tokenized_train_set_1 = train_set_1.map(tokenize_function, batched=True)

## Fine-tuninf BERT on IMPLI gradually increasing the amount of data

In [None]:
model_folder = base_dir + "Models-BERT-" + str(datetime.datetime.now().timestamp())

if os.path.exists(model_folder) == False:
  os.mkdir(model_folder)

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 2315
})


In [20]:
model1 = copy.deepcopy(model)

In [20]:
args = TrainingArguments(
    output_dir = "model1",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    resume_from_checkpoint=True,
    push_to_hub=True,
)

In [21]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_set_1["train"],
    eval_dataset=tokenized_test_dataset["train"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1574,0.992618,0.830391


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [27]:
torch.cuda.empty_cache()

In [32]:
args_test = TrainingArguments("output",
         per_device_eval_batch_size=BATCH_SIZE)

eval_trainer = Trainer(
    model=model,
    args=args_test,
    train_dataset=tokenized_test_dataset["train"],
    eval_dataset=tokenized_test_dataset["train"],
    compute_metrics=compute_metrics)

eval_trainer.evaluate()

{'eval_loss': 1.019603967666626,
 'eval_accuracy': 0.8414676358601593,
 'eval_runtime': 104.3931,
 'eval_samples_per_second': 27.674,
 'eval_steps_per_second': 0.872}