# Fine-tuning a model for Natural Language Inference

Dataset: [sick](https://huggingface.co/datasets/sick)

Labels: entailment (0), neutral (1), contradiction (2)


In [4]:
!pip install datasets evaluate transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple


In [5]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face Hub login widget.
# Presumably required for the push-to-hub steps later in this notebook — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    PushToHubCallback,
    TFAutoModelForSequenceClassification,
)
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import evaluate
from datasets import load_dataset

import tensorflow as tf
import numpy as np

In [7]:
# Pretrained checkpoint that will be fine-tuned.
checkpoint = "bert-base-uncased"

# Dataset identifier on the Hugging Face Hub and the loaded splits.
dataset_name = "sick"
raw_datasets = load_dataset(dataset_name)

In [8]:
# Print a short banner, then let the notebook render one raw training record
# (the last expression of the cell is displayed as its output).
print("Example", "*" * 8 + "\n", sep="\n")

raw_datasets["train"][2]

Example
********



{'id': '3',
 'sentence_A': 'The young boys are playing outdoors and the man is smiling nearby',
 'sentence_B': 'The kids are playing outdoors near a man with a smile',
 'label': 0,
 'relatedness_score': 4.699999809265137,
 'entailment_AB': 'A_entails_B',
 'entailment_BA': 'B_entails_A',
 'sentence_A_original': 'The children are playing outdoors, while a man smiles nearby.',
 'sentence_B_original': 'The children are playing outdoors, while a man smiles nearby.',
 'sentence_A_dataset': 'FLICKR',
 'sentence_B_dataset': 'FLICKR'}

In [9]:
# Show one premise/hypothesis pair whose label is 0 (entailment).
entailment_example = raw_datasets["train"][2]

print("Example for entailment (label 0)", "*" * 35 + "\n", sep="\n")
print("Premise:", entailment_example["sentence_A"])
print("Hypothesis:", entailment_example["sentence_B"])
print("Label:", entailment_example["label"])

Example for entailment (label 0)
***********************************

Premise: The young boys are playing outdoors and the man is smiling nearby
Hypothesis: The kids are playing outdoors near a man with a smile
Label: 0


In [10]:
# Show one premise/hypothesis pair whose label is 1 (neutral).
neutral_example = raw_datasets["train"][6]

print("Example for neutral (label 1)", "*" * 35 + "\n", sep="\n")
print("Premise:", neutral_example["sentence_A"])
print("Hypothesis:", neutral_example["sentence_B"])
print("Label:", neutral_example["label"])

Example for neutral (label 1)
***********************************

Premise: A brown dog is attacking another animal in front of the man in pants
Hypothesis: Two dogs are fighting
Label: 1


In [11]:
# Show one premise/hypothesis pair whose label is 2 (contradiction).
contradiction_example = raw_datasets["train"][14]

print("Example for contradiction (label 2)", "*" * 35 + "\n", sep="\n")
print("Premise:", contradiction_example["sentence_A"])
print("Hypothesis:", contradiction_example["sentence_B"])
print("Label:", contradiction_example["label"])

Example for contradiction (label 2)
***********************************

Premise: Two people are kickboxing and spectators are not watching
Hypothesis: Two people are kickboxing and spectators are watching
Label: 2


In [12]:
# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize premise/hypothesis pairs as a single two-segment input.

    `example` is indexed by column name; because the function is mapped with
    `batched=True`, each column presumably holds a list of values (confirm
    against the `datasets.map` documentation). Over-long inputs are truncated.
    """
    premises = example["sentence_A"]
    hypotheses = example["sentence_B"]
    return tokenizer(premises, hypotheses, truncation=True)


# Add the tokenizer outputs as new columns on every split.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Collator that pads each batch and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


Map:   0%|          | 0/4439 [00:00<?, ? examples/s]

Map:   0%|          | 0/495 [00:00<?, ? examples/s]

Map:   0%|          | 0/4906 [00:00<?, ? examples/s]

In [13]:
# Keras callback that uploads the model (and the tokenizer passed here) to the
# Hugging Face Hub. `save_strategy="epoch"` requests a save at every epoch.
# `output_dir` is the local working directory; the cell output shows it being
# used as a clone of the corresponding Hub repository.
# PushToHubCallback is imported in the notebook's import cell.
callback = PushToHubCallback(
    output_dir="bert-finetuned-nli",
    save_strategy="epoch",
    tokenizer=tokenizer,
)

/home/jsuter/Documents/repositories/hug-a-face/notebooks/bert-finetuned-nli is already a clone of https://huggingface.co/athrado/bert-finetuned-nli. Make sure you pull the latest changes with `repo.git_pull()`.


In [14]:
# Tokenize one sentence on its own to inspect the raw encoding
# (input_ids, token_type_ids, attention_mask).
single_premise = raw_datasets["train"][19]["sentence_A"]
tokenizer(single_premise)

{'input_ids': [101, 2093, 3337, 2024, 8660, 1999, 1996, 3727, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Inspect the column schema of the training split after tokenization.
tokenized_train_split = tokenized_datasets["train"]
tokenized_train_split.features

{'id': Value(dtype='string', id=None),
 'sentence_A': Value(dtype='string', id=None),
 'sentence_B': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
 'relatedness_score': Value(dtype='float32', id=None),
 'entailment_AB': Value(dtype='string', id=None),
 'entailment_BA': Value(dtype='string', id=None),
 'sentence_A_original': Value(dtype='string', id=None),
 'sentence_B_original': Value(dtype='string', id=None),
 'sentence_A_dataset': Value(dtype='string', id=None),
 'sentence_B_dataset': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [16]:
def _as_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched `tf.data.Dataset`.

    Both splits share the same model-input columns, collator and batch size
    (8); only the split name and the shuffling flag differ.

    NOTE(review): the dataset's label column is named `label`, while `labels`
    is requested here — presumably the collator/conversion handles the
    renaming; confirm against the `to_tf_dataset` documentation.
    """
    return tokenized_datasets[split].to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=8,
    )


# Shuffle the training data; keep validation in its original order so that
# predictions can later be compared with the labels of the raw split.
tf_train_dataset = _as_tf_dataset("train", shuffle=True)

tf_validation_dataset = _as_tf_dataset("validation", shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
import tensorflow

from tensorflow.python.keras.engine import data_adapter

In [18]:
# Load the pretrained checkpoint with a 3-class sequence-classification head
# (entailment / neutral / contradiction).
# NOTE(review): the same checkpoint is loaded into `model` again in the cell that
# compiles the model, which replaces this instance — this load looks redundant;
# confirm before removing it.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

2023-07-28 18:08:37.968787: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-07-28 18:08:38.246858: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-07-28 18:08:38.281559: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-07-28 18:08:40.322221: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
2023-07-28 18:08:40.527009: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN thi

In [19]:
# NOTE(review): batch_size is not referenced anywhere in this cell; the batch size
# actually in effect is the literal 8 passed to to_tf_dataset() when the
# tf.data datasets were built. Keep the two in sync.
batch_size = 8
num_epochs = 5
# The total number of training steps is the number of batches per epoch multiplied
# by the number of epochs. tf_train_dataset is already a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is the number of batches per
# epoch rather than the number of samples. The final partial batch is counted too:
# the training log shows 555 steps per epoch for 4439 training samples.
num_train_steps = len(tf_train_dataset) * num_epochs
# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

# Adam optimizer driven by the decaying learning-rate schedule above.
opt = Adam(learning_rate=lr_scheduler)

In [20]:
# Start from the pretrained checkpoint with a 3-class classification head; the
# load message reports that the classifier weights are newly initialized.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# from_logits=True: the loss is given unnormalized scores (the prediction cell
# reads the model output under the "logits" key).
loss = SparseCategoricalCrossentropy(from_logits=True)

model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train for `num_epochs` — the same value that was used to compute the decay
# length of the learning-rate schedule — so the schedule and the actual training
# length cannot drift apart. The callback pushes checkpoints to the Hub.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[callback],
)

Epoch 1/5
 17/555 [..............................] - ETA: 21:39 - loss: 1.1504 - accuracy: 0.4191

In [None]:
# Run the model over the validation set and keep the per-class scores.
validation_outputs = model.predict(tf_validation_dataset)
preds = validation_outputs["logits"]

# The predicted class is the index of the highest score in each row.
class_preds = np.argmax(preds, axis=1)

In [None]:
# Weighted F1 of the predicted classes against the validation labels. The
# validation tf.data pipeline was built with shuffle=False, so predictions are
# expected to line up with the order of the raw split.
metric = evaluate.load("f1")
gold_labels = raw_datasets["validation"]["label"]
metric.compute(predictions=class_preds, references=gold_labels, average="weighted")

In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository.
hub_repo_name = "nli-model"
model.push_to_hub(hub_repo_name)
tokenizer.push_to_hub(hub_repo_name)

In [None]:
# Upload the tokenizer to a separate Hub repository.
# NOTE(review): the tokenizer is also pushed to "nli-model" in an earlier cell —
# confirm that this second repository is intentional.
tokenizer.push_to_hub("tokenizer-for-nli-model")

In [None]:
# NOTE(review): this pushes the *model* to the repository named
# "tokenizer-for-nli-model"; the model is already pushed to "nli-model" in an
# earlier cell. This looks like a copy-paste slip — confirm the intended target.
model.push_to_hub("tokenizer-for-nli-model")