In [1]:
import os
os.getcwd()

'd:\\HuggingFace'

In [2]:
#os.environ['TRANSFORMERS_CACHE'] = 'd:\\HuggingFace\\cache\\'
os.environ['TRANSFORMERS_CACHE'] = 'd:/HuggingFace/cache/huggingface/transformers/'
os.environ['HF_HOME'] = 'd:/HuggingFace/cache/huggingface/transformers/'

In [3]:
os.environ['TRANSFORMERS_CACHE']

'd:/HuggingFace/cache/huggingface/transformers/'

In [4]:
from datasets import load_dataset

In [5]:
raw_datasets = load_dataset("glue","mrpc")
raw_datasets

Reusing dataset glue (d:\HuggingFace\cache\huggingface\transformers\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 18.63it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [6]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [7]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [8]:
raw_datasets["validation"][87]

{'sentence1': 'However , EPA officials would not confirm the 20 percent figure .',
 'sentence2': 'Only in the past few weeks have officials settled on the 20 percent figure .',
 'label': 0,
 'idx': 812}

In [9]:
raw_datasets["train"][15]

{'sentence1': 'Rudder was most recently senior vice president for the Developer & Platform Evangelism Business .',
 'sentence2': 'Senior Vice President Eric Rudder , formerly head of the Developer and Platform Evangelism unit , will lead the new entity .',
 'label': 0,
 'idx': 16}

### Testing the tokenize with some inputs

In [10]:
from transformers import AutoTokenizer

In [11]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 7.02kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 285kB/s]
Downloading: 100%|██████████| 226k/226k [00:01<00:00, 192kB/s]
Downloading: 100%|██████████| 455k/455k [00:01<00:00, 239kB/s]


In [13]:
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

### However, the above won't work as the model expects two sequences as a pair. Tokenizer can handle pairs of sentences as well. 

In [14]:
inputs = tokenizer("This is the first sentence.","This is the second sentence")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'sentence',
 '[SEP]']

### In the above we can see how the tokenizer handles the pair

In [None]:
### This will work but will not be efficient in terms of memory and we have to deal with each type of dataset.
### We also have to do any further processing separately as needed
'''
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True
)
'''

In [16]:
### Instead, we will create a function that can work with any element in our dataset dictionary and can be modified

# We also skip padding as padding for the whole dataset is inefficient - we will use dynamic padding

def tokenize_function(example):
    return tokenizer(example["sentence1"],example["sentence1"],truncation=True)

In [17]:
# batched=True in our call to map so the function is applied to multiple elements of our dataset at once, 
# and not on each element separately. This allows for faster preprocessing. 
tokenized_datasets = raw_datasets.map(tokenize_function,batched=True)
tokenized_datasets

100%|██████████| 4/4 [00:01<00:00,  3.94ba/s]
100%|██████████| 1/1 [00:00<00:00, 23.25ba/s]
100%|██████████| 2/2 [00:00<00:00,  9.85ba/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### Dynamic Padding

In [19]:
from transformers import DataCollatorWithPadding

In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,return_tensors="tf")

In [24]:
### Test the collator

samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[49, 53, 45, 81, 59, 41, 61, 29]

In [25]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': TensorShape([8, 81]),
 'token_type_ids': TensorShape([8, 81]),
 'attention_mask': TensorShape([8, 81]),
 'labels': TensorShape([8])}

### Since the longest sequence was of size 81, the tensors are of shape 81 here

In [26]:
### Let's use this with our tokenized datasets. 
### To create the batches, we can use the to_tf_dataset method which can take an optional collation method
### We can take the datasets here, directly to our model

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=['attention_mask','input_ids', 'token_type_ids'],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8
)

tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=['attention_mask','input_ids', 'token_type_ids'],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8
)

In [27]:
from transformers import TFAutoModelForSequenceClassification

In [29]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading: 100%|██████████| 511M/511M [05:07<00:00, 1.74MB/s]
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Since Bert wasn't trained for classification task, the head of the model has to be discarded and a new suitable head for sequence classification is inserted. The above warning mentions that.

In [30]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [31]:
model.compile(
    optimizer="adam",
    loss = SparseCategoricalCrossentropy(from_logits=True),
    metrics = ["accuracy"]
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset
)



<keras.callbacks.History at 0x1ef8d313370>

### Changing the Learning Rate

In [33]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

In [34]:
batch_size = 8
num_epochs = 3

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.

num_train_steps = len(tf_train_dataset) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps = num_train_steps
)

In [35]:
from tensorflow.keras.optimizers import Adam

In [36]:
opt = Adam(learning_rate=lr_scheduler)

In [37]:
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
import tensorflow as tf

In [40]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [41]:
model.compile(
    optimizer=opt,
    loss = loss,
    metrics = ["accuracy"]
)

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1efba11dc40>

In [42]:
preds = model.predict(tf_validation_dataset)["logits"]

In [44]:
probabilities = tf.nn.softmax(preds)

In [45]:
import numpy as np

In [46]:
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)

(408, 2) (408,)


In [47]:
from datasets import load_metric

metric = load_metric("glue", "mrpc")
metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])

Downloading builder script: 5.76kB [00:00, 1.46MB/s]                   


{'accuracy': 0.6397058823529411, 'f1': 0.7617504051863857}