# Prep the Environment

In [1]:
!pip install ipython-autotime
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 35.0 MB/s 
Installing collected packages: jedi, ipython-autotime
Successfully installed ipython-autotime-0.3.1 jedi-0.18.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 27.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 85.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB

In [2]:
%load_ext autotime

time: 291 µs (started: 2022-12-28 21:16:40 +00:00)


In [3]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive
time: 20.2 s (started: 2022-12-28 21:16:40 +00:00)


In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
import transformers

time: 2.57 s (started: 2022-12-28 21:17:01 +00:00)


In [5]:
test_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/datasets/df_test.csv", index_col=0
)
anli_test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/ANLI.csv")

time: 3.25 s (started: 2022-12-28 21:17:03 +00:00)


In [6]:
test_df.head()

Unnamed: 0,similarity,sentence1,sentence2
0,neutral,This church choir sings to the masses as they ...,The church has cracks in the ceiling.
1,entailment,This church choir sings to the masses as they ...,The church is filled with song.
2,contradiction,This church choir sings to the masses as they ...,A choir singing at a baseball game.
3,neutral,"A woman with a green headscarf, blue shirt and...",The woman is young.
4,entailment,"A woman with a green headscarf, blue shirt and...",The woman is very happy.


time: 8.41 ms (started: 2022-12-28 21:17:06 +00:00)


In [7]:
anli_test_df.head()

Unnamed: 0,similarity,sentence1,sentence2
0,entailment,"The Parma trolleybus system (Italian: ""Rete fi...",The trolleybus system has over 2 urban routes
1,neutral,Alexandra Lendon Bastedo (9 March 1946 – 12 Ja...,Sharron Macready was a popular character throu...
2,neutral,Alexandra Lendon Bastedo (9 March 1946 – 12 Ja...,Bastedo didn't keep any pets because of her vi...
3,neutral,Alexandra Lendon Bastedo (9 March 1946 – 12 Ja...,Alexandra Bastedo was named by her mother.
4,neutral,Alexandra Lendon Bastedo (9 March 1946 – 12 Ja...,Bastedo cared for all the animals that inhabit...


time: 5.53 ms (started: 2022-12-28 21:17:06 +00:00)


In [8]:
test_df["label"] = test_df["similarity"].apply(
    lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2
)
y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=3)

time: 7.18 ms (started: 2022-12-28 21:17:06 +00:00)


In [9]:
anli_test_df["label"] = anli_test_df["similarity"].apply(
    lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2
)
anli_y_test = tf.keras.utils.to_categorical(anli_test_df.label, num_classes=3)

time: 60 ms (started: 2022-12-28 21:17:06 +00:00)


# Bert

In [10]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 2

time: 411 µs (started: 2022-12-28 21:17:07 +00:00)


In [11]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

time: 2.83 ms (started: 2022-12-28 21:17:07 +00:00)


In [12]:
test_data = BertSemanticDataGenerator(
    test_df[["sentence1", "sentence2"]].values.astype("str"),
    y_test,
    batch_size=batch_size,
    shuffle=False,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

time: 8.33 s (started: 2022-12-28 21:17:07 +00:00)


In [13]:
anli_test_data = BertSemanticDataGenerator(
    anli_test_df[["sentence1", "sentence2"]].values.astype("str"),
    anli_y_test,
    batch_size=batch_size,
    shuffle=False,
)

time: 1.7 s (started: 2022-12-28 21:17:15 +00:00)


In [14]:
bert_model = keras.models.load_model(
    "/content/drive/MyDrive/Colab Notebooks/saved_model/my_model_bert"
)

time: 31.2 s (started: 2022-12-28 21:17:17 +00:00)


In [15]:
bert_model.evaluate(test_data, verbose="auto")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 70/307 [=====>........................] - ETA: 11s - loss: 0.2789 - accuracy: 0.8991

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.




Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.




[0.27784305810928345, 0.9022801518440247]

time: 22 s (started: 2022-12-28 21:17:48 +00:00)


In [16]:
bert_model.evaluate(anli_test_data, verbose="auto")

Output hidden; open in https://colab.research.google.com to view.

# Roberta

In [31]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 4

time: 483 µs (started: 2022-12-28 22:00:04 +00:00)


In [32]:
# train_df.head()
class RoBertaSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.RobertaTokenizer.from_pretrained(
            "roberta-base", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

time: 3.41 ms (started: 2022-12-28 22:00:05 +00:00)


In [33]:
test_data = RoBertaSemanticDataGenerator(
    test_df[["sentence1", "sentence2"]].values.astype("str"),
    y_test,
    batch_size=batch_size,
    shuffle=False,
)

time: 1.01 s (started: 2022-12-28 22:00:05 +00:00)


In [34]:
anli_test_data = RoBertaSemanticDataGenerator(
    anli_test_df[["sentence1", "sentence2"]].values.astype("str"),
    anli_y_test,
    batch_size=batch_size,
    shuffle=False,
)

time: 1.78 s (started: 2022-12-28 22:00:06 +00:00)


In [35]:
roberta_model = keras.models.load_model(
    "/content/drive/MyDrive/Colab Notebooks/saved_model/my_model_roberta_base"
)

time: 16.9 s (started: 2022-12-28 22:00:08 +00:00)


In [36]:
roberta_model.evaluate(test_data, verbose="auto")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 70/307 [=====>........................] - ETA: 11s - loss: 0.2670 - accuracy: 0.9045

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.




[0.2749108076095581, 0.9053338766098022]

time: 17.1 s (started: 2022-12-28 22:00:25 +00:00)


In [37]:
roberta_model.evaluate(anli_test_data, verbose="auto")

Output hidden; open in https://colab.research.google.com to view.

# Albert


In [24]:
max_length = 128  # Maximum length of input sentence to the model.
batch_size = 32
epochs = 8

time: 651 µs (started: 2022-12-28 21:26:24 +00:00)


In [25]:
# train_df.head()
class AlbertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.
    Args:
        sentence_pairs: Array of premise and hypothesis input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to incude the labels.
    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = transformers.AlbertTokenizer.from_pretrained("albert-base-v2")
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

time: 3.46 ms (started: 2022-12-28 21:26:24 +00:00)


In [26]:
test_data = AlbertSemanticDataGenerator(
    test_df[["sentence1", "sentence2"]].values.astype("str"),
    y_test,
    batch_size=batch_size,
    shuffle=False,
)

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

time: 7.63 s (started: 2022-12-28 21:26:24 +00:00)


In [27]:
anli_test_data = AlbertSemanticDataGenerator(
    anli_test_df[["sentence1", "sentence2"]].values.astype("str"),
    anli_y_test,
    batch_size=batch_size,
    shuffle=False,
)

time: 1.75 s (started: 2022-12-28 21:26:32 +00:00)


In [28]:
albert_model = keras.models.load_model(
    "/content/drive/MyDrive/Colab Notebooks/saved_model/albert_base"
)

time: 19 s (started: 2022-12-28 21:26:34 +00:00)


In [29]:
albert_model.evaluate(test_data, verbose="auto")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


 70/307 [=====>........................] - ETA: 12s - loss: 0.4250 - acc: 0.8326

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.




Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.




[0.4179399609565735, 0.8391693830490112]

time: 17.7 s (started: 2022-12-28 21:26:53 +00:00)


In [30]:
albert_model.evaluate(anli_test_data, verbose="auto")

Output hidden; open in https://colab.research.google.com to view.