In [2]:
# installing tensorflow extra due to incompatibility with conda and tensorflow-text https://github.com/tensorflow/text/issues/644
!pip install transformers[tf] -q --upgrade
!pip install sentence-transformers -q # needed for validating results


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.2/451.2 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.7/527.7 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from transformers import BertTokenizer, TFBertModel, TFAutoModel, AutoTokenizer
# from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np

In [113]:
input_data = [
    ['sentence A1', 'sentence B1'],
    ['sentence A2', 'sentence B2'],
    ['sentence A3', 'sentence B3']
]
np.shape(input_data)

(3, 2)

In [55]:
input_data = [
    ['sentence A1', 'sentence B1'],
    ['sentence A2', 'sentence B2'],
    ['sentence A3', 'sentence B3']
]
input_data_a = np.array(input_data)[:,0].tolist()
input_data_b = np.array(input_data)[:,1].tolist()
input_data_a

['sentence A1', 'sentence A2', 'sentence A3']

In [29]:
dataset = tf.data.Dataset.from_tensor_slices(input_data)

In [41]:
for j in dataset.batch(2).take(2):
    print(j.shape, j.numpy())

(2, 2) [[b'sentence A1' b'sentence B1']
 [b'sentence A2' b'sentence B2']]
(1, 2) [[b'sentence A3' b'sentence B3']]


In [33]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = BertTokenizer.from_pretrained(model_name)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [56]:
tokenized_input = tokenizer(input_data, padding='max_length', max_length=512, return_tensors='tf')

In [46]:
token_keys = tokenized_input.keys()
token_keys
# input_ids
# token_type_ids
# attention_mask

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [50]:
tokenizer('sentence A1')

{'input_ids': [101, 6251, 17350, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [59]:
class TFSTLayer(tf.keras.layers.Layer):
    def __init__(self, model_name: str) -> None:
        super(TFSTLayer, self).__init__()
        self.tf_model = TFAutoModel.from_pretrained(model_name)

    def call(self, input_ids, attention_mask, token_type_ids, normalize=True):
        # Compute the model output
        output = self.tf_model(input_ids, attention_mask, token_type_ids)

        # Compute the token embeddings
        token_embeddings = output.last_hidden_state  # shape=(B, max_seq_length, n_embd), dtype=float32

        # Mean Pooling
        embedding = self.mean_pooling(token_embeddings, attention_mask)  # shape=(B, n_embd), dtype=float32

        if normalize:
            embedding, _ = tf.linalg.normalize(embedding, 2, axis=1)  # shape=(B, n_embd), dtype=float32

        return embedding

    def mean_pooling(self, token_embeddings, attention_mask):
        attention_mask = tf.expand_dims(attention_mask, axis=-1)  # shape=(B, max_seq_length, 1), dtype=int32
        attention_mask = tf.broadcast_to(attention_mask, tf.shape(token_embeddings))  # shape=(B, max_seq_length, n_embd), dtype=int32
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)  # shape=(B, max_seq_length, n_embd), dtype=float32
        token_embeddings = token_embeddings * attention_mask  # shape=(B, max_seq_length, n_embd), dtype=float32

        # Taking mean over all the tokens (max_seq_length axis)
        mean_embeddings = tf.reduce_sum(token_embeddings, axis=1)  # shape=(B, n_embd), dtype=float32
        # Alternatively, you can replace the `mean_pooling` method with `tf.keras.layers.GlobalAveragePooling1D`:
        # mean_pooling = tf.keras.layers.GlobalAveragePooling1D()
        # mean_embeddings = mean_pooling(token_embeddings)
        return mean_embeddings

In [60]:
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = BertTokenizer.from_pretrained(model_name)

tfst_model = TFSTLayer(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [69]:
max_seq_length = 512
tokenized_sen = tokenizer(input_data, padding='max_length', max_length=max_seq_length, return_tensors='tf')
# input_ids
# token_type_ids
# attention_mask
tfst_model(tokenized_sen['input_ids'], tokenized_sen['token_type_ids'], tokenized_sen['attention_mask'])

<tf.Tensor: shape=(3, 384), dtype=float32, numpy=
array([[-0.04566031,  0.1295795 , -0.05006356, ...,  0.04747099,
        -0.07742504, -0.1310183 ],
       [ 0.00195537,  0.12067217, -0.0253408 , ...,  0.10155079,
        -0.06389324, -0.06594775],
       [-0.04785425,  0.09694559, -0.06291529, ...,  0.05967952,
        -0.10484544, -0.07434572]], dtype=float32)>

In [74]:
tokenized_sen['input_ids'].dtype, tokenized_sen['token_type_ids'].dtype, tokenized_sen['attention_mask'].dtype

(tf.int32, tf.int32, tf.int32)

In [144]:
def tf_sentence_transformer(model_name:str, max_seq_length) -> tf.keras.Model:
    input_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    token_type_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    tfst_layer = TFSTLayer(model_name)
    output = tfst_layer(input_ids, attention_mask, token_type_ids)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)
    return model
max_seq_length = 512
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tf_model = tf_sentence_transformer(model_name, max_seq_length)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [77]:
tf_model([tokenized_sen['input_ids'], tokenized_sen['token_type_ids'], tokenized_sen['attention_mask']])

<tf.Tensor: shape=(3, 384), dtype=float32, numpy=
array([[-0.04566031,  0.1295795 , -0.05006356, ...,  0.04747099,
        -0.07742504, -0.1310183 ],
       [ 0.00195537,  0.12067217, -0.0253408 , ...,  0.10155079,
        -0.06389324, -0.06594775],
       [-0.04785425,  0.09694559, -0.06291529, ...,  0.05967952,
        -0.10484544, -0.07434572]], dtype=float32)>

In [78]:
print(np.shape([tokenized_sen['input_ids'], tokenized_sen['token_type_ids'], tokenized_sen['attention_mask']]))

(3, 3, 512)


In [201]:
from transformers import BertTokenizer, TFBertModel, TFAutoModel, AutoTokenizer, BertTokenizerFast
# from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import Model
import numpy as np



class TFSTLayer(tf.keras.layers.Layer):
    def __init__(self, model_name: str) -> None:
        super(TFSTLayer, self).__init__()
        self.tf_model = TFAutoModel.from_pretrained(model_name)

    def call(self, input_ids, attention_mask, token_type_ids, normalize=True):
        # Compute the model output
        output = self.tf_model(input_ids, attention_mask, token_type_ids)

        # Compute the token embeddings
        token_embeddings = output.last_hidden_state  # shape=(B, max_seq_length, n_embd), dtype=float32

        # Mean Pooling
        embedding = self.mean_pooling(token_embeddings, attention_mask)  # shape=(B, n_embd), dtype=float32

        if normalize:
            embedding, _ = tf.linalg.normalize(embedding, 2, axis=1)  # shape=(B, n_embd), dtype=float32

        return embedding

    def mean_pooling(self, token_embeddings, attention_mask):
        attention_mask = tf.expand_dims(attention_mask, axis=-1)  # shape=(B, max_seq_length, 1), dtype=int32
        attention_mask = tf.broadcast_to(attention_mask, tf.shape(token_embeddings))  # shape=(B, max_seq_length, n_embd), dtype=int32
        attention_mask = tf.cast(attention_mask, dtype=tf.float32)  # shape=(B, max_seq_length, n_embd), dtype=float32
        token_embeddings = token_embeddings * attention_mask  # shape=(B, max_seq_length, n_embd), dtype=float32

        # Taking mean over all the tokens (max_seq_length axis)
        mean_embeddings = tf.reduce_sum(token_embeddings, axis=1)  # shape=(B, n_embd), dtype=float32
        # Alternatively, you can replace the `mean_pooling` method with `tf.keras.layers.GlobalAveragePooling1D`:
        # mean_pooling = tf.keras.layers.GlobalAveragePooling1D()
        # mean_embeddings = mean_pooling(token_embeddings)
        return mean_embeddings

def tf_sentence_transformer(model_name:str, max_seq_length) -> tf.keras.Model:
    input_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    token_type_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)
    tfst_layer = TFSTLayer(model_name)
    output = tfst_layer(input_ids, attention_mask, token_type_ids)
    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)
    return model

class SBERTCosineSimilarityModel(tf.keras.Model):
    def __init__(self, model_name: str, max_seq_length: int):
        super(SBERTCosineSimilarityModel, self).__init__()
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        self.model = tf_sentence_transformer(model_name, max_seq_length)
        self.max_seq_length = max_seq_length
        self.dense_layer = tf.keras.layers.Dense(1, activation = 'tanh')
        self.loss_metric = tf.keras.metrics.Mean(name='train_loss')  # Add this line
    def call(self, inputs):
        input_ids_a = inputs['input_ids_a']
        input_ids_b = inputs['input_ids_b']
        attention_mask_a = inputs['attention_mask_a']
        attention_mask_b = inputs['attention_mask_b']
        token_type_ids_a = inputs['token_type_ids_a']
        token_type_ids_b = inputs['token_type_ids_b']
        embeddings_a = self.model([input_ids_a, attention_mask_a, token_type_ids_a])
        embeddings_b = self.model([input_ids_b, attention_mask_b, token_type_ids_b])

        normalized_a = tf.nn.l2_normalize(embeddings_a, axis=1)
        normalized_b = tf.nn.l2_normalize(embeddings_b, axis=1)

        similarity_scores = tf.reduce_sum(
            tf.multiply(normalized_a, normalized_b),
            axis=1
        )
        return similarity_scores

    def train_step(self, data):
        inputs, targets = data
        with tf.GradientTape() as tape:
            similarity_scores = self(inputs)
            loss = tf.keras.losses.MSE(targets, similarity_scores)

        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

def tokenize_pairs(input_a, input_b, tokenizer, max_length):
    data = {}
    tokenized_sen_a = tokenizer(input_a, padding='max_length', max_length=max_length)
    tokenized_sen_b = tokenizer(input_b, padding='max_length', max_length=max_length)
    data['input_ids_a'] = np.array(tokenized_sen_a['input_ids'])
    data['input_ids_b'] = np.array(tokenized_sen_b['input_ids'])
    data['attention_mask_a'] = np.array(tokenized_sen_a['attention_mask'])
    data['attention_mask_b'] = np.array(tokenized_sen_b['attention_mask'])
    data['token_type_ids_a'] = np.array(tokenized_sen_a['token_type_ids'])
    data['token_type_ids_b'] = np.array(tokenized_sen_b['token_type_ids'])
    return data


tokenizer = BertTokenizer.from_pretrained(model_name)
input_a = ['sentence A1', 'sentence A2', 'sentence A3']
input_b = ['sentence B1', 'sentence B2', 'sentence B3']
targets = np.array([0.7, 0.8, 0.85])
# Tokenize the input data
data = tokenize_pairs(input_a, input_b, tokenizer, max_length=512)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Initialize your model
model = SBERTCosineSimilarityModel(model_name, max_seq_length)

# Compile your model
model.compile(optimizer=optimizer)


history = model.fit(data, targets, epochs=5, batch_size=32)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [177]:
from multiprocessing import Pool

def tokenize_sentence(sentence):
    return tokenizer(sentence, padding='max_length', max_length=512)

with Pool(processes=4) as pool:  # replace 4 with the number of CPU cores you want to utilize
    input_ids_a = pool.map(tokenize_sentence, input_a)
    input_ids_b = pool.map(tokenize_sentence, input_b)
def tokenize_sentence_tf(sentence):
    return tf.py_function(tokenize_sentence, [sentence], Tout=[tf.int32, tf.int32, tf.int32])

dataset_a = tf.data.Dataset.from_tensor_slices(input_a).map(tokenize_sentence_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset_b = tf.data.Dataset.from_tensor_slices(input_b).map(tokenize_sentence_tf, num_parallel_calls=tf.data.experimental.AUTOTUNE)


In [178]:
dataset_a

<_ParallelMapDataset element_spec=(TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None))>

In [158]:
model.fit(tokenized_input_data, targets, epochs=5, batch_size=32)

ValueError: ignored

In [159]:
np.shape(tokenized_input_data)

(3, 2, 3)

In [162]:
np.shape(tokenized_input_data[0][0])

(3,)

In [109]:
test_model = SBERTCosineSimilarityModel(model_name, max_seq_length)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [114]:
input_data = [
    ['sentence A1', 'sentence B1'],
    ['sentence A2', 'sentence B2'],
    ['sentence A3', 'sentence B3']
]
input_data_a = np.array(input_data)[:,0].tolist()
input_data_b = np.array(input_data)[:,1].tolist()
test_model((input_data_a, input_data_b))

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.73263586, 0.8240871 , 0.8537085 ], dtype=float32)>

In [121]:
(input_data_a, input_data_b)

(['sentence A1', 'sentence A2', 'sentence A3'],
 ['sentence B1', 'sentence B2', 'sentence B3'])

In [142]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Initialize your model
model = SBERTCosineSimilarityModel(model_name, max_seq_length)

# Compile your model
model.compile(optimizer=optimizer)




All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
model.trainable_variables

In [143]:
input_data = np.array([['sentence A1', 'sentence A2', 'sentence A3'], ['sentence B1', 'sentence B2', 'sentence B3']]).T

labels = np.array([0.7, 0.8, 0.85])  # These are just example labels, replace with your actual labels

model.fit(input_data, labels, epochs=5, batch_size=32)


Epoch 1/5
['sentence A1', 'sentence A2', 'sentence A3'] ['sentence B1', 'sentence B2', 'sentence B3']




['sentence A1', 'sentence A2', 'sentence A3'] ['sentence B1', 'sentence B2', 'sentence B3']




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f67399c22c0>

In [127]:
input_data = np.array([['sentence A1', 'sentence A2', 'sentence A3'], ['sentence B1', 'sentence B2', 'sentence B3']]).T

input_data.shape

(3, 2)

In [128]:
input_data[:,0]

array(['sentence A1', 'sentence A2', 'sentence A3'], dtype='<U11')

In [120]:
a, b = np.array([0,1])
a, b

(0, 1)

In [100]:
# Prepare your input data and labels
input_data_a, input_data_b = ['sentence A1', 'sentence A2', 'sentence A3'], ['sentence B1', 'sentence B2', 'sentence B3']
labels = np.array([0.7, 0.8, 0.85])  # These are just example labels, replace with your actual labels
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_a, data_b, labels, batch_size=32):
        self.data_a = data_a
        self.data_b = data_b
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        return math.ceil(len(self.data_a) / self.batch_size)

    def __getitem__(self, idx):
        batch_data_a = self.data_a[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_data_b = self.data_b[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_labels = self.labels[idx * self.batch_size:(idx + 1) * self.batch_size]
        return (batch_data_a, batch_data_b), batch_labels

# Train your model
# Initialize your data generator
train_generator = DataGenerator(input_data_a, input_data_b, labels)

# Train your model
model.fit(train_generator, epochs=5)

AttributeError: ignored

In [170]:
def tokenize_pairs(input_a, input_b, tokenizer, max_length):
    data = {}
    input_ids_a = []
    input_ids_b = []
    attention_mask_a = []
    attention_mask_b = []
    token_type_ids_a = []
    token_type_ids_b = []
    tokenized_sen_a = tokenizer(input_a, padding='max_length', max_length=max_length)
    tokenized_sen_b = tokenizer(input_b, padding='max_length', max_length=max_length)
    data['input_ids_a'] = np.array(tokenized_sen_a['input_ids'])
    data['input_ids_b'] = np.array(tokenized_sen_b['input_ids'])
    data['attention_mask_a'] = np.array(tokenized_sen_a['attention_mask'])
    data['attention_mask_b'] = np.array(tokenized_sen_b['attention_mask'])
    data['token_type_ids_a'] = np.array(tokenized_sen_a['token_type_ids'])
    data['token_type_ids_b'] = np.array(tokenized_sen_b['token_type_ids'])
    return data

# input_data = np.array([['sentence A1', 'sentence A2', 'sentence A3'], ['sentence B1', 'sentence B2', 'sentence B3']]).T
# targets = np.array([0.7, 0.8, 0.85])  # These are just example labels, replace with your actual labels

tokenizer = BertTokenizer.from_pretrained(model_name)
input_a = ['sentence A1', 'sentence A2', 'sentence A3']
input_b = ['sentence B1', 'sentence B2', 'sentence B3']
# Tokenize the input data
data = tokenize_pairs(input_a, input_b, tokenizer, max_length=512)
# model.fit(tokenized_input_data, targets, epochs=5, batch_size=32)


In [182]:
A = tokenizer(input_a, padding='max_length', max_length=512)
np.shape(A['input_ids'])

(3, 512)

In [173]:
data['input_ids_a']

array([[  101,  6251, 17350, ...,     0,     0,     0],
       [  101,  6251, 22441, ...,     0,     0,     0],
       [  101,  6251,  1037, ...,     0,     0,     0]])

In [None]:
tokenized_input_data

In [184]:
import tensorflow as tf
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_a = ['sentence A1', 'sentence A2', 'sentence A3']
input_b = ['sentence B1', 'sentence B2', 'sentence B3']
targets = [0.7, 0.8, 0.85]

def tokenize_pairs(input_a, input_b):
    tokenized_sen_a = tokenizer(input_a, padding='max_length', max_length=max_length, truncation=True, return_tensors='tf')
    tokenized_sen_b = tokenizer(input_b, padding='max_length', max_length=max_length, truncation=True, return_tensors='tf')
    return {"input_ids_a": tokenized_sen_a['input_ids'][0],
            "attention_mask_a": tokenized_sen_a['attention_mask'][0],
            "token_type_ids_a": tokenized_sen_a['token_type_ids'][0],
            "input_ids_b": tokenized_sen_b['input_ids'][0],
            "attention_mask_b": tokenized_sen_b['attention_mask'][0],
            "token_type_ids_b": tokenized_sen_b['token_type_ids'][0]}

def tf_tokenize(input_a, input_b, target):
    data = tf.py_function(func=tokenize_pairs, inp=[input_a, input_b],
                          Tout=[tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32])
    for tensor in data:
        tensor.set_shape([None])
    return (data, target)

data = tf.data.Dataset.from_tensor_slices((input_a, input_b, targets))
data = data.map(tf_tokenize)


In [185]:
data

<_MapDataset element_spec=(TensorSpec(shape=(6, None), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.float32, name=None))>

In [198]:
def tokenize_pairs(input_a, input_b):
    print(input_a)
    tokenized_sen_a = tokenizer(input_a, padding='max_length', max_length=512, truncation=True, return_tensors='tf')
    tokenized_sen_b = tokenizer(input_b, padding='max_length', max_length=512, truncation=True, return_tensors='tf')
    return (tokenized_sen_a['input_ids'][0],
            tokenized_sen_a['attention_mask'][0],
            tokenized_sen_a['token_type_ids'][0],
            tokenized_sen_b['input_ids'][0],
            tokenized_sen_b['attention_mask'][0],
            tokenized_sen_b['token_type_ids'][0])

def tf_tokenize(input_a, input_b, target):
    data = tf.py_function(func=tokenize_pairs, inp=[input_a, input_b],
                          Tout=[tf.int32]*6)
    for tensor in data:
        tensor.set_shape([None])
    return ({"input_ids_a": data[0],
            "attention_mask_a": data[1],
            "token_type_ids_a": data[2],
            "input_ids_b": data[3],
            "attention_mask_b": data[4],
            "token_type_ids_b": data[5]}, target)

data = tf.data.Dataset.from_tensor_slices((input_a, input_b, targets))
data = data.map(tf_tokenize)


In [199]:
data

<_MapDataset element_spec=({'input_ids_a': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'attention_mask_a': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'token_type_ids_a': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'input_ids_b': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'attention_mask_b': TensorSpec(shape=(None,), dtype=tf.int32, name=None), 'token_type_ids_b': TensorSpec(shape=(None,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.float32, name=None))>

In [200]:
for item in data.take(1):
    print(i)

tf.Tensor(b'sentence A1', shape=(), dtype=string)
tf.Tensor(b'sentence A2', shape=(), dtype=string)


InvalidArgumentError: ignored