In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split

import itertools


In [30]:
def f(x):
    return x * np.sin(x)

xMin = 3
xMax = 15
nSample = 5000

# input
x = np.linspace(xMin, xMax, nSample)
x_actual = np.linspace(xMin, xMax, nSample)
y_actual = f(x)

np.random.shuffle(x)

np.random.seed(17)
epsilon1 = np.random.normal(0.0, 0.3, nSample)
epsilon2 = np.random.normal(0.0, 0.3, nSample)

y = f(x) + epsilon1 + epsilon2 * x

# plt.scatter(x, y, label = "Dataset", color = "pink", s = 9)
# plt.plot(x_actual, y_actual, label = "Underlying", color = "magenta")
# plt.legend()
# plt.show()

X_train = x.reshape(-1, 1)
y_train = y.reshape(-1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
X_test = np.arange(xMin, xMax+5, 0.01).reshape(-1, 1)
y_test = f(X_test)

In [34]:
print(X_train.shape)

(4000, 1)


In [19]:
max_length = 64
batch_size = 32

num_epochs = 20
attention_mask = tf.where(tf.math.is_nan(X_train), 0, 1)

In [None]:
def embed_data(data, tokenizer, max_length):
    # Tokenize the data using the specified tokenizer
    tokenized_data = tokenizer(
        data,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )

    # Convert tokenized inputs to int64 tensors
    input_ids = tf.cast(tokenized_data['input_ids'], tf.int64)

    return input_ids

In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [25]:
input_ids = tokenizer(str(X_train), padding=True, truncation=True, return_tensors="tf")["input_ids"]


In [33]:
print(input_ids.shape)

(1, 58)


In [32]:
train_set = tf.data.Dataset.from_tensor_slices(({"input_ids": input_ids}, y_train)).batch(batch_size)


ValueError: Dimensions 1 and 4000 are not compatible

In [4]:
# train_set = tf.data.Dataset.from_tensor_slices((X_train,y_train))
# val_set = tf.data.Dataset.from_tensor_slices((X_val,y_val))
# test_set = tf.data.Dataset.from_tensor_slices((X_test,y_test))

AttributeError: '_BatchDataset' object has no attribute 'tolist'

In [5]:
train_set = train_set.batch(batch_size)
val_set = val_set.batch(batch_size)
test_set = test_set.batch(batch_size)

In [6]:
def create_attention_mask(X):
    # Assuming X is a 2D tensor of shape (batch_size, num_features)
    attention_mask = tf.cast(tf.not_equal(X, 0), tf.float32)
    return attention_mask

In [7]:
train_dataset = train_set.map(lambda X, y: (X, create_attention_mask(X), y))
val_dataset = val_set.map(lambda X, y: (X, create_attention_mask(X), y))
test_dataset = test_set.map(lambda X, y: (X, create_attention_mask(X), y))

In [8]:
def loss_function(y_true, y_pred):
    mean, log_var = y_pred
    loss = 0.5 * tf.reduce_sum(tf.exp(log_var) + tf.square(y_true - mean) - 1 - log_var, axis=-1)
    return tf.reduce_mean(loss)

In [9]:
class CustomMetric(tf.keras.metrics.Metric):
    def __init__(self, name='custom_metric', **kwargs):
        super(CustomMetric, self).__init__(name=name, **kwargs)
        self.mse = tf.keras.metrics.MeanSquaredError()
        self.coverage = self.add_weight(name='coverage', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        mean, log_var = y_pred
        std = tf.exp(0.5 * log_var)
        coverage = tf.reduce_mean(tf.cast(tf.abs(y_true - mean) <= 1.96 * std, tf.float32))
        self.coverage.assign_add(coverage)
        self.mse.update_state(y_true, mean, sample_weight)

    def result(self):
        return self.mse.result(), self.coverage

    def reset_state(self):
        self.mse.reset_state()
        self.coverage.assign(0)


In [11]:
model = TFBertModel.from_pretrained('bert-base-uncased')
for layer in model.layers:
    layer.trainable = True
    

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [12]:
# try without freezing. 

In [16]:
class BayesianBertRegressor(tf.keras.Model):
    def __init__(self, bert_model, dropout_rate=0.1):
        super(BayesianBertRegressor, self).__init__()
        self.bert = bert_model
        self.dropout = Dropout(dropout_rate)
        self.mean_layer = Dense(1)
        self.log_var_layer = Dense(1)

    def call(self, input_ids):
        output = self.bert(input_ids=input_ids)[0][:, 0]
        output = self.dropout(output, training=True)
        mean = self.mean_layer(output)
        log_var = self.log_var_layer(output)
        return mean, log_var


In [17]:
regressor = BayesianBertRegressor(model)
regressor.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                  loss=loss_function,
                  metrics=[CustomMetric])

In [18]:
history = regressor.fit(X_train, (y_train, y_train),
                        validation_data=(X_val, (y_val, y_val)),
                        epochs=num_epochs,
                        batch_size=batch_size)

Epoch 1/20


TypeError: Exception encountered when calling layer 'embeddings' (type TFBertEmbeddings).

Value passed to parameter 'indices' has DataType float32 not in list of allowed values: int32, int64

Call arguments received by layer 'embeddings' (type TFBertEmbeddings):
  • input_ids=tf.Tensor(shape=(None, 1), dtype=float32)
  • position_ids=None
  • token_type_ids=tf.Tensor(shape=(None, 1), dtype=int32)
  • inputs_embeds=None
  • past_key_values_length=0
  • training=False

In [9]:
n_trees_values = [25, 50, 80]
max_depth_values = [3, 5, 7]
n_estimators_values = [30, 50, 80]
subportion_values = [0.5, 0.7, 0.9]
n_runs_values = [20, 50, 80]

# Define divisor values based on n_estimators
divisors_by_n_estimators = {
    30: [3, 5, 7],
    50: [3, 5, 7],
    80: [3, 5, 7],
    120: [3, 5, 7]
}

# Generate parameter combinations
parameter_combinations = []
for n_trees, max_depth, n_estimators, subportion, n_runs in itertools.product(
    n_trees_values, max_depth_values, n_estimators_values, subportion_values, n_runs_values
):
    if n_estimators in divisors_by_n_estimators:
        divisors = divisors_by_n_estimators[n_estimators]
        for divisor in divisors:
            n_samples = n_estimators // divisor
            parameter_combinations.append((n_trees, max_depth, n_estimators, subportion, n_samples, n_runs))


In [11]:
print(len(parameter_combinations)/60)

12.15
