In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
import shap
from collections import defaultdict
import itertools

In [2]:
# load and preprocess data: cali housing

cali_housing_path = '../data/California_Houses.csv'
RANDOM_SEED = 492
cali_df = pd.read_csv(cali_housing_path)
y_series = cali_df['Median_House_Value']
y = pd.DataFrame(y_series, columns=['Median_House_Value'])
features = [col for col in cali_df.columns if col != 'Median_House_Value']
X = cali_df[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# preprocessing

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [3]:
# make the data into strings
X_train_texts = X_train_scaled.astype(str).apply(' '.join, axis=1).tolist()
X_test_texts = X_test_scaled.astype(str).apply(' '.join, axis=1).tolist()

In [4]:
# Load the pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [22]:
class BERTRegression(Model):
    def __init__(self, max_length, dense_size, dropout_rate=0.1, num_mc_samples=10):
        super(BERTRegression, self).__init__()
        self.max_length = max_length
        self.bert_model = TFBertModel.from_pretrained('bert-base-uncased')
        self.bert_model.trainable = True
        self.dropout = Dropout(dropout_rate)
        self.dense_layer = Dense(dense_size, activation='relu')
        self.mc_dropout = Dropout(dropout_rate)
        self.output_layer = Dense(1, activation='linear')
        self.num_mc_samples = num_mc_samples
        self.explainer = LimeTextExplainer()

    def call(self, inputs, training=False):
        input_ids, attention_mask, token_type_ids = inputs
        bert_output = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]
        dropout_output = self.dropout(bert_output, training=training)
        hidden_output = self.dense_layer(dropout_output)
        mc_dropout_output = self.mc_dropout(hidden_output, training=training)
        output = self.output_layer(mc_dropout_output)

        if training:
            return output
        else:
            output_samples = tf.stack([self(inputs, training=True) for _ in range(self.num_mc_samples)])
            output_mean = tf.reduce_mean(output_samples, axis=0)
            output_sd = tf.math.reduce_std(output_samples, axis=0)
            return output_mean, output_sd

    def predict_with_uncertainty(self, inputs):
        output_mean, output_sd = self(inputs, training=False)
        return output_mean, output_sd

    def explain_lime(self, text_instance):
        explanations = []

        def predict_function(texts):
            inputs = self.tokenizer.batch_encode_plus(
                texts,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='tf'
            )
            outputs = self.predict_with_uncertainty([
                inputs['input_ids'],
                inputs['attention_mask'],
                inputs['token_type_ids']
            ])
            return outputs[0]

        exp = self.explainer.explain_instance(
            text_instance,
            predict_function,
            num_features=self.max_length,  # Adjust num_features as needed
            labels=[0]
        )
        explanations.append(exp)
        return explanations

In [23]:
max_length = 13
dense_size = 55
dropout_rate = 0.1
num_mc_samples = 50

In [24]:
# Initialize model
berty_pilot = BERTRegression(max_length, dense_size, dropout_rate, num_mc_samples)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [25]:
# Tokenizer for encoding inputs
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode inputs
encoded_inputs_train = tokenizer.batch_encode_plus(
    X_train_texts,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='tf'
)

In [26]:
print(encoded_inputs_train[:0])

{'input_ids': <tf.Tensor: shape=(0, 13), dtype=int32, numpy=array([], shape=(0, 13), dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(0, 13), dtype=int32, numpy=array([], shape=(0, 13), dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(0, 13), dtype=int32, numpy=array([], shape=(0, 13), dtype=int32)>}


In [27]:
# Compile the model
berty_pilot.compile(optimizer='adam', loss='mse')


In [29]:
# Fit the model
history = berty_pilot.fit(
    [encoded_inputs_train['input_ids'], encoded_inputs_train['attention_mask'], encoded_inputs_train['token_type_ids']],
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/20
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - loss: 50684096512.0000

KeyboardInterrupt: 

In [None]:
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()


In [None]:
# Example test instance
instance_index = 0
X_test_instance = X_test_scaled.iloc[instance_index]
true_value = y_test.values[instance_index]

# Tokenize and encode the instance
test_inputs = tokenizer.encode_plus(
    ' '.join(X_test_instance.astype(str).tolist()),
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='tf'
)

In [None]:
# get explanation
explanations = bert_model.explain_lime(test_inputs)

In [None]:
# Print explanations
for exp in explanations:
    print(exp)