In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model, load_model

from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
import shap
from collections import defaultdict
import itertools

In [2]:
# Load the California housing data, split it, and standardise the features.

RANDOM_SEED = 492
cali_housing_path = '../data/California_Houses.csv'
cali_df = pd.read_csv(cali_housing_path)

# The target is kept as a one-column frame; every other column is a feature.
y_series = cali_df['Median_House_Value']
y = y_series.to_frame(name='Median_House_Value')
features = [col for col in cali_df.columns if col != 'Median_House_Value']
X = cali_df.loc[:, features]

# 80/20 split; each part gets a fresh 0..n-1 index so rows line up positionally.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [3]:
# Turn every scaled row into a single string of space-separated values
X_train_texts = [' '.join(row) for row in X_train_scaled.astype(str).values.tolist()]
X_test_texts = [' '.join(row) for row in X_test_scaled.astype(str).values.tolist()]

In [4]:
# Load the pre-trained BERT tokenizer and encoder from one shared checkpoint name
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [5]:
class BERTRegression(Model):
    """BERT encoder with a dense regression head, MC-dropout uncertainty and LIME explanations.

    Args:
        max_length: Token length every text is padded/truncated to when this
            class tokenizes text itself (LIME explanations).
        dense_size: Number of units in the hidden Dense layer of the head.
        dropout_rate: Rate used by both Dropout layers of the head.
        num_mc_samples: Number of stochastic forward passes used by
            ``predict_with_uncertainty``.
        num_features: Number of features LIME is asked to report.
    """

    def __init__(self, max_length, dense_size, dropout_rate=0.1, num_mc_samples=10, num_features=13):
        super(BERTRegression, self).__init__()
        self.max_length = max_length
        self.bert_model = TFBertModel.from_pretrained('bert-base-uncased')
        self.bert_model.trainable = True
        self.dropout = Dropout(dropout_rate)
        self.dense_layer = Dense(dense_size, activation='relu')
        self.mc_dropout = Dropout(dropout_rate)
        self.output_layer = Dense(1, activation='linear')
        self.num_mc_samples = num_mc_samples
        # Split on whitespace only: LIME's default split (non-word characters)
        # would cut a value such as "-0.22" into "0" and "22", so the explained
        # "words" would no longer correspond to feature names and values.
        self.explainer = LimeTextExplainer(split_expression=r'\s+')
        self.num_features = num_features
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def call(self, inputs, training=True):
        """Run one forward pass and return the regression output.

        Args:
            inputs: Sequence ``(input_ids, attention_mask, token_type_ids)``.
            training: Whether the two head Dropout layers are active.

        Returns:
            The output of the final Dense(1) layer. A single tensor is returned
            in both modes so that Keras can compute the loss during training
            and validation alike; use ``predict_with_uncertainty`` for the
            mean/std pair.
        """
        input_ids, attention_mask, token_type_ids = inputs
        # Element [1] of the BERT output is used as the sentence representation
        # (the pooled output in the Hugging Face API).
        bert_output = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]
        dropout_output = self.dropout(bert_output, training=training)
        hidden_output = self.dense_layer(dropout_output)
        mc_dropout_output = self.mc_dropout(hidden_output, training=training)
        return self.output_layer(mc_dropout_output)

    def predict_with_uncertainty(self, inputs):
        """Estimate the prediction mean and spread with Monte Carlo dropout.

        Args:
            inputs: Sequence ``(input_ids, attention_mask, token_type_ids)``.

        Returns:
            Tuple ``(output_mean, output_sd)``: mean and standard deviation over
            ``num_mc_samples`` forward passes, reduced over the sample axis.

        Raises:
            ValueError: If ``num_mc_samples`` is smaller than 1.
        """
        if self.num_mc_samples < 1:
            raise ValueError("num_mc_samples must be at least 1")
        # Dropout has to stay active (training=True) for the passes to differ;
        # with training=False every sample would be identical and the std zero.
        output_samples = tf.stack([self(inputs, training=True) for _ in range(self.num_mc_samples)])
        output_mean = tf.reduce_mean(output_samples, axis=0)
        output_sd = tf.math.reduce_std(output_samples, axis=0)
        return output_mean, output_sd

    def explain_lime(self, text_instance, num_samples=5000, batch_size=32):
        """Explain the prediction for one raw text with LIME.

        Args:
            text_instance: The untokenized text to explain.
            num_samples: Number of perturbed texts LIME generates. Each one
                costs ``num_mc_samples`` forward passes, so lower this for a
                quick look.
            batch_size: Number of perturbed texts scored per forward pass.

        Returns:
            A list holding the single LIME explanation object. The explanation
            is stored under label 0 (the only output column).

        Raises:
            TypeError: If ``text_instance`` is not a string.
        """
        if not isinstance(text_instance, str):
            raise TypeError(
                "explain_lime expects the raw text as a str, got %s" % type(text_instance).__name__
            )

        explanations = []

        def predict_function(texts):
            # Score the perturbed texts in batches so that LIME's full sample
            # set is never pushed through BERT in a single forward pass.
            texts = list(texts)
            if not texts:
                return np.empty((0, 1))
            batch_means = []
            for start in range(0, len(texts), batch_size):
                inputs = self.tokenizer.batch_encode_plus(
                    texts[start:start + batch_size],
                    max_length=self.max_length,
                    padding='max_length',
                    truncation=True,
                    return_tensors='tf'
                )
                outputs = self.predict_with_uncertainty([
                    inputs['input_ids'],
                    inputs['attention_mask'],
                    inputs['token_type_ids']
                ])
                batch_means.append(outputs[0].numpy())
            return np.concatenate(batch_means, axis=0)

        # The model emits one column, so the explained label must be 0; LIME's
        # default of label 1 would index a column that does not exist.
        exp = self.explainer.explain_instance(
            text_instance,
            predict_function,
            labels=(0,),
            num_features=self.num_features,
            num_samples=num_samples
        )
        explanations.append(exp)
        return explanations

In [36]:
# Hyperparameters passed to BERTRegression and to the tokenizer
max_length = 230  # padded/truncated token length used when encoding texts
dense_size = 55  # units in the hidden Dense layer of the regression head
dropout_rate = 0.1  # rate shared by both Dropout layers of the head
num_mc_samples = 50  # forward passes stacked for the mean/std prediction
num_features = 13  # number of features LIME is asked to report

In [37]:
# Build the pilot model from the hyperparameters defined above
berty_pilot = BERTRegression(
    max_length=max_length,
    dense_size=dense_size,
    dropout_rate=dropout_rate,
    num_mc_samples=num_mc_samples,
    num_features=num_features,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [38]:
# Tokenizer used by the notebook cells to encode inputs for the model
# (same 'bert-base-uncased' checkpoint that BERTRegression loads internally)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [35]:
# Function to concatenate column names and values
def concatenate_columns_with_names(df):
    """Render each row as "<column> <value>" pairs joined by single spaces.

    Args:
        df: A DataFrame (one text per row) or a single row given as a Series
            whose index holds the column names.

    Returns:
        A list of strings, one per row. A Series yields a one-element list.
    """
    # A Series has no rows to iterate over (Series.apply would forward axis=1
    # to the lambda and fail), so promote it to a one-row frame first.
    if isinstance(df, pd.Series):
        df = df.to_frame().T
    concatenated = df.apply(lambda row: ' '.join([f"{col} {val}" for col, val in row.items()]), axis=1)
    return concatenated.tolist()
# Build the "name value" text for every training row
X_train_texts_with_names = concatenate_columns_with_names(X_train_scaled)

# Longest encoding over the training texts; add_special_tokens counts [CLS] and [SEP] too
max_tokenized_length = max(
    (len(tokenizer.encode(text, add_special_tokens=True)) for text in X_train_texts_with_names),
    default=0,
)

print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 229


In [40]:

# Tokenize every training text into fixed-length TensorFlow tensors
encode_options = dict(
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)
encoded_inputs_train_cn = tokenizer.batch_encode_plus(X_train_texts_with_names, **encode_options)

In [41]:
# Sanity check: show the first "name value" training text
print(X_train_texts_with_names[:1])

['Median_Income -0.22023367473901587 Median_Age 0.18302885044421452 Tot_Rooms -0.505329788280167 Tot_Bedrooms -0.5832710453838817 Population -0.625332763570887 Households -0.5546840263301412 Latitude 1.3115180113240754 Longitude -1.6397854101401088 Distance_to_coast -0.3915220068123694 Distance_to_LA 1.4909602397221111 Distance_to_SanDiego 1.4493012599111588 Distance_to_SanJose -0.9188291839326336 Distance_to_SanFrancisco -1.2106935875117162']


In [42]:
# Sanity check: show the encoded tensors for the first two training texts
print(encoded_inputs_train_cn[:2])

{'input_ids': <tf.Tensor: shape=(2, 230), dtype=int32, numpy=
array([[  101,  3991,  1035,  3318,  1011,  1014,  1012, 10545, 21926,
        21619,  2581, 22610, 23499, 24096, 27814,  2581,  3991,  1035,
         2287,  1014,  1012,  9500, 22407, 27531,  2692, 22932, 20958,
        16932, 25746,  2000,  2102,  1035,  4734,  1011,  1014,  1012,
        28952, 16703,  2683,  2581,  2620,  2620, 22407, 24096,  2575,
         2581,  2000,  2102,  1035, 18390,  1011,  1014,  1012,  5388,
        16703,  2581, 10790, 19961, 22025, 22025,  2620, 16576,  2313,
         1011,  1014,  1012, 22810, 22394, 22907,  2575, 19481, 19841,
         2620,  2620,  2581,  3911,  1011,  1014,  1012,  4583, 21472,
         2620, 12740, 23833, 22394, 24096, 23632,  2475, 15250,  1015,
         1012, 23532, 22203, 17914, 14526, 16703, 12740, 23352,  2549,
        20413,  1011,  1015,  1012,  6191,  2683,  2581, 27531, 23632,
        24096, 12740, 10790,  2620,  2620,  3292,  1035,  2000,  1035,
         3023, 

In [43]:
# Compile the model with the Adam optimizer and a mean-squared-error loss.
# NOTE(review): the target is the unscaled Median_House_Value, which is why the
# recorded training loss is on the order of 1e10 — consider scaling the target.
berty_pilot.compile(optimizer='adam', loss='mse')


In [None]:
# Train for one epoch, holding out 20% of the training rows for validation
train_inputs = [
    encoded_inputs_train_cn[key]
    for key in ('input_ids', 'attention_mask', 'token_type_ids')
]
history = berty_pilot.fit(train_inputs, y_train, epochs=1, batch_size=32, validation_split=0.2)

[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - loss: 56196300800.0000

In [14]:
# Accumulates the History object returned by each fit run
tot_history=[]

In [15]:
# list.append mutates in place and returns None, so its result must not be
# assigned back — doing so would replace the list with None.
tot_history.append(history)

NameError: name 'history' is not defined

In [16]:
# Draw the per-epoch training and validation loss on a single axes
fig, ax = plt.subplots()
for series_key in ('loss', 'val_loss'):
    ax.plot(history.history[series_key])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()


NameError: name 'history' is not defined

In [None]:
# Persist the fitted model under models/bert_cali_model.
# NOTE(review): some Keras versions require a file extension on the save path — confirm against the installed version.
berty_pilot.save('models/bert_cali_model')

In [None]:
# Custom object dictionary so Keras can resolve the subclassed model by name
custom_model = {'BERTRegression': BERTRegression}

# Load the model with custom objects from the same path the save cell wrote to
# (the previous 'path_to_saved_model' placeholder pointed at nothing).
berty_pilot = load_model('models/bert_cali_model', custom_objects=custom_model)

In [None]:
# Example test instance
instance_index = 0
X_test_instance = X_test_scaled.iloc[instance_index]
true_value = y_test.values[instance_index]
# concatenate_columns_with_names works row-wise on a DataFrame and returns a
# list, so pass a one-row frame (double brackets) and unwrap the single string.
X_test_instance_with_names = concatenate_columns_with_names(
    X_test_scaled.iloc[[instance_index]]
)[0]

# Tokenize and encode the instance
test_inputs = tokenizer.encode_plus(
    X_test_instance_with_names,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_tensors='tf'
)

In [None]:
# get explanation
# explain_lime is defined on BERTRegression (berty_pilot), not on the bare
# TFBertModel, and LIME perturbs raw text, so it needs the untokenized string
# rather than the encoded tensors.
instance_text = concatenate_columns_with_names(X_test_scaled.iloc[[instance_index]])[0]
explanations = berty_pilot.explain_lime(instance_text)

In [None]:
# Print explanations as (token, weight) pairs; printing the explanation object
# itself only shows its default repr.
for exp in explanations:
    for label in exp.available_labels():
        print(exp.as_list(label=label))