In [1]:
from transformers import TFAutoModel

In [2]:
# Hugging Face checkpoint identifier for the pretrained encoder used below.
CHECKPOINT = "bert-base-cased"

# Load the pretrained weights for that checkpoint as a TensorFlow model.
bert = TFAutoModel.from_pretrained(CHECKPOINT)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
# Print the layer table and parameter counts of the loaded BERT model.
bert.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [4]:
import tensorflow as tf

In [5]:
# Two symbolic inputs with the same shape and dtype: 512 int32 values per example.
# The layer names are "input_ids" and "attention_mask", created in that order.
input_ids, mask = (
    tf.keras.layers.Input(shape=(512,), name=input_name, dtype="int32")
    for input_name in ("input_ids", "attention_mask")
)

In [6]:
# Run the BERT main layer on the symbolic inputs and keep element 1 of its output.
# Per the Hugging Face docs, element 1 is the pooler output: the final hidden state of
# the first ([CLS]) token passed through a dense layer with tanh. It is NOT a max-pool
# over the sequence.
embeddings = bert.bert(input_ids, attention_mask = mask)[1]  # pooled [CLS] output, not max pooling

In [7]:
# Classification head on top of the pooled BERT embeddings:
# a 512-unit ReLU layer followed by a 3-way softmax named "outputs".
hidden_layer = tf.keras.layers.Dense(512, activation="relu")
hidden = hidden_layer(embeddings)

output_layer = tf.keras.layers.Dense(3, activation="softmax", name="outputs")
b = output_layer(hidden)

In [8]:
# Tie the two symbolic inputs to the softmax output to form the end-to-end model.
model = tf.keras.Model(
    inputs=[input_ids, mask],
    outputs=b,
)

In [9]:
# Print the layer table of the assembled classifier (inputs, BERT, dense head).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 512)          393728      bert[0][1]                   

In [10]:
# Freeze the BERT main layer so that only the dense head is updated during training.
# The layer is looked up by its name ("bert", as listed in model.summary()).
bert_layer = model.get_layer("bert")
bert_layer.trainable = False

In [11]:
# Print the summary again now that the BERT layer has been marked non-trainable.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 108310272   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense (Dense)                   (None, 512)          393728      bert[0][1]                   

In [12]:
# Adam with a small learning rate and a slight decay (the values the author uses for BERT).
# `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, decay=1e-6)

# The labels are one-hot vectors of width 3 (see the dataset element spec), so the
# categorical (not sparse) loss and accuracy are used.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

# Compile after any change to `trainable` flags so that the change takes effect.
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [13]:
# Read the textual description of the saved datasets' element spec and display it.
# At this point `element_spec` is just the file's text (a string).
with open("../input/sentiment-analysis-prep-2/element_spec.txt") as spec_file:
    element_spec = spec_file.read()

print(element_spec)

({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.uint8, name=None))


In [14]:
# Element spec of the saved datasets, written out as real TensorSpec objects:
# a dict of two (16, 512) int64 feature tensors, and a (16, 3) uint8 label tensor.
element_spec = (
    {
        feature_name: tf.TensorSpec(shape=(16, 512), dtype=tf.int64)
        for feature_name in ("input_ids", "attention_mask")
    },
    tf.TensorSpec(shape=(16, 3), dtype=tf.uint8),
)

In [15]:
# Load the train and validation datasets, in that order, using the same element spec.
train_ds, val_ds = (
    tf.data.experimental.load(
        f"../input/sentiment-analysis-prep-2/{split}",
        element_spec=element_spec,
    )
    for split in ("train", "validation")
)

In [16]:
# Show the repr (shapes and dtypes) of a one-element slice of each dataset, one per line.
print(train_ds.take(1), val_ds.take(1), sep="\n")

<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 3)), types: ({input_ids: tf.int64, attention_mask: tf.int64}, tf.uint8)>
<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 3)), types: ({input_ids: tf.int64, attention_mask: tf.int64}, tf.uint8)>


In [17]:
# Train for 20 epochs, evaluating on the validation set after each epoch.
# The object returned by fit() is kept in `history`.
history = model.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
# Save the trained model under "bert-sentiment-model" (relative to the working directory).
# NOTE(review): with no file extension this presumably writes a SavedModel directory; confirm.
model.save("bert-sentiment-model")