<a href="https://colab.research.google.com/github/axel-sirota/pre-trained-nlp-models/blob/main/module3/PreTrainedNLP_Mod3Demo1_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import multiprocessing
import os
import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import tensorflow_datasets as tfds

def set_seeds_and_trace():
  """Seed the random number generators used by this notebook.

  Writes ``PYTHONHASHSEED=0`` into ``os.environ`` and seeds NumPy, TensorFlow
  and Python's ``random`` module with the same fixed value (42).
  """
  seed = 42
  os.environ['PYTHONHASHSEED'] = '0'
  # One shared seed for every library, applied in a fixed order.
  for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(seed)

def set_session_with_gpus_and_cores():
  """Create a TF1-compat session for this machine and register it with Keras.

  The session config's ``device_count`` is filled with the number of CPU
  cores reported by ``multiprocessing`` and the number of physical GPUs
  reported by TensorFlow; intra-op and inter-op parallelism are both set to
  a single thread.
  """
  n_cpu = multiprocessing.cpu_count()
  n_gpu = len(tf.config.list_physical_devices('GPU'))
  session_config = tf.compat.v1.ConfigProto(
      device_count={'GPU': n_gpu, 'CPU': n_cpu},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  # Build the session from the config and make it the Keras backend session.
  tf.compat.v1.keras.backend.set_session(
      tf.compat.v1.Session(config=session_config)
  )


# Notebook-wide constants.
maxlen = 512
max_features = 250000

# Seed the RNGs and configure the TensorFlow session first, then silence
# Python warnings for the remainder of the notebook.
set_seeds_and_trace()
set_session_with_gpus_and_cores()
warnings.filterwarnings(action='ignore')

In [2]:

# Fetch the WordPiece tokenizer that belongs to the small uncased BERT
# checkpoint (2 layers, hidden size 128, 2 attention heads per its name).
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='google/bert_uncased_L-2_H-128_A-2'
)


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

In [3]:
# Fetch IMDB reviews as (text, label) pairs together with the dataset metadata.
dataset, info = tfds.load(
    name='imdb_reviews',
    as_supervised=True,
    with_info=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

# Every review is truncated / padded to this many tokens.
max_length = 512

# Tokenization and padding function
def tokenize_and_pad(text, label):
    """Turn one raw review into a fixed-length vector of BERT token ids.

    Args:
        text: Eager tensor holding the review; its ``.numpy()`` value is
            converted to ``str`` before tokenization.
        label: Returned unchanged.

    Returns:
        ``(input_ids, label)`` where ``input_ids`` is the first row of the
        tokenizer's ``input_ids`` output, truncated / padded to
        ``max_length``.
    """
    review = tf.compat.as_text(text.numpy())
    encoding = tokenizer.encode_plus(
        review,
        return_tensors='tf',
        padding='max_length',
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
        # Only token ids are requested; neither an attention mask nor
        # token-type ids are produced.
        # NOTE(review): without an attention mask the model presumably
        # attends to the padding positions as well — confirm this is intended.
        return_token_type_ids=False,
        return_attention_mask=False,
    )

    # The tokenizer output is batched; keep the single row.
    input_ids = encoding['input_ids'][0]
    return input_ids, label

# Use tf.py_function to apply tokenize_and_pad
def encode_map_fn(text, label):
    """Wrap ``tokenize_and_pad`` in ``tf.py_function`` for use with ``Dataset.map``.

    Args:
        text: Review text tensor from the dataset.
        label: Label tensor from the dataset.

    Returns:
        ``(token_ids, label)``: an int32 tensor of shape ``(max_length,)`` and
        an int64 scalar tensor.
    """
    token_ids, label_out = tf.py_function(
        func=tokenize_and_pad,
        inp=[text, label],
        Tout=[tf.int32, tf.int64],
    )
    # Attach static shapes to the py_function outputs.
    token_ids.set_shape((max_length,))
    label_out.set_shape(())
    return token_ids, label_out

# Build the input pipelines from the tokenization function.
# Training data: tokenize, shuffle with a 10,000-element buffer, batch by 32,
# and prefetch so input preparation overlaps with the training step.
train_dataset = (
    train_dataset
    .map(encode_map_fn)
    .shuffle(10000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
# Evaluation data: no shuffle. Loss and accuracy are averages over the whole
# split and do not depend on order, so a shuffle buffer would only cost
# memory and start-up time.
test_dataset = (
    test_dataset
    .map(encode_map_fn)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)


Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [4]:
# Sanity check: show the first (token ids, labels) batch of the training set.
for batch in train_dataset.take(1):
  print(batch)

(<tf.Tensor: shape=(32, 512), dtype=int32, numpy=
array([[  101,  1996,  3291, ...,  2145,  2024,   102],
       [  101,  7929,  1011, ...,     0,     0,     0],
       [  101,  1037, 11421, ...,  2745, 15107,   102],
       ...,
       [  101,  2023,  2003, ...,     0,     0,     0],
       [  101,  1996,  3638, ...,     0,     0,     0],
       [  101,  2026,  2564, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32,), dtype=int64, numpy=
array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0])>)


In [5]:

# Load the pre-trained BERT encoder with a sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained('google/bert_uncased_L-2_H-128_A-2')

# Compile the model. The loss takes integer class labels and is told that the
# model emits raw logits (from_logits=True).
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model.
# `train_dataset` is a tf.data.Dataset that is already batched, so
# `batch_size` is not passed to fit(): Keras documents that it must not be
# specified for dataset inputs. The History object is kept so the per-epoch
# metrics can be inspected later instead of being lost as a bare cell repr.
history = model.fit(train_dataset, epochs=3)


model.safetensors:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x793dfafad690>

In [7]:

# Score the fine-tuned model on the held-out split; `result` holds the loss
# followed by the compiled metric (accuracy).
result = model.evaluate(test_dataset)
test_loss, test_accuracy = result[0], result[1]
print(f"Test loss: {test_loss}")
print(f"Test accuracy: {test_accuracy}")

Test loss: 0.30710843205451965
Test accuracy: 0.8719599843025208
