In [1]:
# BERT - Bidirectional Encoder Representations from Transformers
# - introduced by Google AI Language
# - used to implement NLP tasks

# Applications of BERT
# - Chatbot
# - Text Classification
# - Text Summarization
# - Text Generation

# Working of BERT
#  - it uses the Transformer architecture, whose attention mechanism learns the relations between words and sub-words

# A Transformer includes 2 mechanisms:
# - Encoder : reads the input text
# - Decoder : produces predictions
# (BERT uses only the encoder, because its goal is to build a language representation.)


# The sky is _____

# BERT uses
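# - Masked LM - MLM - Masked Language Model
#   - it enables bi-directional learning from text by masking (hiding) a word in a
#     sentence and training the model to predict it from the words on both sides
#     (e.g. filling the blank in "The sky is _____")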


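# - NSP - Next Sentence Prediction
#   - the model learns to predict whether one sentence actually follows another in the original text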

In [41]:
# !pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 5.2 MB/s 
[?25hCollecting tensorflow<2.11,>=2.10.0
  Downloading tensorflow-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[K     |████████████████████████████████| 578.0 MB 7.3 kB/s 
Collecting keras<2.11,>=2.10.0
  Downloading keras-2.10.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 45.3 MB/s 
[?25hCollecting tensorflow-estimator<2.11,>=2.10.0
  Downloading tensorflow_estimator-2.10.0-py2.py3-none-any.whl (438 kB)
[K     |████████████████████████████████| 438 kB 55.7 MB/s 
[?25hCollecting tensorboard<2.11,>=2.10
  Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 41.5 MB/s 
Collecting

In [17]:
import tensorflow as tf
import os
import tensorflow_hub as hub
import tensorflow_text as text

In [2]:
# Download the IMDB review archive and unpack it (untar=True) into the current
# directory (cache_dir="." / cache_subdir=".").
# The archive is fetched over HTTPS rather than plain HTTP so that the download
# cannot be altered in transit; no file hash is checked here.
path = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1.tar.gz",
    path,
    untar=True,
    cache_dir=".",
    cache_subdir=".",
)

In [3]:
# Show the path returned by get_file for the downloaded archive.
dataset

'././aclImdb_v1'

In [4]:
# The reviews live in an 'aclImdb' folder located in the same directory as the
# path returned by get_file.
archive_parent = os.path.dirname(dataset)
dataset_dir = os.path.join(archive_parent, 'aclImdb')

In [5]:
# Show the resolved dataset directory.
dataset_dir

'././aclImdb'

In [6]:
# Path of the 'train' split inside the dataset directory.
train_path = os.path.join(dataset_dir, 'train')

In [7]:
import shutil

In [8]:
# Remove the 'unsup' folder from the training split so that only the class
# folders remain; text_dataset_from_directory infers one class per sub-folder
# and would otherwise pick 'unsup' up as an extra class.
# The existence check keeps this cell re-runnable: without it a second run
# raises FileNotFoundError because the folder is already gone.
unsup_dir = os.path.join(train_path, 'unsup')
if os.path.isdir(unsup_dir):
  shutil.rmtree(unsup_dir)

In [9]:
# Build the labelled training dataset (batches of 32) from the class
# sub-folders of the training split. The directory comes from train_path
# instead of a repeated hard-coded 'aclImdb/train' literal, so the loader
# stays in sync with wherever the archive was actually extracted.
train_dir = tf.keras.utils.text_dataset_from_directory(
    train_path, batch_size=32
)

# Read the class names before cache()/prefetch(), which return a new dataset object.
target_names = train_dir.class_names
# Cache the loaded samples and prefetch upcoming batches while the current one is in use.
train_df = train_dir.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

Found 25000 files belonging to 2 classes.


In [10]:
# Build the labelled test dataset (batches of 32) from the 'test' split.
# The directory is derived from dataset_dir instead of a hard-coded
# 'aclImdb/test' literal, so it follows the location the archive was extracted to.
test_dir = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'), batch_size=32
)

# Cache the loaded samples and prefetch upcoming batches while the current one is in use.
test_df = test_dir.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

Found 25000 files belonging to 2 classes.


In [11]:
# Print the first three reviews of one training batch together with their class names.
# The loop variable is named text_batch (not text) so that it does not rebind
# the name `text`, under which the tensorflow_text module is imported.
for text_batch, label_batch in train_df.take(1):
  # Convert each tensor to NumPy once instead of once per printed sample.
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for i in range(3):
    print("Review : ", reviews[i])
    label = labels[i]
    print("Label : ", target_names[label])

Review :  b'I\'ve read up a little bit on Che before watching this film and you wanna know something, he was a real hero for the people because he only wanted to see equality for everyone and that he hated what the oppressive forces were doing to his people as well as all other Latin Americans in general! Now, I don\'t know about others, but to me he did the right thing by wanting socialism so that everyone had to pay their fair share. However, the powerful elite obviously weren\'t going to go for that. So, rather than understanding what Che Guevera wanted, they were forced to kill him in attempting to suppress the revolution. It didn\'t work since there were too many of his other followers who only picked up where he left off. A good example of this was when Castro continued his leadership in Cuba. As far as I\'m concerned and as Che said it himself right before he died: "If you kill me, that\'s fine. But you\'re only killing a man, you\'ll NEVER kill the cause!" I couldn\'t have said

In [18]:
# Key used to look up the encoder and preprocessing handles in the mapping dicts.
bert_model_name = "small_bert/bert_en_uncased_L-4_H-512_A-8"

In [19]:
# Both lookup tables are keyed by the same model name.
small_bert_key = "small_bert/bert_en_uncased_L-4_H-512_A-8"

# Model name -> TF Hub handle of the BERT encoder.
map_model = {}
map_model[small_bert_key] = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2"

# Model name -> TF Hub handle of the matching text preprocessing model.
map_model_preprocess = {}
map_model_preprocess[small_bert_key] = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [20]:
# Resolve the TF Hub handles for the selected model name.
tf_handle_encoder = map_model[bert_model_name]
tf_handle_preprocess = map_model_preprocess[bert_model_name]

In [21]:
# Show the preprocessing handle that was selected.
tf_handle_preprocess

'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [22]:
# Wrap the TF Hub preprocessing model as a Keras layer.
# NOTE(review): presumably this relies on the tensorflow_text import to register
# the ops the preprocessing model uses — confirm before removing that import.
bert_preprocess_model = hub.KerasLayer(tf_handle_preprocess)

In [23]:
# Run one sample sentence through the preprocessing layer; the result is a dict
# of tensors (its keys and shapes are inspected in the following cells).
sample_text = ["that movie was really awesome"]
preprocessed_text = bert_preprocess_model(sample_text)

In [24]:
# List the tensors produced by the preprocessing layer.
preprocessed_text.keys()

dict_keys(['input_word_ids', 'input_type_ids', 'input_mask'])

In [26]:
# Shape of the word-id tensor for the single sample sentence.
preprocessed_text["input_word_ids"].shape

TensorShape([1, 128])

In [28]:
# Wrap the TF Hub BERT encoder as a Keras layer.
bert_model = hub.KerasLayer(tf_handle_encoder)

In [29]:
# Feed the preprocessed sample through the encoder; the result is indexed by
# "pooled_output" and "sequence_output" in the following cells.
bert_results = bert_model(preprocessed_text)

In [33]:
# First 10 values of the pooled output for the first (only) sample.
bert_results["pooled_output"][0,:10]

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.8775037 ,  0.919565  ,  0.08322005,  0.41122985, -0.31452075,
        0.9362951 ,  0.9951427 , -0.96579766, -0.08412668, -0.99787635],
      dtype=float32)>

In [35]:
# Sequence-output vectors of the first 10 positions of the first (only) sample.
bert_results["sequence_output"][0, :10]

<tf.Tensor: shape=(10, 512), dtype=float32, numpy=
array([[-0.02524775, -0.04272904,  1.1051615 , ...,  0.02208868,
         0.88753396, -0.7204938 ],
       [-0.50972354, -0.51962763,  0.26222688, ..., -0.2015189 ,
         0.19569634, -0.47689688],
       [-0.522543  , -0.08875371,  0.52246547, ...,  0.41171634,
         1.0428624 ,  0.3984005 ],
       ...,
       [-0.05070562, -0.39190173, -0.1581223 , ...,  0.4556942 ,
         0.9764831 , -0.08788436],
       [-0.01296163, -0.53714925, -0.13502379, ...,  0.7128638 ,
         0.87337637, -0.01667614],
       [-0.0256803 , -0.7105179 , -0.3199417 , ...,  0.49497187,
         1.0928558 ,  0.05831265]], dtype=float32)>