In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pathlib
import shutil
import string
import re

# Making sure tensorflow is working properly
print("TensorFlow Versions: ", tf.__version__)
if tf.config.list_physical_devices('GPU'):
  print("TensorFlow **IS** using the GPU")
else:
  print("TensorFlow **IS NOT** using the GPU")

# This prevents some error messages caused by reaching memory limits
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


TensorFlow Versions:  2.7.0
TensorFlow **IS** using the GPU


In [2]:
data_dir = pathlib.Path('/home/addy/.keras/datasets')/'aclImdb'

if data_dir.exists():
    print("Found the 'aclImdb' dataset.")

else:
    print("Downloading 'aclImdb' dataset.")
    
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                        untar=True)

    data_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    
    unsup = data_dir/ 'train' / 'unsup'
    if unsup.exists():
        print("Deleting unsupervised texts.")
        shutil.rmtree(unsup)


Found the 'aclImdb' dataset.


In [3]:
# Initializing parameters

batch_size = 64     # Reduce this if you get memory errors

seed = 2476          # A random seed to get replicable results

epochs = 10         # The number of training epochs

max_features = 10000

sequence_length = 250

In [7]:
list_ds = tf.data.Dataset.list_files(str(data_dir/'train'/'*/*'), shuffle=False)
for f in list_ds.take(5):
  print(f.numpy())

b'/home/addy/.keras/datasets/aclImdb/train/neg/0_3.txt'
b'/home/addy/.keras/datasets/aclImdb/train/neg/10000_4.txt'
b'/home/addy/.keras/datasets/aclImdb/train/neg/10001_4.txt'
b'/home/addy/.keras/datasets/aclImdb/train/neg/10002_1.txt'
b'/home/addy/.keras/datasets/aclImdb/train/neg/10003_1.txt'


2022-01-03 09:25:27.148211: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-03 09:25:27.578701: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2404 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1660 Ti, pci bus id: 0000:01:00.0, compute capability: 7.5


In [8]:
text_count = len(list((data_dir/"train").glob('*/*.txt')))
list_ds = list_ds.shuffle(text_count, reshuffle_each_iteration=False)
print(text_count)


25000


In [9]:
class_names = np.array(sorted([item.name for item in (data_dir/"train").glob('*') if '.' not in item.name]))
print(class_names)

['neg' 'pos']


In [66]:
val_size = int(text_count * 0.2)
# val_size = 10
train_ds = list_ds.skip(val_size)
val_ds = list_ds.take(val_size)


In [67]:
print(tf.data.experimental.cardinality(train_ds).numpy())
print(tf.data.experimental.cardinality(val_ds).numpy())

20000
5000


In [68]:
def get_label(file_path):
  # Convert the path to a list of path components
  parts = tf.strings.split(file_path, os.path.sep)
  # The second to last is the class-directory
  one_hot = parts[-2] == class_names
  # Integer encode the label
  return tf.argmax(one_hot)


In [69]:
def decode_text(text):
  output = bert_tokenizer(
      text.numpy().decode('UTF-8'), 
      truncation=True, 
      padding='max_length', 
      return_tensors="tf", 
      max_length=sequence_length)
  return output['input_ids']


In [70]:
def process_path(file_paths):
  
  label = get_label(file_paths)
  # Load the raw data from the file as a string
  text = tf.io.read_file(file_paths)
  text = tf.squeeze(text)
  text = tf.py_function(decode_text, [text], Tout=tf.int32)[0]
  return text, label

In [71]:
AUTOTUNE = tf.data.AUTOTUNE

train_vec_ds = train_ds.map(process_path, num_parallel_calls=AUTOTUNE).batch(batch_size)
val_vec_ds = val_ds.map(process_path, num_parallel_calls=AUTOTUNE).batch(batch_size)


In [72]:
def getItemsFromDataset(dataset):
    items = np.asarray([i for x,y in dataset for i in x.numpy()])
    labels = np.asarray([i for x,y in dataset for i in y.numpy()])
    return items, labels

In [73]:
for i in getItemsFromDataset(val_vec_ds):
    print(i)

[[  101  1247  1110 ...     0     0     0]
 [  101  2409   138 ...     0     0     0]
 [  101 25863   117 ...     0     0     0]
 ...
 [  101  1752  1645 ...   120   135   102]
 [  101  5055   125 ...     0     0     0]
 [  101   146  1108 ...     0     0     0]]
[0 1 1 ... 0 1 1]


In [146]:
# Initializing the training dataset

train_ds = tf.keras.utils.text_dataset_from_directory(
  data_dir/'train',
  validation_split=0.2,
  subset="training",
  seed=seed,
  batch_size=batch_size)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [147]:
# Initializing the validation dataset

val_ds = tf.keras.utils.text_dataset_from_directory(
  data_dir/'train',
  validation_split=0.2,
  subset="validation",
  seed=seed,
  batch_size=batch_size)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [148]:
test_ds = tf.keras.utils.text_dataset_from_directory(
    data_dir/'test', 
    batch_size=batch_size)


Found 25000 files belonging to 2 classes.


In [26]:
from transformers import (TFBertModel, BertTokenizer)

In [27]:
bert_model = TFBertModel.from_pretrained("bert-base-cased")  # Automatically loads the config
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [74]:
bert_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
Total params: 108,310,272
Trainable params: 108,310,272
Non-trainable params: 0
_________________________________________________________________


In [75]:
bert_tokenizer("Hello There, how are you doing", truncation=True, padding='max_length', return_tensors="tf", max_length=20)

{'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[ 101, 8667, 1247,  117, 1293, 1132, 1128, 1833,  102,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>}

In [76]:
def vectorize_text(text):
  tokenized = bert_tokenizer(text, truncation=True, padding='max_length', return_tensors="tf", max_length=sequence_length)
  return tokenized

In [20]:
# Initializing the validation dataset

val_ds = tf.keras.utils.text_dataset_from_directory(
  data_dir/'train',
  validation_split=0.2,
  subset="validation",
  seed=seed,
  batch_size=batch_size)


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [21]:
def decode_text(texts):
  # print(len(text.numpy()))
  return [bert_tokenizer(
      text.numpy().decode('UTF-8'), 
      truncation=True, 
      padding='max_length', 
      return_tensors="tf", 
      max_length=sequence_length)['input_ids'] for text in texts]
  # return output


In [22]:

def vectorize(x, label):
    return tf.py_function(decode_text, [x], Tout=tf.int32), label
val_ds = val_ds.map(vectorize)

In [86]:
for image, label in val_ds.take(1):
  print("Image shape: ", image.numpy())
  print("Label: ", label.numpy())


<TakeDataset shapes: (), types: tf.string>

In [88]:
input_ids_in = tf.keras.layers.Input(shape=(sequence_length,), name='input_token', dtype='int32')

embedding_layer = bert_model(input_ids_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in], outputs = X)

for layer in model.layers[:3]:
  layer.trainable = False



In [89]:
model.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_token (InputLayer)    [(None, 250)]             0         
                                                                 
 tf_bert_model (TFBertModel)  multiple                 108310272 
                                                                 
 bidirectional_2 (Bidirectio  (None, 250, 100)         327600    
 nal)                                                            
                                                                 
 global_max_pooling1d_2 (Glo  (None, 100)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 50)                5050      
                                                                 
 dropout_39 (Dropout)        (None, 50)                0   

In [90]:

# Choosing the tf.keras.optimizers.Adam with 
# the tf.keras.losses.BinaryCrossentropy loss function and
# the tf.metrics.BinaryAccuracy as metric

model.compile(
  optimizer=tf.keras.optimizers.Adam(1e-4),
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
  metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

# Finally fitting the model to the data in train_ds 
# with val_ds as the validation dataset
# running for 10 epochs

history = model.fit(
  train_vec_ds,
  validation_data=val_vec_ds,
  epochs=epochs
)

Epoch 1/10
 18/313 [>.............................] - ETA: 10:10 - loss: 0.9282 - binary_accuracy: 0.5035

KeyboardInterrupt: 

In [84]:
train_ds.batches

AttributeError: 'SkipDataset' object has no attribute 'batches'