<a href="https://colab.research.google.com/github/blawok/sentiment-sagemaker/blob/master/sentiment_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers tensorflow_datasets

[K     |████████████████████████████████| 778kB 2.8MB/s 
[K     |████████████████████████████████| 1.1MB 14.3MB/s 
[K     |████████████████████████████████| 3.0MB 21.4MB/s 
[K     |████████████████████████████████| 890kB 45.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [16]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds

from transformers import DistilBertTokenizer,TFDistilBertModel

## Load data

In [None]:
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews', 
          split = (tfds.Split.TRAIN, tfds.Split.TEST),
          as_supervised=True,
          with_info=True)

In [4]:
ds_info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [5]:
for feat, targ in ds_train.take(5):
  print(targ)
  print(feat)

tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor(b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and h

## Preprocessing

In [14]:
# can be up to 512 for BERT
MAX_SEQUENCE_LENGTH = 512
batch_size = 6

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [7]:
def convert_example_to_feature(review):
  return tokenizer.encode_plus(review, 
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                pad_to_max_length = True, # add [PAD] tokens
                return_attention_mask = True, # add attention mask to not focus on pad tokens
                truncation_strategy='longest_first',
                truncation=True
              )

In [8]:
def map_example_to_dict(input_ids, attention_masks, label):
  return {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
  }, label


def encode_examples(ds, limit=-1):
  input_ids_list = []
  attention_mask_list = []
  label_list = []

  if (limit > 0):
      ds = ds.take(limit)
    
  for review, label in tfds.as_numpy(ds):

    bert_input = convert_example_to_feature(review.decode())
  
    input_ids_list.append(bert_input['input_ids'])
    attention_mask_list.append(bert_input['attention_mask'])
    label_list.append([label])

  return tf.data.Dataset.from_tensor_slices((input_ids_list,
                                             attention_mask_list,
                                             label_list)).map(map_example_to_dict)

In [9]:
# train dataset
ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(batch_size)

# test dataset
ds_test_encoded = encode_examples(ds_test).batch(batch_size)

## Training

In [12]:
def create_distilbert_model(freeze = True):
    qa = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)
    qa_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

    qa_embedding = transformer_model(qa, attention_mask=qa_mask)[0]
    cls_encodings = tf.squeeze(qa_embedding[:, 0:1, :], axis=1)
    dropout = tf.keras.layers.Dropout(0.1)(cls_encodings)
    output_dense = tf.keras.layers.Dense(1, activation='sigmoid')(dropout)

    model = tf.keras.models.Model(inputs=[qa, qa_mask], outputs=output_dense)
    return model

In [22]:
db_model = create_distilbert_model()

- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [23]:
learning_rate = 2e-5

# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# classifier Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

db_model.compile(optimizer=optimizer, 
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

In [24]:
db_model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti ((None, 512, 768),)  66362880    input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 1, 768)]     0           tf_distil_bert_model_1

In [25]:
db_model.fit(ds_train_encoded,
             epochs=number_of_epochs,
             validation_data=ds_test_encoded)

   5/4167 [..............................] - ETA: 16:38:29 - loss: 0.7184 - accuracy: 0.4667

KeyboardInterrupt: ignored