In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd gdrive/MyDrive/colab_projects/nlp/imdb/nbs/

/content/gdrive/MyDrive/colab_projects/nlp/imdb/nbs


In [3]:
!pip install transformers



In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [5]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999

In [6]:
df = pd.read_csv("../data/imdb.csv")
df_train = df[:25000]
df_test = df[25000:]

labels_index = {'positive':1, 'negative':0}

In [7]:
model_name = 'bert-base-uncased'
max_length = 1000

In [7]:
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

In [7]:
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [8]:
transformer_model.summary()

Model: "tf_bert_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [9]:
bert = transformer_model.layers[0]

In [10]:
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model

In [11]:
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
dense_inter = Dense(128, activation='relu')(pooled_output)
logits = Dense(2)(dense_inter)
model = Model(inputs=inputs, outputs=logits)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 1000)]            0         
_________________________________________________________________
bert (TFBertMainLayer)       TFBaseModelOutputWithPool 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               98432     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 109,580,930
Trainable params: 109,580,930
Non-trainable params: 0
_________________________________________________________________


In [12]:
optimizer = Adam(learning_rate=1e-05, epsilon=1e-08, decay=0.01, clipnorm=1.0)
loss = CategoricalCrossentropy(from_logits=True)
metric = 'accuracy'
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

In [13]:
y_train = to_categorical(df_train['sentiment'].map(labels_index))

In [14]:
y_test = to_categorical(df_test['sentiment'].map(labels_index))

In [15]:
X_train = tokenizer(text=df_train['review'].to_list(),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding=True, 
                    return_tensors='tf',
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    verbose=True)

In [16]:
X_test = tokenizer(text=df_test['review'].to_list(),
                    add_special_tokens=True,
                    max_length=max_length,
                    truncation=True,
                    padding=True, 
                    return_tensors='tf',
                    return_token_type_ids=False,
                    return_attention_mask=False,
                    verbose=True)

In [17]:
X_train

{'input_ids': <tf.Tensor: shape=(25000, 1000), dtype=int32, numpy=
array([[  101,  2028,  1997, ...,     0,     0,     0],
       [  101,  1037,  6919, ...,     0,     0,     0],
       [  101,  1045,  2245, ...,     0,     0,     0],
       ...,
       [  101, 10225, 25318, ...,     0,     0,     0],
       [  101,  9779,  2232, ...,     0,     0,     0],
       [  101,  2023,  2143, ...,     0,     0,     0]], dtype=int32)>}

In [18]:
y_train

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [19]:
model.fit(X_train["input_ids"], y_train, batch_size=4, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f4a2ad76710>

In [21]:
score, acc = model.evaluate(X_test["input_ids"], y_test)
print('Test accuracy:', acc)

Test accuracy: 0.9105600118637085
