### Install Transformers

In [None]:
!pip3 install --quiet transformers

In [None]:
import tensorflow as tf

### Download Data From Kaggle

In [None]:
!pip3 install kaggle --quiet

In [None]:
#Make a directory for Kaggle
!mkdir .kaggle

In [None]:
#Connect Google drive to colab
# Mounts Google Drive at /gdrive so that files stored there can be read by later cells.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
#Copy kaggle.json file
# Copies the Kaggle API credentials file from the mounted Drive into /content/.kaggle.
# NOTE(review): the Drive path below is specific to one account's folder layout —
# adjust it to wherever your kaggle.json is stored.
!cp '/gdrive/My Drive/AI-ML/Machine-Learning/Code/Utilities/kaggle.json' /content/.kaggle/kaggle.json

In [None]:
!mkdir ~/.kaggle
!cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
!kaggle config set -n path -v{/content}
!chmod 600 /root/.kaggle/kaggle.json

Verify Kaggle connection

In [None]:
# Ask the Kaggle CLI for a dataset listing; a successful listing shows the credentials were accepted.
!kaggle datasets list

In [None]:
#Download Movie Reviews data
# Fetches the files of the "word2vec-nlp-tutorial" competition into /content (-p sets the target directory).
!kaggle competitions download -c word2vec-nlp-tutorial -p /content

In [None]:
# List the current directory, presumably to confirm that the download produced the expected files.
!ls -l

### Data Processing

Load data

In [None]:
!pip install sentencepiece

In [None]:
import pandas as pd

In [None]:
# Read the labelled reviews (tab-separated file inside a zip archive) into a DataFrame
# and print its (rows, columns) shape.
reviews_path = './labeledTrainData.tsv.zip'
df = pd.read_csv(reviews_path, sep='\t')
print(df.shape)

In [None]:
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(n=5)

In [None]:
# Extract the review text and the sentiment label columns as arrays
sentences = df['review'].values
labels = df['sentiment'].values

Tokenize the data using the BERT tokenizer

In [None]:
# Import only the two names this notebook uses rather than a wildcard import,
# so it stays clear where each name comes from and the namespace is not polluted.
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
#Get BertTokenizer
# Loads the tokenizer that belongs to the 'bert-base-uncased' checkpoint
# (the same checkpoint name is used when the model is loaded further down).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Split every review into its list of tokens
tokenized_texts = list(map(tokenizer.tokenize, sentences))

In [None]:
# Display the first raw review so it can be compared with its tokenized form.
sentences[0]

In [None]:
#Check tokenized text
# Prints the token list produced for the first review.
print(tokenized_texts[0])

In [None]:
# Classification uses at most the first 200 tokens of each review (tunable value);
# shorter reviews are left as they are.
max_length = 200
tokenized_texts = [tokens[:max_length] for tokens in tokenized_texts]

In [None]:
# Wrap every token sequence as [CLS] ... [SEP].
# The check makes the cell idempotent: a sequence that already starts with [CLS] and
# ends with [SEP] is left untouched, so re-running the cell does not stack extra markers.
tokenized_texts = [
    tokens
    if tokens[:1] == ['[CLS]'] and tokens[-1:] == ['[SEP]']
    else ['[CLS]'] + tokens + ['[SEP]']
    for tokens in tokenized_texts
]

In [None]:
# Print the first token sequence again to check the added [CLS] / [SEP] markers.
print(tokenized_texts[0])

In [None]:
# Map each token sequence to its sequence of vocabulary ids
input_ids = list(map(tokenizer.convert_tokens_to_ids, tokenized_texts))

In [None]:
# Print the id sequence of the first review.
print(input_ids[0])

In [None]:
# Bring every id sequence to one common length by padding (and, if needed, truncating) at the end.
# The target length is max_length + 2 to leave room for the [CLS] and [SEP] tokens.
input_ids = tf.keras.preprocessing.sequence.pad_sequences(
    input_ids,
    maxlen=max_length + 2,
    padding='post',
    truncating='post',
)

Split the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the examples for testing and train on the remaining 80%;
# the fixed random_state makes the split repeatable.
trainX, testX, trainY, testY = train_test_split(
    input_ids,
    labels,
    test_size=0.2,
    random_state=12345,
)

Create attention masks: attention masks tell the model to ignore padding tokens. The mask value is 0 for padding tokens and 1 for actual tokens. We create masks for both the training and the test data.

In [None]:
# Attention masks for the training split:
# 1.0 where the token id is greater than 0, 0.0 where it is 0 (the padding value).
train_attn_masks = [[float(token_id > 0) for token_id in seq] for seq in trainX]

In [None]:
# Attention masks for the test split:
# 1.0 where the token id is greater than 0, 0.0 where it is 0 (the padding value).
test_attn_masks = [[float(token_id > 0) for token_id in seq] for seq in testX]

At this point, we have the data ready

### Build Model

In [None]:
#Load Pre-trained Bert Model with a Binary Classification layer at the top.
#Huggingface library provides TFBertForSequenceClassification for the same
# NOTE(review): the number of output labels is not passed explicitly, so the
# library default applies — confirm it is 2 for this binary task.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
# Compile the Keras model for training: Adam with gradient-norm clipping,
# a sparse categorical cross-entropy loss computed on raw logits,
# and sparse categorical accuracy reported under the name 'accuracy'.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)
model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

In [None]:
# Print the layer and parameter-count overview of the compiled model.
model.summary()

### Train Model

In [None]:
import numpy as np

In [None]:
# Bundle token ids and attention masks into the keyed inputs passed to the model,
# one dictionary for the training split and one for the test split.
train_x_data = {
    'input_ids': np.array(trainX),
    'attention_mask': np.array(train_attn_masks),
}
test_x_data = {
    'input_ids': np.array(testX),
    'attention_mask': np.array(test_attn_masks),
}

In [None]:
# Fine-tune for two epochs in mini-batches of 16, using the test split as validation data.
model.fit(
    train_x_data,
    trainY,
    validation_data=(test_x_data, testY),
    batch_size=16,
    epochs=2,
)