In [1]:
# This was my first time working with Transformers, so I had to do
# a lot of research to understand how to use them.
# Honestly, doing sentiment analysis on _speech data_ as my very first
# attempt at implementing a Transformer seemed too daunting,
# so I decided to work with text data instead.
# I'll try to implement it on speech data next time.

In [2]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd
import numpy as np

# number of tokens each phrase is truncated/padded to by the tokenizer;
# also the width of the Xids / Xmask arrays
SEQ_LEN = 50

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# initialize model and tokenizer from the same pretrained checkpoint,
# so the vocabulary used for tokenization always matches the model weights
CHECKPOINT = "bert-base-cased"
bert = TFAutoModel.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

NameError: name 'TFAutoModel' is not defined

In [4]:
# read data and keep only the first row seen for each SentenceId
df = pd.read_csv('train.tsv', sep='\t').drop_duplicates(subset="SentenceId", keep="first")

# one-hot encode the sentiment column: row k of the identity matrix is the
# one-hot vector of class k, so indexing it with the class ids builds every label at once
arr = df['Sentiment'].values  # sentiment column as a plain array
labels = np.eye(arr.max() + 1)[arr]

In [5]:
# define function to handle tokenization
def tokenize(sentence):
    """Encode a single sentence into BERT input ids and an attention mask.

    The sentence is truncated or padded to exactly ``SEQ_LEN`` tokens,
    special tokens included.

    Args:
        sentence: The text to encode.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of TensorFlow tensors taken
        from the tokenizer output (presumably of shape ``(1, SEQ_LEN)`` for a
        single sentence -- confirm against the tokenizer docs).
    """
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which takes the same keyword arguments.
    tokens = tokenizer(sentence, max_length=SEQ_LEN,
                       truncation=True, padding='max_length',
                       add_special_tokens=True, return_attention_mask=True,
                       return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

In [6]:
# preview the first rows of the de-duplicated frame
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
63,64,2,"This quiet , introspective and entertaining in...",4
81,82,3,"Even fans of Ismail Merchant 's work , I suspe...",1
116,117,4,A positively thrilling combination of ethnogra...,3
156,157,5,Aggressive self-glorification and a manipulati...,1


In [7]:
# initialize two arrays for input tensors
# token ids and mask flags are integers, so allocate int32 explicitly
# (np.zeros defaults to float64) to match the dtype declared on the model inputs
Xids = np.zeros((len(df), SEQ_LEN), dtype='int32')
Xmask = np.zeros((len(df), SEQ_LEN), dtype='int32')

# loop through data and tokenize everything, one row of each array per phrase
for i, sentence in enumerate(df['Phrase']):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

In [8]:
# create tensorflow dataset object
# the three arrays are sliced along their first axis, giving one
# (ids, mask, label) triple per phrase
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# restructure dataset format for BERT
def map_func(input_ids, masks, labels):
    """Pack one example into a ``(features, target)`` pair.

    The feature dict is keyed by ``'input_ids'`` and ``'attention_mask'``;
    the labels are passed through unchanged as the target.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels
  
# after this, every element is a (features dict, labels) pair
dataset = dataset.map(map_func)  # apply the mapping function

In [9]:
# shuffle and batch the dataset
# reshuffle_each_iteration=False fixes the shuffled order once. With the default
# (True) every pass over the pipeline draws a new order, so the take()/skip()
# split below would pick different examples each epoch and validation examples
# would leak into the training set.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(32)

# number of batches; read from the dataset's cardinality instead of
# iterating the whole pipeline just to count it
DS_LEN = int(dataset.cardinality().numpy())

SPLIT = 0.9  # we will create a 90-10 split

# create training-validation sets: the first n_train batches train, the rest validate
n_train = round(DS_LEN * SPLIT)
train = dataset.take(n_train)
val = dataset.skip(n_train)

# free up space
del dataset

In [10]:
# build the model
# two inputs, named after the keys of the dataset's feature dict; their width follows
# SEQ_LEN so the model stays in sync with the tokenizer's padding length
input_ids = tf.keras.layers.Input(shape=(SEQ_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(SEQ_LEN,), name='attention_mask', dtype='int32')

embeddings = bert(input_ids, attention_mask=mask)[0]  # we only keep tensor 0 (last_hidden_state)

# classification head on top of the BERT output
X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(X)  # adjust based on number of sentiment classes

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

In [11]:
# freeze the BERT layer
# refer to it by object rather than by its position in model.layers, which would
# silently select a different layer if the architecture above is ever changed
bert.trainable = False

In [12]:
# compile the model: Adam optimizer, categorical cross-entropy on the
# one-hot labels, and categorical accuracy reported as 'accuracy'
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[acc],
)

In [13]:
# and train it on the training split
EPOCHS = 20
history = model.fit(train, epochs=EPOCHS)

Epoch 1/20
 30/240 [==>...........................] - ETA: 7:01 - loss: 2.8887 - accuracy: 0.3073

KeyboardInterrupt: 

In [35]:
# predict on validation set
# each output row holds the softmax scores of the 5-unit 'outputs' layer

model.predict(val)



array([[5.23120537e-02, 7.41626099e-02, 1.80392206e-01, 4.21050906e-01,
        2.72082299e-01],
       [1.03024424e-04, 4.08375170e-04, 4.79532080e-03, 9.90068853e-01,
        4.62434255e-03],
       [2.05169678e-01, 5.00361264e-01, 1.73943996e-01, 9.77965295e-02,
        2.27284953e-02],
       ...,
       [8.32067132e-02, 2.08578676e-01, 2.32360780e-01, 3.26637417e-01,
        1.49216458e-01],
       [3.35033298e-01, 4.20318872e-01, 1.85568348e-01, 5.25106154e-02,
        6.56889798e-03],
       [7.37440661e-02, 1.88221768e-01, 2.31935114e-01, 3.57562900e-01,
        1.48536190e-01]], dtype=float32)

In [31]:
# evaluate the model
# run on the validation split; the result lists the compiled loss followed by the compiled accuracy metric

model.evaluate(val)



[1.1203852891921997, 0.5335689187049866]