In [38]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow_datasets as tfds

import pandas as pd
import re

import keras_nlp
import numpy as np

from tqdm import tqdm 

AUTO = tf.data.AUTOTUNE

In [3]:
tf.__version__

'2.15.0'

In [4]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [5]:
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2024-04-12 23:12:01.359641: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22495 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:65:00.0, compute capability: 8.6
2024-04-12 23:12:01.360168: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22418 MB memory:  -> device: 1, name: NVIDIA RTX A5000, pci bus id: 0000:b3:00.0, compute capability: 8.6


In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /opt/userdata/22BCE2700/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
import os

In [8]:
os.listdir('./')

['data',
 '.ipynb_checkpoints',
 'EmotionExtraction-BERT_Encoder.ipynb',
 'EmotionExtraction-LightWeight_Encoder.ipynb',
 'checkpoints',
 'logs']

In [9]:
df1 = pd.read_csv('./data/goemotions1.csv')
df2 = pd.read_csv('./data/goemotions2.csv')
df3 = pd.read_csv('./data/goemotions3.csv')

In [10]:
list(df1.columns)

['text',
 'id',
 'author',
 'subreddit',
 'link_id',
 'parent_id',
 'created_utc',
 'rater_id',
 'example_very_unclear',
 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [11]:
df1 = df1.loc[ : , ['text', 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']]

In [12]:
df2 = df2.loc[ : , ['text', 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']]

In [13]:
df3 = df3.loc[ : , ['text', 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']]

In [14]:
df1.shape

(70000, 29)

In [15]:
total_df = pd.concat([df1, df2, df3])

In [16]:
total_df.shape

(211225, 29)

In [17]:
total_df = total_df.dropna()

In [18]:
total_df.shape

(211225, 29)

In [19]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

In [20]:
total_df['text'][10:16]

10    I have, and now that you mention it, I think t...
11    I wanted to downvote this, but it's not your f...
12                                BUT IT'S HER TURN! /s
13                                         That is odd.
14                                    Build a wall? /jk
15    I appreciate it, that's good to know. I hope I...
Name: text, dtype: object

In [21]:
total_df['text'] = total_df['text'].apply(clean_text)
total_df['text'] = total_df['text'].str.replace('\d+', '')

In [22]:
total_df['text'][10:16]

10              mention think thats triggered nostalgia
11                          wanted downvote fault homie
12                                                 turn
13                                                  odd
14                                        build wall jk
15    appreciate thats good know hope ill apply know...
Name: text, dtype: object

In [23]:
MAX_NB_WORDS = 50000

MAX_SEQUENCE_LENGTH = 250

EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(total_df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 32388 unique tokens.


In [24]:
X = tokenizer.texts_to_sequences(total_df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (211225, 250)


In [25]:
total_df.columns

Index(['text', 'admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [26]:
Y = total_df.loc[ : , ['admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']].astype(np.float32).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (211225, 28)


In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(total_df['text'],Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(190102,) (190102, 28)
(21123,) (21123, 28)


In [28]:
total_df['text'][:1]

0    game hurt
Name: text, dtype: object

In [29]:
Y[:1]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]], dtype=float32)

In [42]:
import urllib.request
emotions = urllib.request.urlopen(
   'https://raw.githubusercontent.com/google-research/google-research'
   '/master/goemotions/data/emotions.txt').read().decode('utf8').split('\n')

In [43]:
def preprocess_dataset(split, batch_size=128):
 def one_hot_encode(x):
   vec = tf.stack([x[emotion] for emotion in emotions], 0)
   return x['comment_text'], tf.cast(vec, tf.uint8)
     
 ds = tfds.load('goemotions', split=split)
 ds = ds.map(one_hot_encode, num_parallel_calls=tf.data.AUTOTUNE)
 ds = ds.shuffle(buffer_size=batch_size * 10)
 ds = ds.batch(batch_size, drop_remainder=False)
 ds = ds.prefetch(buffer_size=tf.data.AUTOTUNE)

 return ds
    
ds_splits = ['train', 'test', 'validation']
datasets = {split: preprocess_dataset(split) for split in ds_splits}

In [98]:
with strategy.scope():
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        "distil_bert_base_en_uncased",
        sequence_length=256,
    )
    encoder = keras_nlp.models.DistilBertBackbone.from_preset(
        "distil_bert_base_en_uncased"
    )
    encoder.trainable = True

In [99]:
with strategy.scope():
    txt = tf.keras.layers.Input(shape=(), dtype=tf.string)
    x = preprocessor(txt)
    x = encoder(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(Y_train.shape[1], activation='softmax')(x)
    model = tf.keras.Model(inputs=[txt], outputs=x)

In [100]:
with strategy.scope():
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    model.compile(optimizer, loss, metrics=metrics)

In [57]:
BATCH_SIZE = 128
EPOCHS = 5
with strategy.scope():
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath='./checkpoints/DistilBERT',
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='./logs/DistilBERT')

In [58]:
with strategy.scope():
    history = model.fit(datasets['train'], validation_data=datasets['validation'], 
                    epochs=EPOCHS, verbose=2, 
                    callbacks=[model_checkpoint_callback, tensorboard_callback])

2024-04-12 23:17:35.310764: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:553] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


Epoch 1/5
INFO:tensorflow:Collective all_reduce tensors: 101 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 101 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce tensors: 101 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 101 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL
2024-04-12 23:22:12.021158: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:553] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


340/340 - 294s - loss: 2.0801 - accuracy: 0.5195 - val_loss: 1.8791 - val_accuracy: 0.5678 - 294s/epoch - 865ms/step
Epoch 2/5
340/340 - 258s - loss: 1.6976 - accuracy: 0.5928 - val_loss: 1.9291 - val_accuracy: 0.5509 - 258s/epoch - 760ms/step
Epoch 3/5
340/340 - 257s - loss: 1.4640 - accuracy: 0.6471 - val_loss: 2.1883 - val_accuracy: 0.5461 - 257s/epoch - 756ms/step
Epoch 4/5
340/340 - 257s - loss: 1.2222 - accuracy: 0.7188 - val_loss: 2.4385 - val_accuracy: 0.5446 - 257s/epoch - 755ms/step
Epoch 5/5
340/340 - 257s - loss: 1.0441 - accuracy: 0.7748 - val_loss: 2.8052 - val_accuracy: 0.5477 - 257s/epoch - 754ms/step


In [63]:
model.save_weights('./DistilBERT/EmotionExtractor')

## Unit Testing: 

In [101]:
model.load_weights('./DistilBERT/EmotionExtractor')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7b8966a8e650>

In [102]:
model.save('./DistilBERT-Model/EmotionExtractor')

AttributeError: 'Functional' object has no attribute 'save_model'

In [96]:
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None,)]                    0         []                            
                                                                                                  
 distil_bert_preprocessor_7  {'token_ids': (None, 256),   0         ['input_10[0][0]']            
  (DistilBertPreprocessor)    'padding_mask': (None, 25                                           
                             6)}                                                                  
                                                                                                  
 distil_bert_backbone (Dist  (None, None, 768)            6636288   ['distil_bert_preprocessor_7[0
 ilBertBackbone)                                          0         ][0]',                  

In [94]:
output = model.predict(["Bitter memories from my time in Jerusalem were dragged up."])



In [95]:
output

array([[1.1260102e-05, 1.8199594e-04, 1.9724222e-04, 2.1902709e-04,
        1.0893871e-04, 2.8319449e-05, 9.6456934e-05, 4.9224614e-05,
        1.2968556e-05, 1.3710093e-02, 4.0495829e-06, 2.0342408e-05,
        6.7239736e-05, 5.1817087e-06, 2.2618801e-08, 4.6228288e-06,
        1.0153386e-05, 6.1536695e-05, 3.2656222e-05, 3.2949103e-08,
        1.4956079e-04, 2.8351725e-08, 1.3910272e-03, 1.0224977e-06,
        4.2469433e-04, 9.6052492e-01, 6.8824006e-06, 2.2680474e-02]],
      dtype=float32)

In [None]:
prediction.pop(25)

In [76]:
prediction

array([1.1260102e-05, 1.8199594e-04, 1.9724222e-04, 2.1902709e-04,
       1.0893871e-04, 2.8319449e-05, 9.6456934e-05, 4.9224614e-05,
       1.2968556e-05, 1.3710093e-02, 4.0495829e-06, 2.0342408e-05,
       6.7239736e-05, 5.1817087e-06, 2.2618801e-08, 4.6228288e-06,
       1.0153386e-05, 6.1536695e-05, 3.2656222e-05, 3.2949103e-08,
       1.4956079e-04, 2.8351725e-08, 1.3910272e-03, 1.0224977e-06,
       4.2469433e-04, 9.6052492e-01, 6.8824006e-06, 2.2680474e-02],
      dtype=float32)

In [74]:
np.argmax(prediction)

0.9605249

In [69]:
emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval',
       'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
       'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
       'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
       'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

In [70]:
emotions[np.argmax(output)]

'sadness'

## Further Hypertuning and Optimization: 

In [86]:
with strategy.scope():
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        "distil_bert_base_en_uncased",
        sequence_length=256,
    )
    encoder = keras_nlp.models.DistilBertBackbone.from_preset(
        "distil_bert_base_en_uncased"
    )
    encoder.trainable = True

In [87]:
with strategy.scope():
    txt = tf.keras.layers.Input(shape=(), dtype=tf.string)
    x = preprocessor(txt)
    x = encoder(x)
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(1024, activation='tanh')(x)
    x = tf.keras.layers.Dense(28, activation='softmax')(x)
    model = tf.keras.Model(inputs=[txt], outputs=x)

In [88]:
with strategy.scope():
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    model.compile(optimizer, loss, metrics=metrics)

In [89]:
with strategy.scope():
    history = model.fit(datasets['train'], validation_data=datasets['validation'], 
                    epochs=10, verbose=2)

2024-04-13 02:02:45.213944: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:553] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


Epoch 1/10
INFO:tensorflow:Collective all_reduce tensors: 103 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 103 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce tensors: 103 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce tensors: 103 all_reduces, num_devices = 2, group_size = 2, implementation = CommunicationImplementation.NCCL, num_packs = 1


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL


INFO:tensorflow:Collective all_reduce IndexedSlices: 1 all_reduces, num_devices =2, group_size = 2, implementation = CommunicationImplementation.NCCL
2024-04-13 02:07:24.025595: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:553] The `assert_cardinality` transformation is currently not handled by the auto-shard rewrite and will be removed.


340/340 - 296s - loss: 2.0749 - accuracy: 0.5194 - val_loss: 1.8370 - val_accuracy: 0.5640 - 296s/epoch - 872ms/step
Epoch 2/10
340/340 - 259s - loss: 1.6870 - accuracy: 0.5950 - val_loss: 1.8702 - val_accuracy: 0.5557 - 259s/epoch - 761ms/step
Epoch 3/10
340/340 - 257s - loss: 1.4662 - accuracy: 0.6533 - val_loss: 2.0552 - val_accuracy: 0.5536 - 257s/epoch - 756ms/step
Epoch 4/10
340/340 - 258s - loss: 1.2613 - accuracy: 0.7173 - val_loss: 2.2283 - val_accuracy: 0.5311 - 258s/epoch - 759ms/step
Epoch 5/10
340/340 - 257s - loss: 1.1099 - accuracy: 0.7687 - val_loss: 2.4264 - val_accuracy: 0.5297 - 257s/epoch - 756ms/step
Epoch 6/10
340/340 - 257s - loss: 0.9984 - accuracy: 0.8100 - val_loss: 2.6299 - val_accuracy: 0.5354 - 257s/epoch - 755ms/step
Epoch 7/10


KeyboardInterrupt: 