In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import re

import keras_nlp
import numpy as np

from tqdm import tqdm 

# Sentinel that lets tf.data choose parallelism / prefetch buffer sizes at runtime.
AUTO = tf.data.AUTOTUNE

2024-04-12 22:34:17.984601: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-12 22:34:18.018013: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-12 22:34:18.018040: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-12 22:34:18.018925: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-12 22:34:18.024423: I tensorflow/core/platform/cpu_feature_guar

Using TensorFlow backend


In [2]:
# Record the TensorFlow version this run used.
tf.__version__

'2.15.0'

In [3]:
# List every device TensorFlow can see (CPU and any GPUs).
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [4]:
# Synchronous data-parallel training across the GPUs visible to this process;
# variables created inside `strategy.scope()` are mirrored on every replica.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


2024-04-12 22:34:27.374011: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22495 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:65:00.0, compute capability: 8.6
2024-04-12 22:34:27.374510: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22418 MB memory:  -> device: 1, name: NVIDIA RTX A5000, pci bus id: 0000:b3:00.0, compute capability: 8.6


In [None]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then freeze the English list into a set so
# membership checks during text cleaning are constant-time.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

In [None]:
# Load the three GoEmotions CSV parts; they are concatenated further down.
csv_parts = (
    './data/goemotions1.csv',
    './data/goemotions2.csv',
    './data/goemotions3.csv',
)
df1, df2, df3 = (pd.read_csv(part) for part in csv_parts)

In [None]:
# Inspect the column names available in the first CSV.
list(df1.columns)

In [None]:
# Narrow the first CSV to the comment text plus the 28 label columns
# (27 emotions and 'neutral'); every other column is dropped.
# NOTE(review): this exact column list is repeated for df1, df2 and df3 —
# a single shared constant would keep the three selections from drifting apart.
df1 = df1[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral',
]]

In [None]:
# Narrow the second CSV to the comment text plus the 28 label columns
# (27 emotions and 'neutral'); every other column is dropped.
df2 = df2[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral',
]]

In [None]:
# Narrow the third CSV to the comment text plus the 28 label columns
# (27 emotions and 'neutral'); every other column is dropped.
df3 = df3[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral',
]]

In [None]:
# Shape of the first CSV after keeping only the text and label columns.
df1.shape

In [None]:
# Stack the three parts row-wise. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each part keeps its own 0-based labels, so the
# combined frame would carry duplicate index values.
total_df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Shape of the combined frame before rows with missing values are dropped.
total_df.shape

In [None]:
# Drop every row that has a missing value in any of the retained columns.
total_df = total_df.dropna()

In [None]:
# Shape after dropna(), to see how many rows were removed.
total_df.shape

In [None]:
# Separator characters that clean_text turns into a single space each.
# Raw strings keep the regex escapes (\[ \] \|) away from Python's own
# string-escape processing, which otherwise warns about invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase ASCII letter, space, '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NOTE(review): same assignment as in the NLTK setup cell above; re-running it is
# harmless, but one of the two definitions is redundant.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one piece of text for modelling.

    Steps, in order: lowercase the input, replace every match of
    REPLACE_BY_SPACE_RE with a space, delete every match of BAD_SYMBOLS_RE,
    then drop whitespace-separated words that appear in STOPWORDS.

    Args:
        text: the string to clean.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    normalized = BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub(' ', text.lower()))
    kept_words = [token for token in normalized.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

In [None]:
# Peek at six texts (positional slice 10:16).
total_df['text'][10:16]

In [None]:
# Normalise every text with clean_text, then strip the digits that clean_text keeps.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern as a
# literal string by default, so the previous call searched for the characters "\d+"
# and removed nothing. The raw string also avoids Python's invalid-escape warning.
total_df['text'] = (
    total_df['text']
    .apply(clean_text)
    .str.replace(r'\d+', '', regex=True)
)

In [None]:
with strategy.scope():
    # Tokeniser/packer for the checkpoint, configured with sequence_length=256.
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        "distil_bert_base_en_uncased", sequence_length=256
    )
    # Pretrained DistilBERT backbone, left trainable so its weights are updated
    # together with the classification head.
    encoder = keras_nlp.models.DistilBertBackbone.from_preset("distil_bert_base_en_uncased")
    encoder.trainable = True

In [7]:
with strategy.scope():
    # Raw strings in, class probabilities out: tokenisation is part of the model.
    txt = tf.keras.layers.Input(shape=(), dtype=tf.string)
    tokens = preprocessor(txt)
    x = encoder(tokens)
    # Mean-pool over real tokens only. Without the mask the average is taken over
    # all 256 positions, so padding positions dilute the representation of short texts.
    x = tf.keras.layers.GlobalAveragePooling1D()(x, mask=tokens["padding_mask"])
    x = tf.keras.layers.Dropout(0.1)(x)
    # NOTE(review): Y_train is not defined in any visible cell — confirm where the
    # label matrix is built (the fit cell uses Y_train1 instead).
    # NOTE(review): softmax assumes exactly one label per row; if rows can carry
    # several emotions, a sigmoid head with binary cross-entropy is the usual choice — confirm.
    x = tf.keras.layers.Dense(Y_train.shape[1], activation='softmax')(x)
    model = tf.keras.Model(inputs=[txt], outputs=x)

In [8]:
with strategy.scope():
    # Optimiser, loss and metric are created inside the strategy scope, like the model.
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    # The model head already applies softmax, hence from_logits=False.
    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
    metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
# Batch size passed to model.fit. NOTE(review): under MirroredStrategy this is
# presumably the global batch, split across the replicas — confirm.
BATCH_SIZE = 128
# Number of epochs passed to model.fit.
EPOCHS = 5

In [None]:
with strategy.scope():
    # NOTE(review): X_train, Y_train1, X_test and Y_test1 are not created in any
    # visible cell (and the model head is sized from Y_train, not Y_train1) —
    # confirm where the split is made before relying on Restart & Run All.
    history = model.fit(
        X_train,
        Y_train1,
        validation_data=(X_test, Y_test1),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=1,
    )