In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

import pandas as pd
import re

import keras_nlp
import numpy as np

from tqdm import tqdm 

# Short alias for tf.data.AUTOTUNE.
# NOTE(review): presumably meant for tf.data pipeline calls such as prefetch — confirm against where it is used.
AUTO = tf.data.AUTOTUNE

In [10]:
# Display the TensorFlow version this notebook runs against.
tf.__version__

'2.15.0'

In [11]:
# Display the devices (CPU/GPU) TensorFlow can see on this machine.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [12]:
# Distribution strategy used by the later cells: the encoder, model and
# fit() call are all created/run inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [13]:
import nltk
# Fetch the NLTK stop-word corpus; the recorded output shows it is a no-op
# when the package is already up to date locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop words as a set, for fast membership tests during text cleaning.
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /opt/userdata/22BCE2700/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
import os

In [16]:
# Display the entries of the current working directory.
os.listdir('./')

['.ipynb_checkpoints', 'NER', 'Emotion Extraction']

In [14]:
# Folder that holds the three GoEmotions CSV shards, resolved relative to the
# notebook's working directory. Change this single constant if the files live
# somewhere else (the recorded run failed because './data' did not exist).
DATA_DIR = './data'

try:
    df1 = pd.read_csv(f'{DATA_DIR}/goemotions1.csv')
    df2 = pd.read_csv(f'{DATA_DIR}/goemotions2.csv')
    df3 = pd.read_csv(f'{DATA_DIR}/goemotions3.csv')
except FileNotFoundError as err:
    # Re-raise the same exception type, but tell the reader what to change
    # instead of leaving only the bare OS error.
    raise FileNotFoundError(
        f"Could not load the GoEmotions shards from {DATA_DIR!r} ({err}). "
        "The path is relative to the notebook's working directory; set DATA_DIR "
        "to the folder containing goemotions1.csv, goemotions2.csv and goemotions3.csv."
    ) from err

FileNotFoundError: [Errno 2] No such file or directory: './data/goemotions1.csv'

In [None]:
# Show every column name of the first shard as a plain Python list.
df1.columns.tolist()

In [None]:
# The 28 label columns kept from each shard (27 emotions plus 'neutral').
# Defined once so the three shards cannot drift apart through copy-paste edits.
EMOTION_COLUMNS = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]

# Reduce every shard to the comment text followed by its label columns, in
# the same column order as before.
df1, df2, df3 = (
    frame.loc[:, ['text', *EMOTION_COLUMNS]] for frame in (df1, df2, df3)
)

In [None]:
# (rows, columns) of the first shard after column selection.
df1.shape

In [None]:
# Stack the three shards row-wise into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it every shard keeps the index it was
# loaded with, so the same row label would point at up to three different rows.
total_df = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# (rows, columns) of the combined frame, before dropping incomplete rows.
total_df.shape

In [None]:
# Discard every row that has a missing value in any of its columns.
total_df = total_df.dropna(axis=0, how='any')

In [None]:
# (rows, columns) of the combined frame after dropping incomplete rows.
total_df.shape

In [None]:
# Raw string literals keep the regex escapes (\[ \] \|) away from Python's own
# string-escape processing, which otherwise warns about invalid escape sequences.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')  # separators that become a space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')  # everything outside this whitelist is deleted
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one comment string.

    Steps, in order:
      1. lowercase the text (must happen first: BAD_SYMBOLS_RE only
         whitelists lowercase letters);
      2. replace each character matched by REPLACE_BY_SPACE_RE with a space;
      3. delete each character matched by BAD_SYMBOLS_RE;
      4. drop whitespace-separated words that are in STOPWORDS.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned comment; surviving words are joined by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split()/join also collapses any runs of whitespace left by the substitutions.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [None]:
# Peek at a few raw comments (positions 10 through 15) before cleaning.
total_df['text'].iloc[10:16]

In [None]:
# Normalise every comment with clean_text, then strip the digit runs that
# clean_text deliberately leaves in place.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, so '\d+' would never match and the
# digits would silently stay in the text.
total_df['text'] = total_df['text'].apply(clean_text)
total_df['text'] = total_df['text'].str.replace(r'\d+', '', regex=True)

In [None]:
# Same rows as the earlier preview (positions 10 through 15), now cleaned.
total_df['text'].iloc[10:16]

In [None]:
# Vocabulary cap passed to the Keras Tokenizer as num_words.
MAX_NB_WORDS = 50000

# Length every encoded comment is padded/truncated to by pad_sequences.
MAX_SEQUENCE_LENGTH = 250

# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
EMBEDDING_DIM = 100

# The filter set is written as a raw string so the backslash it contains is
# taken literally instead of starting an (invalid) '\]' escape sequence; the
# characters passed to Tokenizer are unchanged.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(total_df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Encode each cleaned comment as a list of word ids, then pad/truncate every
# list to MAX_SEQUENCE_LENGTH so the result is one rectangular array.
X = pad_sequences(
    tokenizer.texts_to_sequences(total_df['text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

In [None]:
# Display the column index of the combined frame (text plus label columns).
total_df.columns

In [None]:
# Label matrix: one row per comment, one float32 column per emotion, in the
# column order listed here.
Y = total_df[[
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride',
    'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]].astype(np.float32).to_numpy()
print('Shape of label tensor:', Y.shape)

In [None]:
# Hold out 10% of the comments. The split is made on the cleaned text column
# (not on the padded id matrix X); the seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    total_df['text'],
    Y,
    test_size=0.10,
    random_state=42,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# First cleaned comment, to eyeball against the first label row.
total_df['text'].head(1)

In [None]:
# First row of the label matrix.
Y[:1]

In [None]:
with strategy.scope():
    # Pretrained DistilBERT backbone, left trainable so that training updates
    # its weights along with the layers stacked on top of it later.
    encoder = keras_nlp.models.DistilBertBackbone.from_preset("distil_bert_base_en_uncased")
    encoder.trainable = True

    # Preprocessor from the same preset, configured for a sequence length of 256.
    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
        "distil_bert_base_en_uncased", sequence_length=256
    )

In [7]:
with strategy.scope():
    # Raw strings in -> DistilBERT preprocessor -> encoder -> mean over the
    # sequence axis -> dropout -> one output unit per emotion column.
    txt = tf.keras.layers.Input(shape=(), dtype=tf.string)
    x = preprocessor(txt)
    x = encoder(x)
    # NOTE(review): this mean is taken over every sequence position; if padded
    # positions should be excluded, pass the preprocessor's padding mask — confirm.
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    # Sigmoid, not softmax: the targets are independent 0/1 flags per emotion
    # (a comment may carry several emotions, or none), so each unit predicts
    # its own probability instead of competing for a single share of 1.0.
    x = tf.keras.layers.Dense(Y_train.shape[1], activation='sigmoid')(x)
    model = tf.keras.Model(inputs=[txt], outputs=x)

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    # Per-label binary accuracy, still reported under the name 'accuracy' so
    # history/log keys keep their names. With 28 mostly-zero labels this number
    # is high by construction; read it together with the loss.
    metrics = [tf.keras.metrics.BinaryAccuracy('accuracy', dtype=tf.float32)]
    # Binary cross-entropy is the loss that matches sigmoid outputs on
    # multi-hot targets; categorical cross-entropy assumes exactly one true
    # class per row and contributes zero loss for rows with no label set.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model.compile(optimizer, loss, metrics=metrics)

In [None]:
# Training hyper-parameters: batch size and number of epochs passed to fit().
BATCH_SIZE = 128
EPOCHS = 5

# Save weights only, and only when the validation loss reaches a new minimum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoints/DistilBERT',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

# Write training logs for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs/DistilBERT',
)

In [None]:
def make_dataset(texts, labels, shuffle=False):
    """Build a batched, prefetched tf.data pipeline of (text, label-row) pairs.

    Args:
        texts: Iterable of comment strings (e.g. the pandas Series returned by
            train_test_split).
        labels: Array with one label row per comment, aligned with ``texts``.
        shuffle: When True, shuffle the examples (fixed seed) before batching.

    Returns:
        A ``tf.data.Dataset`` yielding batches of BATCH_SIZE examples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((list(texts), labels))
    if shuffle:
        # Buffer as large as the data set gives a full shuffle.
        dataset = dataset.shuffle(buffer_size=len(labels), seed=42)
    return dataset.batch(BATCH_SIZE).prefetch(AUTO)


# `datasets` was referenced by fit() but never defined in this notebook, so a
# fresh Restart & Run All stopped here with a NameError. Build it from the
# existing split. The 10% hold-out doubles as validation data because the
# notebook defines no separate validation split.
datasets = {
    'train': make_dataset(X_train, Y_train, shuffle=True),
    'validation': make_dataset(X_test, Y_test),
}

with strategy.scope():
    history = model.fit(datasets['train'], validation_data=datasets['validation'], 
                    epochs=EPOCHS, verbose=2, 
                    callbacks=[model_checkpoint_callback, tensorboard_callback])