In [89]:
import modin.pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
import numpy as np

In [57]:
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stopword list from the NLTK corpus, stored as a set; clean_text tests each token against it.
STOPWORDS = set(stopwords.words('english'))

In [58]:
# Load the three GoEmotions CSV shards, one DataFrame per file.
df1, df2, df3 = map(
    pd.read_csv,
    (
        './data/goemotions1.csv',
        './data/goemotions2.csv',
        './data/goemotions3.csv',
    ),
)

In [59]:
# Show every column name in the first shard as a plain Python list.
df1.columns.tolist()

['text',
 'id',
 'author',
 'subreddit',
 'link_id',
 'parent_id',
 'created_utc',
 'rater_id',
 'example_very_unclear',
 'admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [60]:
# Keep the comment text plus the 28 emotion label columns; every other column is dropped.
df1 = df1[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]]

In [61]:
# Keep the comment text plus the 28 emotion label columns; every other column is dropped.
df2 = df2[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]]

In [62]:
# Keep the comment text plus the 28 emotion label columns; every other column is dropped.
df3 = df3[[
    'text',
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
    'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
    'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral',
]]

In [63]:
# Sanity check: (rows, columns) of the first shard after trimming to text + label columns.
df1.shape

(70000, 29)

In [64]:
# Stack the three shards into one frame. ignore_index=True renumbers the rows 0..n-1;
# without it each shard keeps its own 0-based index and the combined frame ends up with
# the same index label on several rows, which makes label-based lookups ambiguous.
total_df = pd.concat([df1, df2, df3], ignore_index=True)

In [65]:
# (rows, columns) of the combined frame built from the three shards.
total_df.shape

(211225, 29)

In [84]:
# Discard every row that has a missing value in any column.
total_df = total_df.dropna(axis=0, how='any')

In [85]:
# (rows, columns) after dropna(); compare with the earlier shape to see how many rows were removed.
total_df.shape

(211225, 29)

In [52]:
# Raw strings keep the backslashes literal for the regex engine; in a normal string
# "\[", "\]" and "\|" are invalid escape sequences that trigger a warning at compile time.
# Punctuation that separates words: each match is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase ASCII letter, space, '#', '+' or '_': each match is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Re-assigns the same NLTK English stopword set that the earlier NLTK import cell builds.
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw comment string before tokenization.

    Steps, in order:
      1. lowercase the text;
      2. replace every character matched by REPLACE_BY_SPACE_RE with a space;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. split on whitespace and drop tokens that appear in STOPWORDS.

    Returns the remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [66]:
# Peek at the first five raw comments before cleaning.
total_df['text'].head(5)

0                                      That game hurt.
1     >sexuality shouldn’t be a grouping category I...
2       You do right, if you don't care then fuck 'em!
3                                   Man I love reddit.
4    [NAME] was nowhere near them, he was by the Fa...
Name: text, dtype: object

In [67]:
# Normalize every comment with clean_text, then strip runs of digits.
# regex=True is required: without it recent pandas versions treat the pattern as the
# literal text "\d+" and leave all digits in place. The raw string also avoids the
# invalid "\d" escape sequence of a normal string literal.
total_df['text'] = (
    total_df['text']
    .apply(clean_text)
    .str.replace(r'\d+', '', regex=True)
)

In [69]:
# Peek at the same first five comments after cleaning.
total_df['text'].head(5)

0                                            game hurt
1    sexuality shouldnt grouping category makes dif...
2                              right dont care fuck em
3                                      man love reddit
4                             name nowhere near falcon
Name: text, dtype: object

In [75]:
# Vocabulary cap handed to the Tokenizer (num_words) and to the Embedding layer's input size.
MAX_NB_WORDS = 50000

# Every encoded comment is padded/truncated to this many token ids.
MAX_SEQUENCE_LENGTH = 250

# Width of each learned word vector.
EMBEDDING_DIM = 100

# The filter set is written as a raw string so the backslash is a literal filter character;
# in a normal string "\]" is an invalid escape sequence and triggers a warning.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(total_df['text'].values)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')

Found 32388 unique tokens.


In [76]:
# Encode each cleaned comment as a list of token ids, then pad/truncate all of them
# to MAX_SEQUENCE_LENGTH so they form one 2-D array.
X = pad_sequences(
    tokenizer.texts_to_sequences(total_df['text'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (211225, 250)


In [81]:
# Label matrix: every column after the first one ('text'), cast to float32.
Y = (
    total_df
    .iloc[:, 1:]
    .astype(np.float32)
    .values
)
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (211225, 28)


In [83]:
# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(190102, 250) (190102, 28)
(21123, 250) (21123, 28)


In [90]:
input = Input(shape=(MAX_SEQUENCE_LENGTH, ))
x = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(input)
x = SpatialDropout1D(0.2)(x)
x = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(x)
output = Dense(28, activation='softmax')(x)
model = tf.keras.Model(inputs=input, outputs=output)

In [91]:
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(), optimizer='adam', metrics=['accuracy'])

In [92]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 250)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 250, 100)          5000000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense_1 (Dense)             (None, 28)                2828      
                                                                 
Total params: 5,083,228
Trainable params: 5,083,228
Non-trainable params: 0
___________________________________________________