In [8]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
STOPWORDS = set(stopwords.words('english'))

In [9]:
df = pd.concat([pd.read_csv("train.txt",sep=';',header=None),pd.read_csv("test.txt",sep=';',header=None),pd.read_csv("val.txt",sep=';',header=None)])
df = df.reset_index()
del df['index']
df.columns = ['text','label']
df

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
19995,im having ssa examination tomorrow in the morn...,sadness
19996,i constantly worry about their fight against n...,joy
19997,i feel its important to share this info for th...,joy
19998,i truly feel that if you are passionate enough...,joy


In [10]:
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    text = [lemmatizer.lemmatize(word,pos="v") for word in text.split() if word not in STOPWORDS]
    text= ' '.join(text)
    text = [lemmatizer.lemmatize(word,pos="a") for word in text.split() if word not in STOPWORDS]
    text= ' '.join(text)
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    return text
df['text'] = df['text'].apply(clean_text)

In [11]:
df['text'] = df['text'].str.replace('\d+', '')
df['text']

0                                     didnt feel humiliate
1        go feel hopeless damn hopeful around someone c...
2                    im grab minute post feel greedy wrong
3        ever feel nostalgic fireplace know still property
4                                             feel grouchy
                               ...                        
19995    im ssa examination tomorrow morning im quite w...
19996    constantly worry fight nature push limit inner...
19997           feel important share info experience thing
19998    truly feel passionate enough something stay tr...
19999    feel like wanna buy cute make see online even one
Name: text, Length: 20000, dtype: object

In [12]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 13518 unique tokens.


In [13]:
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (20000, 250)


In [14]:
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (20000, 6)


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(18000, 250) (18000, 6)
(2000, 250) (2000, 6)


In [16]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 250, 100)          5000000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 250, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 6)                 606       
Total params: 5,081,006
Trainable params: 5,081,006
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
epochs = 10
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
#model.fit(X_train,Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=callbacks_list,shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [18]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.302
  Accuracy: 0.909


In [41]:
new_complaint = ['i am anxious of COVID-19']
new_complaint = [clean_text(new_complaint[0])]
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['anger','fear','joy','love','sadness','surprise']
print(pred, labels[np.argmax(pred)])

[[0.0439792  0.6529806  0.00621751 0.01418804 0.00707484 0.2755597 ]] fear


In [46]:
model.save('emotionClassifierMain.h5')

In [49]:
from tensorflow.keras.models import load_model
models = load_model('emotionClassifierMain.h5')

new_complaint = ['i am anxious of COVID-19']
new_complaint = [clean_text(new_complaint[0])]
seq = tokenizer.texts_to_sequences(new_complaint)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = models.predict(padded)
labels = ['anger','fear','joy','love','sadness','surprise']
print(pred, labels[np.argmax(pred)])

[[0.0439792  0.6529806  0.00621751 0.01418804 0.00707484 0.2755597 ]] fear


In [48]:
models.weights

[<tf.Variable 'embedding/embeddings:0' shape=(50000, 100) dtype=float32, numpy=
 array([[-0.01057449,  0.03210204,  0.01201031, ..., -0.00078739,
          0.03969959, -0.03208946],
        [ 0.07459956, -0.12140439,  0.08571819, ...,  0.02012362,
         -0.09786146,  0.20183416],
        [ 0.00773674,  0.04811166,  0.01266558, ..., -0.00181367,
         -0.06616006, -0.07472518],
        ...,
        [-0.00136031, -0.04992164, -0.01082174, ..., -0.0189304 ,
         -0.00041965, -0.03148166],
        [-0.01161108, -0.0490936 , -0.02135062, ...,  0.00335604,
          0.03284726,  0.02408278],
        [ 0.00160886,  0.04714615, -0.0274642 , ...,  0.045414  ,
         -0.03262911,  0.00295988]], dtype=float32)>,
 <tf.Variable 'lstm/lstm_cell_5/kernel:0' shape=(100, 400) dtype=float32, numpy=
 array([[ 1.3527183e-01, -1.8261538e-01,  1.3329285e-01, ...,
          1.8107904e-01, -8.3155960e-02,  1.5517287e-01],
        [-1.1723689e-01,  6.8732217e-02, -2.8288853e-01, ...,
         -1.43