In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

# Google Drive folder that holds the project files; file names are appended to this prefix.
project_path = '/content/drive/MyDrive/DS/NLP_Final_Project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer


from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model

from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout

from sklearn.metrics import classification_report,confusion_matrix, accuracy_score

Fixing random seed for reproducibility

In [None]:
# Seed NumPy and TensorFlow with one shared value so both random generators
# start from the same state on every run.
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

**Loading the Dataset**

In [None]:
# Read the preprocessed CSV from the project folder, then rename its two
# columns to 'content' (the text) and 'emotion' (the label).
dataset_file = project_path + 'preprocessed_data.csv'
df = pd.read_csv(dataset_file)
df.columns = ['content', 'emotion']

# Quick sanity check: overall shape and the first six rows.
print(df.shape)
print(df.head(6))

(44538, 2)
                                             content  emotion
0                              merci pour le partage  Neutral
1                 dorian gray rainbow scarf lovewins    Happy
2  replace wish artist uses next installation ent...    Happy
3  thank following back great hear diverse & amp ...    Happy
4                     beautiful jewel portrait r rex    Happy
5                              always loved painting    Happy


In [None]:
# Drop every row that has a null in any column; the result is stored under a new
# name so the originally loaded frame `df` stays untouched.
df_new= df.dropna(axis=0)
# Display the shape after dropping to see how many rows remain.
df_new.shape

(44473, 2)

In [None]:
# Hold out 35% of the cleaned rows; the fixed random_state keeps the split repeatable.
# The held-out part is split again into test and validation sets in a later cell.
train_df, test_df = train_test_split(df_new, test_size=0.35, random_state=42)
print(f'Train: {len(train_df)}; Test: {len(test_df)}')

Train: 28907; Test: 15566


Split the dataset into training, test, and validation sets

In [None]:
# Separate text and label columns, then carve the validation set out of the held-out frame.
# Training features/labels come straight from train_df.
X_train, y_train = train_df['content'], train_df['emotion']

# The held-out frame is divided 75% / 25% into test and validation parts.
X, y = test_df['content'], test_df['emotion']
X_test, X_valid, y_test, y_valid = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report the size of every split; features and labels must line up pairwise.
print(f'X_train: {len(X_train)}; X_test: {len(X_test)}; X_valid: {len(X_valid)}')
print(f'y_train: {len(y_train)}; y_test: {len(y_test)}; y_valid: {len(y_valid)}')

X_train: 28907; X_test: 11674; X_valid: 3892
y_train: 28907; y_test: 11674; y_valid: 3892


In [None]:
# Convert the emotion categories to a one-hot (binary class matrix) representation.
#
# The encoder is fitted on the training labels ONLY and then reused, via transform(),
# for the test and validation labels. Re-fitting on each split would let the column
# order and count depend on which categories happen to appear in that split, so the
# same column could stand for different emotions in different matrices.
# With the default handle_unknown setting, transform() raises if a split contains a
# label that was never seen in training, which surfaces the problem immediately.
#
# NOTE: this cell overwrites y_train / y_test / y_valid, so it must not be re-run
# without re-running the split cell first.
encoder = OneHotEncoder()
y_train = encoder.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_test = encoder.transform(np.array(y_test).reshape(-1, 1)).toarray()
y_valid = encoder.transform(np.array(y_valid).reshape(-1, 1)).toarray()

# Class names in the same order as the one-hot columns.
labels = encoder.categories_[0]

In [None]:
# Tokenize words
# The vocabulary is built from the training texts only. Fitting on the test texts as
# well would leak held-out data into the preprocessing step; words that appear only
# in the test/validation texts are mapped to the 'UNK' out-of-vocabulary token instead.
tokenizer = Tokenizer(oov_token='UNK')
tokenizer.fit_on_texts(X_train)

In [None]:
# Preview only the first entries of the word -> index mapping; displaying the whole
# dictionary floods the notebook output with the full vocabulary.
dict(list(tokenizer.word_index.items())[:20])

{'UNK': 1,
 'feel': 2,
 'feeling': 3,
 'like': 4,
 'im': 5,
 'really': 6,
 'know': 7,
 'time': 8,
 'little': 9,
 'get': 10,
 'people': 11,
 'would': 12,
 'one': 13,
 'want': 14,
 'think': 15,
 'still': 16,
 'even': 17,
 'ive': 18,
 'life': 19,
 'make': 20,
 'bit': 21,
 'much': 22,
 'love': 23,
 'something': 24,
 'things': 25,
 'going': 26,
 'could': 27,
 'day': 28,
 'way': 29,
 'back': 30,
 'go': 31,
 'see': 32,
 'good': 33,
 'pretty': 34,
 'need': 35,
 'always': 36,
 'today': 37,
 'right': 38,
 'also': 39,
 'work': 40,
 'say': 41,
 'feelings': 42,
 'feels': 43,
 'around': 44,
 'made': 45,
 'cant': 46,
 'though': 47,
 'well': 48,
 'got': 49,
 'felt': 50,
 'happy': 51,
 'never': 52,
 'help': 53,
 'days': 54,
 'didnt': 55,
 'quite': 56,
 'every': 57,
 'many': 58,
 'someone': 59,
 'look': 60,
 'less': 61,
 'makes': 62,
 'new': 63,
 'sure': 64,
 'last': 65,
 'find': 66,
 'enough': 67,
 'lot': 68,
 'away': 69,
 'left': 70,
 'anything': 71,
 'home': 72,
 'come': 73,
 'take': 74,
 'kind': 75,

In [None]:
# Map every text in each split to its list of vocabulary indexes,
# using the tokenizer fitted above.
sequences_train, sequences_test, sequences_val = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_test, X_valid)
)

In [None]:
# Inspect the index sequence produced for the first training text.
sequences_train[0]

[1264]

Decide the maximum text length so that every sequence can be padded or truncated to the same size.

In [None]:
def max_length(data):
    """Return the length of every item in ``data``, sorted in ascending order.

    Despite the name, this returns the full sorted list of lengths rather than
    a single maximum; callers take ``max()`` of the result themselves.

    Args:
        data: A sequence whose items support ``len()`` (e.g. strings or lists).

    Returns:
        list[int]: One length per item, smallest first.
    """
    return sorted(len(item) for item in data)

In [None]:
def _length_report(split_name, texts, threshold=500):
    """Print the longest item length in ``texts`` and how many items exceed ``threshold``.

    Args:
        split_name: Label used as the prefix of the printed lines (e.g. 'train').
        texts: List of items accepted by ``max_length``.
        threshold: Length above which an item is counted as "over".

    Returns:
        int: Number of items whose length is greater than ``threshold``.
    """
    lengths = max_length(texts)
    over_threshold = sum(1 for length in lengths if length > threshold)
    # default=0 keeps an empty split from raising inside max().
    print(f'{split_name} max', max(lengths, default=0))
    print(f'{split_name} over {threshold}', over_threshold)
    return over_threshold


# The raw text strings are measured here, so the lengths are CHARACTER counts.
# Every token has at least one character, so these are upper bounds on token counts.
#
# Each split is now checked against its own lengths: the previous validation loop
# indexed the test lengths (`y[i]`) by mistake. The intermediate names x / y / z are
# gone, so the label Series `y` from the split cell is no longer overwritten.
morethan_500_train = _length_report('train', X_train.to_list())
morethan_500_test = _length_report('test', X_test.to_list())
morethan_500_val = _length_report('val', X_valid.to_list())


train max 232
train over 500 0
test max 223
test over 500 0
val max 212
val over 500 0


The longest text is 232 characters and no text is longer than 500, so we set the maximum sequence length to 232.

In [None]:
# Bring every index sequence to one fixed length for modeling: shorter sequences get
# zeros appended, longer ones are cut at the end ('post' for both padding and truncating).
MAX_SEQUENCE_LENGTH = 232
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# NOTE: the padded validation array is stored as X_val; X_valid still holds the raw text.
X_train = pad_sequences(sequences_train, **pad_options)
X_test = pad_sequences(sequences_test, **pad_options)
X_val = pad_sequences(sequences_val, **pad_options)

In [None]:
# Show the first training sample before and after padding to confirm the fixed-length output.
print('Before padding and truncating',sequences_train[0], '\n', 'After Padding','\n',X_train[0])

Before padding and truncating [1264] 
 After Padding 
 [1264    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  

Word Embedding using Glove

We used Glove Embeddings, because they have a good representation of the words

In [None]:
# Read word vectors
# Each line of the GloVe text file is "<word> <v1> <v2> ...": the first field is the
# word, the remaining fields are its vector components.
word_embeddings = dict()

# `with` guarantees the file is closed even if parsing a line raises.
with open('glove.6B.200d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        embedding_vector = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = embedding_vector

print('Length of word embeddings is: %s' % len(word_embeddings))


Length of word embeddings is: 205389


Create an embedding matrix of zeros with one row per word in our vocabulary, each row of length 200 (the dimension of the GloVe vectors loaded above). Then replace the zero row of every word that has a GloVe vector with that vector; words without one keep an all-zero row.

In [None]:
# Number of rows needed in the embedding matrix: one per tokenizer entry plus one extra.
# Tokenizer indexes start at 1, and index 0 is the value pad_sequences fills in,
# so the +1 leaves room for row 0.
vocabSize = len(tokenizer.index_word) + 1
print(f"Vocabulary size = {vocabSize}")

Vocabulary size = 22269


In [None]:
# Fill the embedding matrix: row i receives the pre-trained vector of the word whose
# tokenizer index is i. Words without a pre-trained vector keep their all-zero row.
embedding_vector_length = 200
embedding_matrix = np.zeros((vocabSize, embedding_vector_length))

# Counters for how many vocabulary words were found / not found in the loaded vectors.
hits = misses = 0

for token, row in tokenizer.word_index.items():
    vector = word_embeddings.get(token)
    if vector is None:
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1

In [None]:
# Report how many vocabulary words received a pre-trained vector and how many did not.
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18674 words (3594 misses)


In [None]:
# Sanity check: the embedding-matrix row for 'one' should equal its pre-trained vector.
# The row is looked up through the tokenizer instead of a hard-coded number, so the
# check stays valid if the vocabulary (and therefore the index of 'one') changes.
one_index = tokenizer.word_index['one']
print(one_index)
print(word_embeddings['one'])
print(embedding_matrix[one_index])

13
[-5.2065e-02  3.8853e-01 -2.9030e-01 -2.3361e-01 -3.1615e-02  2.3060e-02
 -4.4796e-01  1.2344e-01 -6.5561e-04 -4.0855e-01  1.2563e-01  2.6825e-01
  3.1878e-01  3.5943e-01  1.3036e-01 -1.1321e-01 -1.7510e-01 -4.6895e-02
 -2.2966e-01 -1.9651e-01  1.4041e-01  3.3850e+00  4.7921e-01 -5.6123e-01
  2.6515e-01 -4.3217e-01 -6.4114e-02 -3.6190e-01 -7.9262e-02 -2.4169e-01
  1.4490e-03  1.7877e-01  2.2364e-01  1.2274e-01 -2.2854e-01 -6.2958e-02
 -6.5682e-01 -5.0515e-01 -4.3035e-01 -1.4219e-01 -3.9620e-01 -1.7886e-01
  1.9574e-01  3.2614e-01 -4.1829e-01  1.9406e-01  2.9966e-01 -2.8482e-02
 -1.8621e-01  6.5620e-02 -1.5799e-01 -4.2951e-02  1.4633e-01  5.3079e-01
  4.0991e-01  3.8414e-01 -3.6313e-02 -1.3173e-01 -1.0549e-01  4.0602e-01
 -8.8740e-02  9.3583e-02 -5.8678e-01 -2.7815e-02  2.5363e-02  1.8599e-01
 -4.0163e-02  9.5220e-02 -2.7638e-01 -1.6227e-02  3.1520e-01 -2.5334e-01
  1.2982e-01  2.4421e-01 -1.3057e-01  5.7597e-01 -3.8969e-01 -2.0226e-01
 -1.4958e-01 -3.0165e-01  1.6215e-01 -1.5015e-01

**Building RNN using bidirectional LSTM**

In [None]:
# Build neural network architecture
#
# Frozen pre-trained embedding -> three stacked bidirectional LSTMs -> softmax classifier.
# Changes from the previous version:
#   * The metric is registered as 'categorical_accuracy', so the logged names
#     ('categorical_accuracy' / 'val_categorical_accuracy') match what the callbacks
#     monitor and what the plotting cell reads from history.
#   * The output width and the embedding width are taken from the data (y_train and
#     embedding_matrix) instead of being hard-coded.
# NOTE(review): per the Keras LSTM documentation, recurrent_dropout > 0 rules out the
# cuDNN-accelerated kernel; confirm this dropout is wanted given the training time.

adam = Adam(learning_rate=0.005)

# One output unit per one-hot label column.
num_classes = y_train.shape[1]

model = Sequential()
# Embedding weights come from the pre-trained matrix and are not updated during training.
model.add(Embedding(vocabSize, embedding_matrix.shape[1], input_length=X_train.shape[1], weights=[embedding_matrix], trainable=False))
# return_sequences=True on the first two layers feeds the full sequence to the next LSTM.
model.add(Bidirectional(LSTM(256, dropout=0.2,recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2)))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['categorical_accuracy'])
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 232, 200)          4453800   
                                                                 
 bidirectional_3 (Bidirectio  (None, 232, 512)         935936    
 nal)                                                            
                                                                 
 bidirectional_4 (Bidirectio  (None, 232, 256)         656384    
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 256)              394240    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 7)                 1799      
                                                      

In [None]:
# Training callbacks, both driven by the validation accuracy (not the loss):
#   * EarlyStopping stops training after `patience` epochs without an improvement of at
#     least `min_delta`, and restores the weights of the best epoch.
#   * ModelCheckpoint writes the model to 'best_model.h5' only when the monitored value
#     improves (save_best_only=True).
# NOTE(review): both callbacks need a logged value named 'val_categorical_accuracy';
#   confirm the metric name passed to model.compile produces it.
# NOTE(review): patience=5 is larger than epochs=4 in the fit call, so early stopping
#   cannot trigger with the current settings.
callbacks = [EarlyStopping(monitor='val_categorical_accuracy', patience=5, min_delta=0.001, restore_best_weights=True),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_categorical_accuracy', save_best_only=True)]

In [None]:
# Train the model.
# Validation uses X_val, the padded index array. X_valid still holds the raw text
# Series and cannot be fed to the network.
# `callbacks` is already a list, so it is passed directly rather than wrapped in
# another list.
history = model.fit(X_train,
                    y_train,
                    validation_data=(X_val, y_valid),
                    verbose=1,
                    batch_size=32,
                    epochs=4,
                    callbacks=callbacks
                   )

Epoch 1/4
  1/904 [..............................] - ETA: 8:24:14 - loss: 1.9175 - accuracy: 0.4062

In [None]:
#prediction on train data
# Y_train_pred holds one row of class probabilities per sample.
Y_train_pred = model.predict(X_train)

# Reduce probabilities and one-hot labels to class indexes (position of the largest value).
y_train_pred = np.argmax(Y_train_pred, axis=1)
Y_train = np.argmax(y_train, axis=1)

# The report compares class indexes with class indexes. Previously the one-hot matrix
# and the raw probability matrix were passed, which classification_report cannot compare.
# `labels=` pins the index -> name mapping even if a class never occurs in this split.
print('Classification Report:\n\n',
      classification_report(Y_train, y_train_pred,
                            labels=np.arange(len(labels)),
                            target_names=labels))

In [None]:
#Plot training and validation accuracy along with loss
# The accuracy entry in history is named after the metric given to model.compile
# ('categorical_accuracy' or 'accuracy'); use whichever was logged instead of
# failing with a KeyError.
acc_key = 'categorical_accuracy' if 'categorical_accuracy' in history.history else 'accuracy'

# Explicit axes: accuracy on the left panel, loss on the right panel.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(20, 6))

for ax, key, label in ((acc_ax, acc_key, 'Accuracy'), (loss_ax, 'loss', 'Loss')):
    ax.plot(history.history[key])
    ax.plot(history.history['val_' + key])
    ax.set_title('Model ' + label.lower())
    ax.set_ylabel(label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Val'], loc='upper left')

plt.show()

In [None]:
#prediction on test data
# Y_test_pred holds one row of class probabilities per sample.
Y_test_pred = model.predict(X_test)

# Reduce probabilities and one-hot labels to class indexes (position of the largest value).
y_test_pred = np.argmax(Y_test_pred, axis=1)
Y_test = np.argmax(y_test, axis=1)

# The report compares class indexes with class indexes. Previously the one-hot matrix
# and the raw probability matrix were passed, which classification_report cannot compare.
# `labels=` pins the index -> name mapping even if a class never occurs in this split.
print('Classification Report:\n\n',
      classification_report(Y_test, y_test_pred,
                            labels=np.arange(len(labels)),
                            target_names=labels))