<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 45 (delta 26), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (45/45), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Funeral ceremony...gloomy friday...

In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

The data seems imbalanced. Let us deal with it after a bit

In [9]:
# Let us look at the sentences

data['content'][0]

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['

In [10]:
data['content'][1]

'Layin n bed with a headache  ughhhh...waitin on your call...'

Text Preprocessing

In [11]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
def text_preprocess(dataset,list_name):
  
  for i in range(dataset.shape[0]):
    list_name.append(re.sub('[^a-zA-Z]',' ',str(dataset.iloc[i,1])))

  print("Number and other symbols eliminated from the text")

  # String spacing 
  for x in range(len(list_name)):
    list_name[x] = " ".join(y for y in str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")
  
  for index in range(len(list_name)):
    temp_list= []
    # Lemmatization
    for word in list_name[index].split():
      if word not in stopwords.words('english'):
        temp_list.append(word)
    list_name[index] = " ".join(lemmatizer.lemmatize(words) for words in temp_list )

In [13]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [14]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

**WORD2VEC MANUAL**

Applying one hot encoding '

In [None]:
from keras.preprocessing.text import one_hot

In [None]:
p_data.head()

Unnamed: 0,Content,sentiment
0,tiffanylue know listenin bad habit earlier sta...,empty
1,layin n bed headache ughhhh waitin call,sadness
2,funeral ceremony gloomy friday,sadness
3,want hang friend soon,enthusiasm
4,dannycastillo want trade someone houston ticke...,neutral


In [None]:
# List conatining sentences

sentences = [sent for sent in p_data["Content"]]

In [None]:
# Unique words

unique_word_list = []

for index in range(len(sentences)):
  [unique_word_list.append(w) for w in sentences[index].split()]


unique_words = list(set(unique_word_list))

print(len(unique_words))

42763


In [None]:
# Vocabulary_size 

vocab_size = len(unique_words)

In [None]:
one_hot_encoder = [one_hot(words, vocab_size) for words in sentences]

In [None]:
one_hot_encoder[0]

[37387, 32288, 33984, 12232, 17768, 4014, 42550, 34624, 26440]

Padding

In [None]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Finidng the sentence with maximum length

max_length = 0

for sent in sentences:
  if len(sent) > max_length:
    max_length = len(sent)

In [None]:
max_length

133

In [None]:
embedded_docs = pad_sequences(one_hot_encoder, padding='pre', maxlen=max_length)

In [None]:
embedded_docs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0, 37387, 32288,
       33984, 12232,

Converting values into vector features

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Model, Sequential

In [None]:
vector_dimension = 50

In [None]:
model = Sequential()

Embedding layer and LSTM RNN Model

In [None]:
model.add(Embedding(vocab_size, output_dim = vector_dimension,input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(13, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')

In [None]:
from keras.callbacks import EarlyStopping

checkpoint = EarlyStopping(patience = 20)

In [None]:
history = model.fit(embedded_docs, y, validation_split=0.15, epochs = 100,callbacks=checkpoint)

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


(40000, 133, 50)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
model = GaussianNB()

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.1,random_state=101)

In [None]:
model.fit(train_X, train_y)

ValueError: ignored

In [None]:
train_X = np.array(train_X)

  """Entry point for launching an IPython kernel.


In [None]:
train_X = np.array([x.reshape(-1,1) for x in train_X])

  """Entry point for launching an IPython kernel.


In [None]:
predictions = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(test_y, predictions)

0.019

3

Text preprocessing done

Converting text to vectors 

Word2vec

In [15]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Words list

words_list = []

# looping through to append words
for index in range(len(sentences)):
  words_list.append(nltk.word_tokenize(sentences[index]))

print(len(words_list)," length of sentences")

40000  length of sentences


In [17]:
empty_lists = []

for i,wl in enumerate(words_list):
  if not wl:
    empty_lists.append(i)

print("The number of empty lists are: ", len(empty_lists))

The number of empty lists are:  21


Since there are 21 empty lists. We will combine them with the labels and drop the 21 rows

In [18]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

preprocessed_data = pd.concat([pd.DataFrame(np.array(words_list)),pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Checking for null values 

preprocessed_data.isna().any()

0            False
sentiment    False
dtype: bool

In [20]:
# We have empty lists that we have to get rid of and we have the indexes of those lists store in empty_lists list

# Verifying elemnts from the list

for indexes in empty_lists:
  print(preprocessed_data.iloc[indexes,0])

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


There we go, our empty lists. 

In [21]:
preprocessed_data.drop(empty_lists, axis=0, inplace=True)

In [22]:
word_lists = [lists for lists in preprocessed_data.iloc[:,0]]

In [23]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [24]:
# Empty list 
X = []

# Looping though words
for words in tqdm(word_lists):
  X.append(np.mean([model.wv[word] for word in words if word in model.wv.index2word], axis=0))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 39979/39979 [00:20<00:00, 1946.58it/s]


In [25]:
# Coverting them to arrays

X = np.array(X)
y = preprocessed_data['sentiment']

  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
labels = []
corresponding_num = []

for ind,lab in enumerate(y.unique()):
  labels.append(lab)
  corresponding_num.append(ind)

In [27]:
encodings = [val for val in y]

In [28]:
for i,value in enumerate(encodings):
  for ind,unique in enumerate(labels):
    if value==unique:
      encodings[i] = ind

In [29]:
encodings = np.array(encodings)

In [30]:
y = encodings

Checking types

In [31]:
# Converting all the arrays to same data type

X = np.array([val.astype(np.float64) for val in X])

  This is separate from the ipykernel package so we can avoid doing imports until


Checking for null values in the array

In [32]:
pd.DataFrame(X).isna().sum()

0    227
dtype: int64

Found 227 null values

In [33]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

vector_data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)

In [34]:
vector_data.head()

Unnamed: 0,0,0.1
0,"[0.5506941080093384, 0.003065967932343483, -0....",0
1,"[0.4870327413082123, 0.017306378111243248, -0....",1
2,"[0.31976938247680664, -0.0656706765294075, -0....",1
3,"[0.7915036678314209, -0.07395879179239273, -0....",2
4,"[0.6090165972709656, 0.023950688540935516, -0....",3


In [35]:
vector_data.isna().any()

0     True
0    False
dtype: bool

Dropping all the null values

In [36]:
vector_data.dropna(inplace=True)

In [37]:
vector_data.shape

(39752, 2)

In [101]:
X = np.array([feat for feat in vector_data.iloc[:,0]])
y = np.array([label for label in vector_data.iloc[:,1]])

In [102]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y, train_size = 0.93, random_state= 12)

In [103]:
from sklearn.naive_bayes import GaussianNB

In [104]:
gnb = GaussianNB()

gnb.fit(train_X, train_y)

GaussianNB()

In [105]:
predictions = gnb.predict(test_X)

In [106]:
from sklearn.metrics import accuracy_score

In [107]:
score = accuracy_score(test_y, predictions)

In [108]:
print("And the final score is ...... ..... ...", score)

And the final score is ...... ..... ... 0.0693496227093065


In [109]:
train_X.shape

(36969, 100)

In [110]:
train_y.shape

(36969,)

Converting to categories

In [111]:
from keras.utils import to_categorical

In [112]:
train_y = to_categorical(train_y,13)

In [113]:
# Covertiing test_y to binary 

test_y = to_categorical(test_y,13)

## LSTM RNN MODEL

Implementing Bi-directional Long short term Memory recurrent neural network 

In [138]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout, CuDNNLSTM
from keras.models import Model, Sequential

In [139]:
train_X.shape[1:]

(100,)

The fluctuations are normal within certain limits and depend on the fact that you use a heuristic method but in your case they are excessive. Despite all the performance takes a definite direction and therefore the system works. From the graphs you have posted, the problem depends on your data so it's a difficult training. If you have already tried to change the learning rate try to change training algorithm. You would agree to test your data: first compute the Bayes error rate using a KNN (use the trick regression in case you need), in this way you can check whether the input data contain all the information you need. Then try the LSTM without the validation or dropout to verify that it has the ability to achieve the result for you necessary. If the training algorithm is not suitable you should have the same problems even without the validation or dropout. Just at the end adjust the training and the validation size to get the best result in the test set. Statistical learning theory is not a topic that can be talked about at one time, we must proceed step by step.


source :https://stats.stackexchange.com/questions/345990/why-does-the-loss-accuracy-fluctuate-during-the-training-keras-lstm

In [142]:
input = Input(shape=(100,1))
lstm = CuDNNLSTM(100, return_sequences=True)(input)
dropout = Dropout(0.2)(lstm)
lstm2 = CuDNNLSTM(100)(dropout)
dropout2 = Dropout(0.2)(lstm2)
lstm3 = Dense(100, activation="relu")(dropout2)
dropout3 = Dropout(0.2)(lstm3)
prediction = Dense(13, activation="softmax")(dropout3)

In [143]:
# Model

model = Model(inputs = input, outputs = prediction)

In [144]:
model.summary()

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 100, 1)]          0         
                                                                 
 cu_dnnlstm (CuDNNLSTM)      (None, 100, 100)          41200     
                                                                 
 dropout_17 (Dropout)        (None, 100, 100)          0         
                                                                 
 cu_dnnlstm_1 (CuDNNLSTM)    (None, 100)               80800     
                                                                 
 dropout_18 (Dropout)        (None, 100)               0         
                                                                 
 dense_17 (Dense)            (None, 100)               10100     
                                                                 
 dropout_19 (Dropout)        (None, 100)               0   

In [146]:
# Setting the learning rate for the optimizer. 

adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3, decay=1e-6)

# Compiling the model

model.compile(optimizer=adam_optimizer, loss="categorical_crossentropy", metrics="accuracy")

Keras callbacks

In [147]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [155]:
save_best = ModelCheckpoint("lstm_model.h5", save_best_only=True)

In [156]:
train_X.shape

(36969, 100)

In [157]:
train_y.shape

(36969, 13)

In [158]:
test_X.shape

(2783, 100)

In [159]:
test_y.shape

(2783, 13)

In [160]:
model.fit(train_X, train_y, validation_data=(test_X,test_y),epochs=1000,callbacks=[save_best])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

KeyboardInterrupt: ignored