<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 54 (delta 32), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (54/54), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Funeral ceremony...gloomy friday...

In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

Since the data is imbalanced, we'll be dealing with it.

Data Imbalance

### Eliminating the last two categories of sentiment as they are least represented. 

In [9]:
# Dropping the two least-represented sentiment categories

# Gather the row labels of every "boredom" tweet followed by every "anger"
# tweet so that they can all be removed with a single drop call.
indexes_to_remove = []

for rare_sentiment in ("boredom", "anger"):
  rare_rows = data[data['sentiment'] == rare_sentiment]
  indexes_to_remove.extend(rare_rows.index)

In [10]:
len(indexes_to_remove)

289

In [11]:
data.drop(indexes_to_remove, inplace=True, axis=0)

In [12]:
data["sentiment"].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
Name: sentiment, dtype: int64

In [13]:
labels = [label for label in data["sentiment"].unique()]

In [14]:
# Down-sample every sentiment to the size of the rarest remaining class so
# that all classes are equally represented. The size is derived from the data
# instead of being hard-coded, and the fixed seed makes the draw reproducible.
samples_per_class = int(data["sentiment"].value_counts().min())

# Build all per-class samples first and concatenate once, rather than growing
# a DataFrame inside the loop.
balanced_df = pd.concat(
  [
    data[data["sentiment"] == label].sample(samples_per_class, random_state=42)
    for label in labels
  ],
  axis=0,
)

In [15]:
balanced_df["sentiment"].value_counts()

relief        759
happiness     759
hate          759
fun           759
love          759
surprise      759
worry         759
neutral       759
enthusiasm    759
sadness       759
empty         759
Name: sentiment, dtype: int64

 Now we have a balanced dataset

In [16]:
# shuffling samples and resetting indexes

# frac=1 draws every row exactly once (a full shuffle); the fixed seed keeps
# the resulting row order reproducible between runs.
balanced_df = balanced_df.sample(frac=1, random_state=42)

In [17]:
balanced_df.reset_index(inplace=True)

In [18]:
balanced_df.head()

Unnamed: 0,index,sentiment,content
0,25206,happiness,"@msstacy13 Well, thanks for thinking of me! An..."
1,15706,hate,I totally forgot thats its friday till I read ...
2,37973,worry,Take antibacterial to school to clean your han...
3,2209,hate,Have you ever stepped on a slug by accident? I...
4,12418,empty,@TreeinCally He's got adjusting his mate's on...


In [19]:
balanced_df.drop("index", inplace=True, axis=1)

In [20]:
# Changing the name of the data frame

data = balanced_df

In [21]:
# Let us look at the sentences

data['content'][0]

"@msstacy13 Well, thanks for thinking of me! And if you ever do get the scratch for one, well, I'm right here!"

In [22]:
data['content'][1]

"I totally forgot thats its friday till I read @AlyssaNoelleD's tweet. Ha. I feel dumb  *Take A Breath*"

Text Preprocessing

In [23]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [24]:
def text_preprocess(dataset, list_name, text_column=1):
  """Clean one text column of ``dataset`` and store the result in ``list_name``.

  The cleaning runs in three stages:
    1. every character that is not an ASCII letter is replaced by a space;
    2. runs of whitespace are collapsed and the text is lower-cased;
    3. English stop words are removed and the remaining words are lemmatized.

  Args:
    dataset: object exposing ``.iloc`` (e.g. a pandas DataFrame) that holds
      the raw text.
    list_name: list that receives one cleaned string per row. It is extended
      and modified in place; every entry in the list (including any that were
      already present) goes through stages 2 and 3.
    text_column: positional index of the column containing the text.
      Defaults to 1, the position used by the original implementation.

  Returns:
    None. The cleaned strings are delivered through ``list_name``.

  Relies on the module-level ``re``, ``stopwords`` and ``lemmatizer`` objects.
  """
  # Stage 1: keep letters only.
  for raw_text in dataset.iloc[:, text_column]:
    list_name.append(re.sub('[^a-zA-Z]', ' ', str(raw_text)))

  print("Number and other symbols eliminated from the text")

  # Stage 2: normalise spacing and case.
  for x in range(len(list_name)):
    list_name[x] = " ".join(str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")

  # The stop-word list is loaded once and turned into a set; fetching it
  # again for every single word dominated the running time.
  english_stopwords = set(stopwords.words('english'))

  # Stage 3: stop-word removal followed by lemmatization.
  for index in range(len(list_name)):
    kept_words = [word for word in list_name[index].split()
                  if word not in english_stopwords]
    list_name[index] = " ".join(lemmatizer.lemmatize(word) for word in kept_words)

In [25]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [26]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

In [27]:
p_data.head()

Unnamed: 0,Content,sentiment
0,msstacy well thanks thinking ever get scratch ...,happiness
1,totally forgot thats friday till read alyssano...,hate
2,take antibacterial school clean hand cant go loo,worry
3,ever stepped slug accident hate bug,hate
4,treeincally got adjusting mate online shop zzz...,empty


Text preprocessing done

Converting text to vectors 

Word2vec

In [28]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [29]:
# Tokenise every cleaned sentence into its list of words
words_list = [nltk.word_tokenize(sentence) for sentence in sentences]

print(len(words_list)," length of sentences")

8349  length of sentences


In [30]:
# Positions of the sentences that have no tokens left after preprocessing
empty_lists = [position for position, tokens in enumerate(words_list) if not tokens]

print("The number of empty lists are: ", len(empty_lists))

The number of empty lists are:  2


Some token lists are empty after preprocessing (2 in this run). We will combine the token lists with the labels and drop those rows.

In [31]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

# The token lists have different lengths. Calling np.array() on such ragged
# input is deprecated and fails on recent NumPy, so the column is built from
# an object Series instead. It is still labelled 0, as before.
tokens_column = pd.Series(words_list, dtype=object).to_frame(name=0)

preprocessed_data = pd.concat([tokens_column, pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
# Checking for null values 

preprocessed_data.isna().any()

0            False
sentiment    False
dtype: bool

In [33]:
# We have empty lists that we have to get rid of and we have the indexes of those lists store in empty_lists list

# Verifying elemnts from the list

for indexes in empty_lists:
  print(preprocessed_data.iloc[indexes,0])

[]
[]


There we go, our empty lists. 

In [34]:
preprocessed_data.drop(empty_lists, axis=0, inplace=True)

In [35]:
word_lists = [lists for lists in preprocessed_data.iloc[:,0]]

In [36]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [37]:
# One averaged word embedding per sentence
X = []

# Looping though words
for words in tqdm(word_lists):
  # Membership is tested on the keyed vectors themselves (a hash lookup)
  # instead of scanning the vocabulary list for every word.
  known_vectors = [model.wv[word] for word in words if word in model.wv]
  if known_vectors:
    X.append(np.mean(known_vectors, axis=0))
  else:
    # No word of this sentence is in the vocabulary: store the same NaN
    # scalar np.mean would return for an empty list, without the warnings.
    X.append(np.float64(np.nan))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 8347/8347 [00:01<00:00, 4737.44it/s]


In [38]:
# Converting them to arrays

# X mixes embedding vectors with NaN scalars, so it is stored as a 1-D object
# array filled element by element. np.array() on such ragged input is
# deprecated (an error on recent NumPy) and would return a 2-D array whenever
# no NaN entry is present.
sentence_vectors = X
X = np.empty(len(sentence_vectors), dtype=object)
for position, vector in enumerate(sentence_vectors):
  X[position] = vector

y = preprocessed_data['sentiment']

  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
# Distinct sentiments in order of first appearance, with their integer codes
labels = list(y.unique())
corresponding_num = list(range(len(labels)))

In [40]:
encodings = [val for val in y]

In [41]:


# Replace each label by its position in `labels`; values that are not in
# `labels` are left as they are.
label_to_code = {name: code for code, name in enumerate(labels)}

for position, value in enumerate(encodings):
  encodings[position] = label_to_code.get(value, value)

In [42]:

encodings = np.array(encodings)

In [43]:



y = encodings

Checking types

In [44]:
# Converting all the arrays to same data type

# Each entry (an embedding vector or a NaN scalar) is cast to float64 and
# written into a 1-D object array; np.array() on this ragged mix is
# deprecated and fails on recent NumPy.
X_float = np.empty(len(X), dtype=object)
for position, val in enumerate(X):
  X_float[position] = val.astype(np.float64)
X = X_float

  This is separate from the ipykernel package so we can avoid doing imports until


Checking for null values in the array

In [45]:
pd.DataFrame(X).isna().sum()

0    75
dtype: int64

Found 75 null values in this run (sentences in which no word is part of the Word2Vec vocabulary).

In [46]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

vector_data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)

In [47]:
vector_data.head()

Unnamed: 0,0,0.1
0,"[0.5185931921005249, -0.2783828675746918, -0.1...",0
1,"[0.3185077905654907, -0.1711403876543045, -0.1...",1
2,"[0.39131638407707214, -0.21000224351882935, -0...",2
3,"[0.24084103107452393, -0.13017770648002625, -0...",1
4,"[0.23725177347660065, -0.12525340914726257, -0...",3


In [48]:
vector_data.isna().any()

0     True
0    False
dtype: bool

Dropping all the null values

In [49]:
vector_data.dropna(inplace=True)

In [50]:
vector_data.shape

(8272, 2)

In [51]:
# Column 0 holds the sentence vectors, column 1 the encoded labels
X = np.array(vector_data.iloc[:,0].tolist())
y = np.array(vector_data.iloc[:,1].tolist())

In [52]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y, train_size = 0.93, random_state= 12)

In [53]:
from sklearn.naive_bayes import GaussianNB

In [54]:
gnb = GaussianNB()

gnb.fit(train_X, train_y)

GaussianNB()

In [55]:
predictions = gnb.predict(test_X)

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
score = accuracy_score(test_y, predictions)

In [58]:
print("And the final score is ...... ..... ...", score)

And the final score is ...... ..... ... 0.11896551724137931


In [59]:
train_X.shape

(7692, 100)

In [60]:
train_y.shape

(7692,)

Converting to categories

In [61]:
from keras.utils import to_categorical

In [62]:
# One-hot encode the integer training labels.
# NOTE(review): 13 is the number of sentiments in the raw data; after the
# "boredom" and "anger" rows are dropped only 11 remain. The width has to
# stay equal to the size of the network's softmax layer, so change both
# together. Re-running this cell one-hot encodes an already encoded array.
train_y = to_categorical(train_y,13)

In [63]:
# Converting test_y to one-hot vectors
# NOTE(review): the width (13) must match the one used for train_y and the
# size of the network's softmax layer.
test_y = to_categorical(test_y,13)

## TERM FREQUENCY - INVERSE DOCUMENT FREQUENCY

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
p_data.head()

Unnamed: 0,Content,sentiment
0,supersense ooooo explanation thank god u would...,love
1,last day senior bye bff,sadness
2,noiselesssound heard regina girl song le deux ...,sadness
3,someone said wolverine feel like watching x me...,fun
4,shanselman still class loader even custom asse...,empty


In [None]:
X = p_data['Content']
y= p_data["sentiment"]

In [None]:
# Integer-encode the sentiment labels in order of first appearance
labels = list(y.unique())
corresponding_num = list(range(len(labels)))

label_to_code = dict(zip(labels, corresponding_num))

encodings = np.array([label_to_code.get(val, val) for val in y])

y = encodings

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y , test_size=0.11, random_state=10)

In [None]:
train_sent = [sent for sent in train_X]
test_sent = [sent for sent in test_X]

In [None]:
train_X = vectorizer.fit_transform(train_sent)
test_X = vectorizer.transform(test_sent)

In [None]:
sentences[0]

'supersense ooooo explanation thank god u would forever wondering love good night mare tho'

In [None]:
# Covertiing test_y to binary 
from keras.utils import to_categorical


train_y = to_categorical(train_y,13)

In [None]:
test_y = to_categorical(test_y,13)

In [None]:
train_X.shape

(7430, 13539)

## LSTM RNN MODEL

Implementing a recurrent neural network. Note: the network built below stacks two GRU layers; the Bidirectional and LSTM layers are imported but not used.

In [93]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout, CuDNNLSTM, GRU
from keras.models import Model, Sequential

In [94]:
train_X.shape[1:]

(100,)

The fluctuations are normal within certain limits and depend on the fact that you use a heuristic method but in your case they are excessive. Despite all the performance takes a definite direction and therefore the system works. From the graphs you have posted, the problem depends on your data so it's a difficult training. If you have already tried to change the learning rate try to change training algorithm. You would agree to test your data: first compute the Bayes error rate using a KNN (use the trick regression in case you need), in this way you can check whether the input data contain all the information you need. Then try the LSTM without the validation or dropout to verify that it has the ability to achieve the result for you necessary. If the training algorithm is not suitable you should have the same problems even without the validation or dropout. Just at the end adjust the training and the validation size to get the best result in the test set. Statistical learning theory is not a topic that can be talked about at one time, we must proceed step by step.


source :https://stats.stackexchange.com/questions/345990/why-does-the-loss-accuracy-fluctuate-during-the-training-keras-lstm

In [95]:
# Two stacked GRU layers (3 units each, each followed by 20% dropout) and a
# softmax classifier on the flattened sequence output.
# The sequence length and the number of output units are read from the
# training arrays instead of being hard-coded, so the network always matches
# the data it is fitted on.
# NOTE: `input` shadows the Python builtin and `lstm`/`lstm2` actually hold
# GRU outputs; the names are kept because other cells may refer to them.
input = Input(shape=(train_X.shape[1], 1))
lstm = GRU(3, return_sequences=True)(input)
dropout = Dropout(0.2)(lstm)
lstm2 = GRU(3, return_sequences=True)(dropout)
dropout2 = Dropout(0.2)(lstm2)
flatten= Flatten()(dropout2)
# One output unit per column of the one-hot encoded targets.
prediction = Dense(train_y.shape[1], activation="softmax")(flatten)

In [96]:
# Model

model = Model(inputs = input, outputs = prediction)

In [97]:
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100, 1)]          0         
                                                                 
 gru (GRU)                   (None, 100, 3)            54        
                                                                 
 dropout_8 (Dropout)         (None, 100, 3)            0         
                                                                 
 gru_1 (GRU)                 (None, 100, 3)            72        
                                                                 
 dropout_9 (Dropout)         (None, 100, 3)            0         
                                                                 
 flatten_1 (Flatten)         (None, 300)               0         
                                                                 
 dense_4 (Dense)             (None, 13)                3913

In [98]:
# Setting the learning rate for the optimizer. 

adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3, decay=1e-6)

# Compiling the model
# `metrics` is given as a list, which is the form documented for Model.compile.

model.compile(optimizer=adam_optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

Keras callbacks

In [99]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [106]:
# Stop training once the validation loss has not improved for 5 consecutive
# epochs. The previous patience of 200 exceeded the number of epochs the
# model is trained for, so the callback could never trigger.
patience = EarlyStopping(monitor="val_loss", patience=5)

# Keep only the best model seen so far (lowest validation loss) on disk.
save_best = ModelCheckpoint("lstm_model.h5", monitor="val_loss", save_best_only=True)

In [101]:
train_X.shape

(7692, 100)

In [102]:
train_y.shape

(7692, 13)

In [103]:
test_X.shape

(580, 100)

In [104]:
test_y.shape

(580, 13)

In [107]:
# Train for up to 25 epochs with the checkpoint and early-stopping callbacks.
# NOTE(review): the held-out test split is passed as validation data, so the
# callbacks select the model using test samples; a separate validation split
# would keep the test set untouched.
# NOTE(review): `train_X`/`test_X` are also reassigned by the TF-IDF section
# earlier in the notebook, while this recurrent model is meant for the
# 100-dimensional Word2Vec features. Confirm which features are in memory
# before using "Run All".
model.fit(train_X, train_y, validation_data=(test_X,test_y),epochs=25,callbacks=[save_best, patience])

Epoch 1/25
Epoch 2/25
Epoch 3/25

KeyboardInterrupt: ignored