<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 57 (delta 34), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (57/57), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Funeral ceremony...gloomy friday...

In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

Since the data is imbalanced, we'll be deadling with it 

Data Imbalance

### Eliminating the last two categories of sentiment as they are least represented. 

In [9]:
# dropping the last two samples

# Appending indexes to remove
indexes_to_remove = []


for index in data[data['sentiment']=="boredom"].index:
  indexes_to_remove.append(index)

for index in data[data['sentiment']=="anger"].index:
  indexes_to_remove.append(index)

In [10]:
len(indexes_to_remove)

289

In [11]:
data.drop(indexes_to_remove, inplace=True, axis=0)

In [12]:
data["sentiment"].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
Name: sentiment, dtype: int64

In [13]:
labels = [label for label in data["sentiment"].unique()]

In [14]:
balanced_df = pd.DataFrame()

for label in labels: 
  balanced_df = pd.concat([data[data["sentiment"]==label].sample(759),balanced_df], axis=0)

In [15]:
balanced_df["sentiment"].value_counts()

relief        759
happiness     759
hate          759
fun           759
love          759
surprise      759
worry         759
neutral       759
enthusiasm    759
sadness       759
empty         759
Name: sentiment, dtype: int64

 Now we have a balanced dataset

In [16]:
# shuffling samples and resetting indexes

balanced_df = balanced_df.sample(len(balanced_df))

In [17]:
balanced_df.reset_index(inplace=True)

In [18]:
balanced_df.head()

Unnamed: 0,index,sentiment,content
0,2970,surprise,Woke up and there was sun! And then it started...
1,31759,enthusiasm,"today was boring, but i did a lot of homework...."
2,16458,empty,Ugh... I definitely speak too soon...
3,29281,neutral,Gonna go watch Jackson's band play and then go...
4,18646,sadness,Has been a long day - Matinee and evening show...


In [19]:
balanced_df.drop("index", inplace=True, axis=1)

In [20]:
# Changing the name of the data frame

data = balanced_df

In [21]:
# Let us look at the sentences

data['content'][0]

'Woke up and there was sun! And then it started to rain'

In [22]:
data['content'][1]

"today was boring, but i did a lot of homework. tomorrow should be amazing, i can't wait."

Text Preprocessing

In [23]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [24]:
def text_preprocess(dataset,list_name):
  
  for i in range(dataset.shape[0]):
    list_name.append(re.sub('[^a-zA-Z]',' ',str(dataset.iloc[i,1])))

  print("Number and other symbols eliminated from the text")

  # String spacing 
  for x in range(len(list_name)):
    list_name[x] = " ".join(y for y in str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")
  
  for index in range(len(list_name)):
    temp_list= []
    # Lemmatization
    for word in list_name[index].split():
      if word not in stopwords.words('english'):
        temp_list.append(word)
    list_name[index] = " ".join(lemmatizer.lemmatize(words) for words in temp_list )

In [25]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [26]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

In [27]:
p_data.head()

Unnamed: 0,Content,sentiment
0,woke sun started rain,surprise
1,today boring lot homework tomorrow amazing wait,enthusiasm
2,ugh definitely speak soon,empty
3,gonna go watch jackson band play going band party,neutral
4,long day matinee evening show sad tomorrow las...,sadness


Text preprocessing done

## One hot Encoding and Padding sequences 

Reference : https://phdstatsphys.wordpress.com/2018/12/27/word2vec-how-to-train-and-update-it/

In [37]:
import gensim.downloader as api
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
google_word2vec = api.load('word2vec-google-news-300')



In [71]:
!pip install wget

import wget

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [74]:
!wget -c 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'

--2022-11-29 23:10:10--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.11.164, 52.216.212.184, 52.216.162.117, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.11.164|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2022-11-29 23:10:10 ERROR 404: Not Found.



In [58]:
from nltk import sent_tokenize

In [59]:
from tqdm import tqdm

corpus = []

for sent in tqdm(sentences):
  corpus.append(sent.split())

100%|██████████| 8349/8349 [00:00<00:00, 12589.43it/s]


In [60]:
# Importing other libraries

from gensim.models import Word2Vec

In [61]:
word2vec_model = Word2Vec(size = 300, window=5,min_count = 1, workers = 2)

In [62]:
word2vec_model.build_vocab(corpus)

In [None]:
word2vec_model.intersect_word2vec_format(google_word2vec, lockf=1.0, binary=True)

In [None]:
word2vec_model.train(sentences,epochs = 5)

Converting text to vectors 

Word2vec

In [None]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Words list

words_list = []

# looping through to append words
for index in range(len(sentences)):
  words_list.append(nltk.word_tokenize(sentences[index]))

print(len(words_list)," length of sentences")

8349  length of sentences


In [None]:
empty_lists = []

for i,wl in enumerate(words_list):
  if not wl:
    empty_lists.append(i)

print("The number of empty lists are: ", len(empty_lists))

The number of empty lists are:  2


Since there are 21 empty lists. We will combine them with the labels and drop the 21 rows

In [None]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

preprocessed_data = pd.concat([pd.DataFrame(np.array(words_list)),pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Checking for null values 

preprocessed_data.isna().any()

0            False
sentiment    False
dtype: bool

In [None]:
# We have empty lists that we have to get rid of and we have the indexes of those lists store in empty_lists list

# Verifying elemnts from the list

for indexes in empty_lists:
  print(preprocessed_data.iloc[indexes,0])

[]
[]


There we go, our empty lists. 

In [None]:
preprocessed_data.drop(empty_lists, axis=0, inplace=True)

In [None]:
word_lists = [lists for lists in preprocessed_data.iloc[:,0]]

In [None]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [None]:
# Empty list 
X = []

# Looping though words
for words in tqdm(word_lists):
  X.append(np.mean([model.wv[word] for word in words if word in model.wv.index2word], axis=0))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 8347/8347 [00:01<00:00, 4737.44it/s]


In [None]:
# Coverting them to arrays

X = np.array(X)
y = preprocessed_data['sentiment']

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
labels = []
corresponding_num = []

for ind,lab in enumerate(y.unique()):
  labels.append(lab)
  corresponding_num.append(ind)

In [None]:
encodings = [val for val in y]

In [None]:


for i,value in enumerate(encodings):
  for ind,unique in enumerate(labels):
    if value==unique:
      encodings[i] = ind

In [None]:

encodings = np.array(encodings)

In [None]:



y = encodings

Checking types

In [None]:
# Converting all the arrays to same data type

X = np.array([val.astype(np.float64) for val in X])

  This is separate from the ipykernel package so we can avoid doing imports until


Checking for null values in the array

In [None]:
pd.DataFrame(X).isna().sum()

0    75
dtype: int64

Found 227 null values

In [None]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

vector_data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)

In [None]:
vector_data.head()

Unnamed: 0,0,0.1
0,"[0.5185931921005249, -0.2783828675746918, -0.1...",0
1,"[0.3185077905654907, -0.1711403876543045, -0.1...",1
2,"[0.39131638407707214, -0.21000224351882935, -0...",2
3,"[0.24084103107452393, -0.13017770648002625, -0...",1
4,"[0.23725177347660065, -0.12525340914726257, -0...",3


In [None]:
vector_data.isna().any()

0     True
0    False
dtype: bool

Dropping all the null values

In [None]:
vector_data.dropna(inplace=True)

In [None]:
vector_data.shape

(8272, 2)

In [None]:
X = np.array([feat for feat in vector_data.iloc[:,0]])
y = np.array([label for label in vector_data.iloc[:,1]])

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y, train_size = 0.93, random_state= 12)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
gnb = GaussianNB()

gnb.fit(train_X, train_y)

GaussianNB()

In [None]:
predictions = gnb.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
score = accuracy_score(test_y, predictions)

In [None]:
print("And the final score is ...... ..... ...", score)

And the final score is ...... ..... ... 0.11896551724137931


In [None]:
train_X.shape

(7692, 100)

In [None]:
train_y.shape

(7692,)

Converting to categories

In [None]:
from keras.utils import to_categorical

In [None]:
train_y = to_categorical(train_y,13)

In [None]:
# Covertiing test_y to binary 
test_y = to_categorical(test_y,13)

## TERM FREQUENCY - INVERSE DOCUMENT FREQUENCY

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
p_data.head()

Unnamed: 0,Content,sentiment
0,supersense ooooo explanation thank god u would...,love
1,last day senior bye bff,sadness
2,noiselesssound heard regina girl song le deux ...,sadness
3,someone said wolverine feel like watching x me...,fun
4,shanselman still class loader even custom asse...,empty


In [None]:
X = p_data['Content']
y= p_data["sentiment"]

In [None]:
labels = []
corresponding_num = []

encodings = [val for val in y]

for ind,lab in enumerate(y.unique()):
  labels.append(lab)
  corresponding_num.append(ind)

for i,value in enumerate(encodings):
  for ind,unique in enumerate(labels):
    if value==unique:
      encodings[i] = ind

encodings = np.array(encodings)

y = encodings

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y , test_size=0.11, random_state=10)

In [None]:
train_sent = [sent for sent in train_X]
test_sent = [sent for sent in test_X]

In [None]:
train_X = vectorizer.fit_transform(train_sent)
test_X = vectorizer.transform(test_sent)

In [None]:
sentences[0]

'supersense ooooo explanation thank god u would forever wondering love good night mare tho'

In [None]:
# Covertiing test_y to binary 
from keras.utils import to_categorical


train_y = to_categorical(train_y,13)

In [None]:
test_y = to_categorical(test_y,13)

In [None]:
train_X.shape

(7430, 13539)

## LSTM RNN MODEL

Implementing Bi-directional Long short term Memory recurrent neural network 

In [None]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout, CuDNNLSTM, GRU
from keras.models import Model, Sequential

In [None]:
train_X.shape[1:]

(100,)

The fluctuations are normal within certain limits and depend on the fact that you use a heuristic method but in your case they are excessive. Despite all the performance takes a definite direction and therefore the system works. From the graphs you have posted, the problem depends on your data so it's a difficult training. If you have already tried to change the learning rate try to change training algorithm. You would agree to test your data: first compute the Bayes error rate using a KNN (use the trick regression in case you need), in this way you can check whether the input data contain all the information you need. Then try the LSTM without the validation or dropout to verify that it has the ability to achieve the result for you necessary. If the training algorithm is not suitable you should have the same problems even without the validation or dropout. Just at the end adjust the training and the validation size to get the best result in the test set. Statistical learning theory is not a topic that can be talked about at one time, we must proceed step by step.


source :https://stats.stackexchange.com/questions/345990/why-does-the-loss-accuracy-fluctuate-during-the-training-keras-lstm

In [None]:
input = Input(shape=(100,1))
lstm = GRU(3, return_sequences=True)(input)
dropout = Dropout(0.2)(lstm)
lstm2 = GRU(3, return_sequences=True)(dropout)
dropout2 = Dropout(0.2)(lstm2)
flatten= Flatten()(dropout2)
prediction = Dense(13, activation="softmax")(flatten)

In [None]:
# Model

model = Model(inputs = input, outputs = prediction)

In [None]:
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100, 1)]          0         
                                                                 
 gru (GRU)                   (None, 100, 3)            54        
                                                                 
 dropout_8 (Dropout)         (None, 100, 3)            0         
                                                                 
 gru_1 (GRU)                 (None, 100, 3)            72        
                                                                 
 dropout_9 (Dropout)         (None, 100, 3)            0         
                                                                 
 flatten_1 (Flatten)         (None, 300)               0         
                                                                 
 dense_4 (Dense)             (None, 13)                3913

In [None]:
# Setting the learning rate for the optimizer. 

adam_optimizer = keras.optimizers.Adam(learning_rate=1e-3, decay=1e-6)

# Compiling the model

model.compile(optimizer=adam_optimizer, loss="categorical_crossentropy", metrics="accuracy")

Keras callbacks

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
patience = EarlyStopping(patience=200)

save_best = ModelCheckpoint("lstm_model.h5", save_best_only=True)

In [None]:
train_X.shape

(7692, 100)

In [None]:
train_y.shape

(7692, 13)

In [None]:
test_X.shape

(580, 100)

In [None]:
test_y.shape

(580, 13)

In [None]:
model.fit(train_X, train_y, validation_data=(test_X,test_y),epochs=250,callbacks=[save_best, patience])

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250

KeyboardInterrupt: ignored

In [None]:
sentences

['msstacy well thanks thinking ever get scratch one well right',
 'totally forgot thats friday till read alyssanoelled tweet ha feel dumb take breath',
 'take antibacterial school clean hand cant go loo',
 'ever stepped slug accident hate bug',
 'treeincally got adjusting mate online shop zzzzzzzzzzzzzzz boring',
 'allyycase trying sleep working',
 'jackfm sulking free day',
 'much amazing day pervert ruined',
 'maariiaan hahahah yeah right plus twitteraddict find well almost day likely',
 'darenyeow oh wow really good think im going use one lol',
 'dear allergy hate please go away love always victim',
 'skoosie probably hate happens',
 'even arnold save park http bit ly wsm j providing job',
 'carlkr please wear glass next video look amazing',
 'bummed even one testimonial flickr',
 'dammit slept even le weekend enjoy azeroth bbl',
 'abeen good question nepal pm declaring resign actually resigning two different thing',
 'hope tonight okay night',
 'emmagriffiths community see one mode