<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 27 (delta 14), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (27/27), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

The data seems imbalanced. Let us deal with it after a bit

In [9]:
# Let us look at the sentences

data['content'][0]

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['

In [10]:
data['content'][1]

'Layin n bed with a headache  ughhhh...waitin on your call...'

Text Preprocessing

In [11]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
def text_preprocess(dataset,list_name):
  
  for i in range(dataset.shape[0]):
    list_name.append(re.sub('[^a-zA-Z]',' ',str(dataset.iloc[i,1])))

  print("Number and other symbols eliminated from the text")

  # String spacing 
  for x in range(len(list_name)):
    list_name[x] = " ".join(y for y in str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")
  
  for index in range(len(list_name)):
    temp_list= []
    # Lemmatization
    for word in list_name[index].split():
      if word not in stopwords.words('english'):
        temp_list.append(word)
    list_name[index] = " ".join(lemmatizer.lemmatize(words) for words in temp_list )

In [13]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [14]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

Applying one hot encoding '

In [15]:
from keras.preprocessing.text import one_hot

In [16]:
p_data.head()

Unnamed: 0,Content,sentiment
0,tiffanylue know listenin bad habit earlier sta...,empty
1,layin n bed headache ughhhh waitin call,sadness
2,funeral ceremony gloomy friday,sadness
3,want hang friend soon,enthusiasm
4,dannycastillo want trade someone houston ticke...,neutral


In [17]:
# List conatining sentences

sentences = [sent for sent in p_data["Content"]]

In [18]:
# Unique words

unique_word_list = []

for index in range(len(sentences)):
  [unique_word_list.append(w) for w in sentences[index].split()]


unique_words = list(set(unique_word_list))

print(len(unique_words))

42763


In [19]:
# Vocabulary_size 

vocab_size = len(unique_words)

In [20]:
one_hot_encoder = [one_hot(words, vocab_size) for words in sentences]

In [21]:
one_hot_encoder[0]

[28341, 16310, 11285, 32128, 626, 2994, 20353, 34621, 26035]

Padding

In [22]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# Finidng the sentence with maximum length

max_length = 0

for sent in sentences:
  if len(sent) > max_length:
    max_length = len(sent)

In [24]:
max_length

133

In [25]:
embedded_docs = pad_sequences(one_hot_encoder, padding='pre', maxlen=max_length)

In [26]:
embedded_docs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0, 28341, 16310,
       11285, 32128,

Converting values into vector features

In [35]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Model, Sequential

In [28]:
vector_dimension = 50

In [29]:
model = Sequential()

Embedding layer

In [31]:
model.add(Embedding(vocab_size, output_dim = vector_dimension,input_length=max_length))
model.compile(optimizer='adam', metrics='mse')

In [None]:
predictions = []

from tqdm import tqdm

predictions.append(np.mean([model.predict(doc) for doc in tqdm(embedded_docs)], axis=0))



 19%|█▉        | 7756/40000 [12:44<57:26,  9.36it/s]  



 19%|█▉        | 7757/40000 [12:44<59:22,  9.05it/s]



 19%|█▉        | 7758/40000 [12:44<1:01:22,  8.76it/s]



 19%|█▉        | 7759/40000 [12:45<1:03:38,  8.44it/s]



 19%|█▉        | 7760/40000 [12:45<1:05:05,  8.26it/s]



 19%|█▉        | 7761/40000 [12:45<1:05:49,  8.16it/s]



 19%|█▉        | 7762/40000 [12:45<1:06:40,  8.06it/s]



 19%|█▉        | 7763/40000 [12:45<1:05:44,  8.17it/s]



 19%|█▉        | 7764/40000 [12:45<1:05:23,  8.22it/s]



 19%|█▉        | 7765/40000 [12:45<1:05:44,  8.17it/s]



 19%|█▉        | 7766/40000 [12:45<1:06:32,  8.07it/s]



 19%|█▉        | 7767/40000 [12:46<1:10:30,  7.62it/s]



 19%|█▉        | 7768/40000 [12:46<1:11:39,  7.50it/s]



 19%|█▉        | 7769/40000 [12:46<1:10:57,  7.57it/s]



 19%|█▉        | 7770/40000 [12:46<1:09:31,  7.73it/s]



 19%|█▉        | 7771/40000 [12:46<1:10:19,  7.64it/s]



 19%|█▉        | 7772/40000 [12:46<1:09:48,  7.69it/s]



 19%|█▉        | 7774/40000 [12:46<1:04:30,  8.33it/s]



 19%|█▉        | 7776/40000 [12:47<57:28,  9.35it/s]  



 19%|█▉        | 7778/40000 [12:47<57:37,  9.32it/s]



 19%|█▉        | 7779/40000 [12:47<1:00:38,  8.86it/s]



 19%|█▉        | 7780/40000 [12:47<1:02:00,  8.66it/s]



 19%|█▉        | 7782/40000 [12:47<59:54,  8.96it/s]  



 19%|█▉        | 7784/40000 [12:48<58:36,  9.16it/s]



 19%|█▉        | 7785/40000 [12:48<1:02:48,  8.55it/s]



 19%|█▉        | 7786/40000 [12:48<1:03:35,  8.44it/s]



 19%|█▉        | 7788/40000 [12:48<1:01:28,  8.73it/s]



 19%|█▉        | 7789/40000 [12:48<1:04:15,  8.35it/s]



 19%|█▉        | 7790/40000 [12:48<1:05:42,  8.17it/s]



 19%|█▉        | 7791/40000 [12:48<1:07:25,  7.96it/s]



 19%|█▉        | 7792/40000 [12:49<1:07:04,  8.00it/s]



 19%|█▉        | 7793/40000 [12:49<1:03:51,  8.41it/s]



 19%|█▉        | 7794/40000 [12:49<1:04:29,  8.32it/s]



 19%|█▉        | 7795/40000 [12:49<1:04:37,  8.31it/s]



 19%|█▉        | 7796/40000 [12:49<1:06:07,  8.12it/s]



 19%|█▉        | 7797/40000 [12:49<1:06:33,  8.06it/s]



 19%|█▉        | 7799/40000 [12:49<58:56,  9.11it/s]  



 20%|█▉        | 7800/40000 [12:49<59:42,  8.99it/s]



 20%|█▉        | 7801/40000 [12:50<1:00:51,  8.82it/s]



 20%|█▉        | 7802/40000 [12:50<1:02:53,  8.53it/s]



 20%|█▉        | 7803/40000 [12:50<1:04:51,  8.27it/s]



 20%|█▉        | 7805/40000 [12:50<1:01:56,  8.66it/s]



 20%|█▉        | 7806/40000 [12:50<1:03:32,  8.44it/s]



 20%|█▉        | 7808/40000 [12:50<54:44,  9.80it/s]  



 20%|█▉        | 7810/40000 [12:51<52:59, 10.12it/s]



 20%|█▉        | 7811/40000 [12:51<55:27,  9.67it/s]



 20%|█▉        | 7812/40000 [12:51<56:48,  9.44it/s]



 20%|█▉        | 7813/40000 [12:51<59:58,  8.94it/s]



 20%|█▉        | 7814/40000 [12:51<1:01:18,  8.75it/s]



 20%|█▉        | 7816/40000 [12:51<59:08,  9.07it/s]  



 20%|█▉        | 7817/40000 [12:51<1:00:20,  8.89it/s]



 20%|█▉        | 7818/40000 [12:51<1:02:09,  8.63it/s]



 20%|█▉        | 7819/40000 [12:52<1:02:40,  8.56it/s]



 20%|█▉        | 7820/40000 [12:52<1:07:01,  8.00it/s]



 20%|█▉        | 7822/40000 [12:52<59:00,  9.09it/s]  



 20%|█▉        | 7823/40000 [12:52<1:00:29,  8.86it/s]



 20%|█▉        | 7824/40000 [12:52<1:01:54,  8.66it/s]



 20%|█▉        | 7825/40000 [12:52<1:02:30,  8.58it/s]



 20%|█▉        | 7826/40000 [12:52<1:04:48,  8.27it/s]



 20%|█▉        | 7827/40000 [12:53<1:06:20,  8.08it/s]



 20%|█▉        | 7829/40000 [12:53<1:02:57,  8.52it/s]



 20%|█▉        | 7830/40000 [12:53<1:05:20,  8.21it/s]



 20%|█▉        | 7831/40000 [12:53<1:05:33,  8.18it/s]



 20%|█▉        | 7832/40000 [12:53<1:06:11,  8.10it/s]



 20%|█▉        | 7833/40000 [12:53<1:09:10,  7.75it/s]



 20%|█▉        | 7835/40000 [12:53<58:50,  9.11it/s]  



 20%|█▉        | 7837/40000 [12:54<57:43,  9.28it/s]



 20%|█▉        | 7839/40000 [12:54<53:31, 10.01it/s]



 20%|█▉        | 7841/40000 [12:54<51:30, 10.41it/s]



 20%|█▉        | 7843/40000 [12:54<56:38,  9.46it/s]



 20%|█▉        | 7845/40000 [12:54<53:39,  9.99it/s]



 20%|█▉        | 7847/40000 [12:55<53:48,  9.96it/s]



 20%|█▉        | 7849/40000 [12:55<57:27,  9.32it/s]



 20%|█▉        | 7851/40000 [12:55<58:12,  9.20it/s]



 20%|█▉        | 7852/40000 [12:55<1:00:00,  8.93it/s]



 20%|█▉        | 7853/40000 [12:55<59:15,  9.04it/s]  



 20%|█▉        | 7854/40000 [12:55<1:00:48,  8.81it/s]



 20%|█▉        | 7856/40000 [12:56<55:48,  9.60it/s]  



 20%|█▉        | 7858/40000 [12:56<52:19, 10.24it/s]



 20%|█▉        | 7860/40000 [12:56<54:06,  9.90it/s]



 20%|█▉        | 7861/40000 [12:56<58:22,  9.18it/s]



 20%|█▉        | 7862/40000 [12:56<1:00:04,  8.91it/s]



 20%|█▉        | 7863/40000 [12:56<1:00:38,  8.83it/s]



 20%|█▉        | 7864/40000 [12:57<1:01:19,  8.73it/s]



 20%|█▉        | 7865/40000 [12:57<1:02:25,  8.58it/s]



 20%|█▉        | 7867/40000 [12:57<54:22,  9.85it/s]  



 20%|█▉        | 7868/40000 [12:57<56:50,  9.42it/s]



 20%|█▉        | 7870/40000 [12:57<55:33,  9.64it/s]



 20%|█▉        | 7871/40000 [12:57<57:15,  9.35it/s]



 20%|█▉        | 7872/40000 [12:57<1:00:53,  8.79it/s]



 20%|█▉        | 7874/40000 [12:58<55:12,  9.70it/s]  



 20%|█▉        | 7875/40000 [12:58<59:28,  9.00it/s]



 20%|█▉        | 7876/40000 [12:58<1:01:32,  8.70it/s]



 20%|█▉        | 7877/40000 [12:58<1:02:24,  8.58it/s]



 20%|█▉        | 7878/40000 [12:58<1:03:39,  8.41it/s]



 20%|█▉        | 7879/40000 [12:58<1:05:49,  8.13it/s]



 20%|█▉        | 7880/40000 [12:58<1:07:23,  7.94it/s]



 20%|█▉        | 7881/40000 [12:58<1:06:58,  7.99it/s]



 20%|█▉        | 7882/40000 [12:59<1:08:24,  7.82it/s]



 20%|█▉        | 7884/40000 [12:59<57:12,  9.36it/s]  



 20%|█▉        | 7885/40000 [12:59<59:13,  9.04it/s]



 20%|█▉        | 7886/40000 [12:59<1:03:49,  8.39it/s]



 20%|█▉        | 7888/40000 [12:59<56:56,  9.40it/s]  



 20%|█▉        | 7889/40000 [12:59<58:48,  9.10it/s]



 20%|█▉        | 7891/40000 [13:00<59:23,  9.01it/s]



 20%|█▉        | 7892/40000 [13:00<1:02:36,  8.55it/s]



 20%|█▉        | 7893/40000 [13:00<1:03:54,  8.37it/s]



 20%|█▉        | 7895/40000 [13:00<58:26,  9.16it/s]  



 20%|█▉        | 7896/40000 [13:00<1:00:06,  8.90it/s]



 20%|█▉        | 7897/40000 [13:00<1:03:20,  8.45it/s]



 20%|█▉        | 7898/40000 [13:00<1:04:50,  8.25it/s]



 20%|█▉        | 7900/40000 [13:01<56:05,  9.54it/s]  



 20%|█▉        | 7901/40000 [13:01<59:49,  8.94it/s]



 20%|█▉        | 7902/40000 [13:01<58:23,  9.16it/s]



 20%|█▉        | 7903/40000 [13:01<1:00:10,  8.89it/s]



 20%|█▉        | 7905/40000 [13:01<54:45,  9.77it/s]  



 20%|█▉        | 7907/40000 [13:01<55:50,  9.58it/s]



 20%|█▉        | 7908/40000 [13:01<58:42,  9.11it/s]



 20%|█▉        | 7909/40000 [13:02<58:00,  9.22it/s]



 20%|█▉        | 7910/40000 [13:02<59:34,  8.98it/s]



 20%|█▉        | 7911/40000 [13:02<58:38,  9.12it/s]



 20%|█▉        | 7912/40000 [13:02<58:13,  9.18it/s]



 20%|█▉        | 7913/40000 [13:02<56:59,  9.38it/s]



In [45]:
model.predict(embedded_docs)



KeyboardInterrupt: ignored

LSTM RNN MODEL

In [33]:
f_model = Sequential()

In [36]:
f_model.add(LSTM(100))
f_model.add(Dropout(0.1))
f_model.add(LSTM(100))
f_model.add(Dense(13, activation='softmax'))

f_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')

In [None]:
vector_docs = model.predict(embedded_docs)



Text preprocessing done

Converting text to vectors 

Word2vec

In [None]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Words list

words_list = []

# looping through to append words
for index in range(len(sentences)):
  words_list.append(nltk.word_tokenize(sentences[index]))

print(len(words_list)," length of sentences")

40000  length of sentences


In [None]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [None]:
# Empty list 
X = []

# Looping though words
for words in tqdm(words_list):
  X.append(np.mean([model.wv[word] for word in words if word in model.wv.index2word], axis=0))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 40000/40000 [00:14<00:00, 2721.69it/s]


In [None]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

preprocessed_data = pd.concat([pd.DataFrame(np.array(X)),pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
preprocessed_data.shape

(40000, 2)

In [None]:
# eliminating missing values 

preprocessed_data.isna().any()

0             True
sentiment    False
dtype: bool

In [None]:
# Dropping misisng value

preprocessed_data.dropna(inplace=True)

In [37]:
# Assinging feature and label

#X = preprocessed_data[0]
y = p_data['sentiment']

In [38]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

In [39]:
y = le.fit_transform(y)

In [40]:
#X = np.array(X)

NameError: ignored

In [41]:
from keras.utils import to_categorical

y = to_categorical(y)

Train test split

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.1,random_state=101)

In [None]:
train_X.shape

(35776,)

In [None]:
train_y.shape

(35776, 13)

In [None]:
test_X.shape

(3976,)

In [None]:
test_y.shape

(3976, 13)

.

## Bi-directional LSTM RNN 

Implementing Bi-directional Long short term Memory recurrent neural network 

In [None]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout
from keras.models import Model, Sequential

42763


Sequential API

In [None]:
classes = len(data['sentiment'].unique())

print(classes)

13


In [None]:
model = Sequential()

In [None]:
X[0].shape

(100,)

In [None]:
model.add(Embedding(input_dim = len(unique_words),output_dim = 1,input_length= 100))
model.add(LSTM(100))
model.add(Dense(classes, activation = 'softmax'))

In [None]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 35776, 100)        4276300   
                                                                 
 lstm_4 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 13)                1313      
                                                                 
 embedding_5 (Embedding)     (None, 13, 100)           4276300   
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 13)                1313      
                                                                 
 embedding_6 (Embedding)     (None, 13, 1)            

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(train_X, train_y, epochs=3)

ValueError: ignored

In [None]:
preprocessed_data["sentiment"].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
min(test_y)

0

In [None]:
train_X.shape

(35776, 100)

In [None]:
train_X.dtype

dtype('O')

In [None]:
train_X = np.array([np.array(val).astype('float64') for val in train_X])
train_y = np.array([np.array(val).astype('float64') for val in train_y])
test_X = np.array([np.array(val).astype('float64') for val in test_X])
test_y = np.array([np.array(val).astype('float64') for val in test_y])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
model.fit(train_X, train_y, epochs=10)

ValueError: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier()

In [None]:
classifier.fit(train_X, train_y)

ValueError: ignored

In [None]:
train_X = np.array(train_X)

In [None]:
train_y.dtype

dtype('int64')

In [None]:
classifier.fit(train_X, train_y)

ValueError: ignored

## Machine learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_classifier = RandomForestClassifier()

In [None]:
# converting it into array

train_X = list(train_X)

train_X = np.array(train_X)

In [None]:
rf_classifier.fit(train_X, train_y)

RandomForestClassifier()

In [None]:
predictions = rf_classifier.predict(np.array(list(test_X)))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
score = accuracy_score(test_y, predictions)

In [None]:
score

0.2623239436619718

In [None]:
test_y

array([11,  8, 12, ..., 12,  8, 10])

In [None]:
predictions

array([ 8,  8, 12, ...,  8,  8,  8])