<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 36 (delta 20), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (36/36), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

The data seems imbalanced. Let us deal with it after a bit

In [9]:
# Let us look at the sentences

data['content'][0]

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['

In [10]:
data['content'][1]

'Layin n bed with a headache  ughhhh...waitin on your call...'

Text Preprocessing

In [11]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
def text_preprocess(dataset,list_name):
  
  for i in range(dataset.shape[0]):
    list_name.append(re.sub('[^a-zA-Z]',' ',str(dataset.iloc[i,1])))

  print("Number and other symbols eliminated from the text")

  # String spacing 
  for x in range(len(list_name)):
    list_name[x] = " ".join(y for y in str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")
  
  for index in range(len(list_name)):
    temp_list= []
    # Lemmatization
    for word in list_name[index].split():
      if word not in stopwords.words('english'):
        temp_list.append(word)
    list_name[index] = " ".join(lemmatizer.lemmatize(words) for words in temp_list )

In [13]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [14]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

**WORD2VEC MANUAL**

Applying one hot encoding '

In [None]:
from keras.preprocessing.text import one_hot

In [None]:
p_data.head()

Unnamed: 0,Content,sentiment
0,tiffanylue know listenin bad habit earlier sta...,empty
1,layin n bed headache ughhhh waitin call,sadness
2,funeral ceremony gloomy friday,sadness
3,want hang friend soon,enthusiasm
4,dannycastillo want trade someone houston ticke...,neutral


In [None]:
# List conatining sentences

sentences = [sent for sent in p_data["Content"]]

In [None]:
# Unique words

unique_word_list = []

for index in range(len(sentences)):
  [unique_word_list.append(w) for w in sentences[index].split()]


unique_words = list(set(unique_word_list))

print(len(unique_words))

42763


In [None]:
# Vocabulary_size 

vocab_size = len(unique_words)

In [None]:
one_hot_encoder = [one_hot(words, vocab_size) for words in sentences]

In [None]:
one_hot_encoder[0]

[37387, 32288, 33984, 12232, 17768, 4014, 42550, 34624, 26440]

Padding

In [None]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Finidng the sentence with maximum length

max_length = 0

for sent in sentences:
  if len(sent) > max_length:
    max_length = len(sent)

In [None]:
max_length

133

In [None]:
embedded_docs = pad_sequences(one_hot_encoder, padding='pre', maxlen=max_length)

In [None]:
embedded_docs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0, 37387, 32288,
       33984, 12232,

Converting values into vector features

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Model, Sequential

In [None]:
vector_dimension = 50

In [None]:
model = Sequential()

Embedding layer and LSTM RNN Model

In [None]:
model.add(Embedding(vocab_size, output_dim = vector_dimension,input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(13, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')

In [None]:
from keras.callbacks import EarlyStopping

checkpoint = EarlyStopping(patience = 20)

In [None]:
history = model.fit(embedded_docs, y, validation_split=0.15, epochs = 100,callbacks=checkpoint)

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


(40000, 133, 50)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
model = GaussianNB()

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.1,random_state=101)

In [None]:
model.fit(train_X, train_y)

ValueError: ignored

In [None]:
train_X = np.array(train_X)

  """Entry point for launching an IPython kernel.


In [None]:
train_X = np.array([x.reshape(-1,1) for x in train_X])

  """Entry point for launching an IPython kernel.


In [None]:
predictions = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(test_y, predictions)

0.019

3

Text preprocessing done

Converting text to vectors 

Word2vec

In [15]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Words list

words_list = []

# looping through to append words
for index in range(len(sentences)):
  words_list.append(nltk.word_tokenize(sentences[index]))

print(len(words_list)," length of sentences")

40000  length of sentences


In [17]:
empty_lists = []

for i,wl in enumerate(words_list):
  if not wl:
    empty_lists.append(i)

print("The number of empty lists are: ", len(empty_lists))

The number of empty lists are:  21


Since there are 21 empty lists. We will combine them with the labels and drop the 21 rows

In [18]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

preprocessed_data = pd.concat([pd.DataFrame(np.array(words_list)),pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Checking for null values 

preprocessed_data.isna().any()

0            False
sentiment    False
dtype: bool

In [20]:
# We have empty lists that we have to get rid of and we have the indexes of those lists store in empty_lists list

# Verifying elemnts from the list

for indexes in empty_lists:
  print(preprocessed_data.iloc[indexes,0])

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


There we go, our empty lists. 

In [21]:
preprocessed_data.drop(empty_lists, axis=0, inplace=True)

In [22]:
word_lists = [lists for lists in preprocessed_data.iloc[:,0]]

In [23]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [24]:
# Empty list 
X = []

# Looping though words
for words in tqdm(word_lists):
  X.append(np.mean([model.wv[word] for word in words if word in model.wv.index2word], axis=0))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 39979/39979 [00:20<00:00, 1951.62it/s]


In [25]:
# Coverting them to arrays

X = np.array(X)
y = preprocessed_data['sentiment']

  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
labels = []
corresponding_num = []

for ind,lab in enumerate(y.unique()):
  labels.append(lab)
  corresponding_num.append(ind)

In [27]:
encodings = [val for val in y]

In [28]:
for i,value in enumerate(encodings):
  for ind,unique in enumerate(labels):
    if value==unique:
      encodings[i] = ind

In [29]:
encodings = np.array(encodings)

In [30]:
y = encodings

Checking types

In [31]:
# Converting all the arrays to same data type

X = np.array([val.astype(np.float64) for val in X])

  This is separate from the ipykernel package so we can avoid doing imports until


Checking for null values in the array

In [32]:
pd.DataFrame(X).isna().sum()

0    227
dtype: int64

Found 227 null values

In [33]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

vector_data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)

In [34]:
vector_data.head()

Unnamed: 0,0,0.1
0,"[0.12428990006446838, -4.5006163418293e-05, 0....",0
1,"[0.16568677127361298, -0.06962722539901733, 0....",1
2,"[0.07699213922023773, -0.009849236346781254, -...",1
3,"[0.18733398616313934, -0.05504029616713524, 0....",2
4,"[0.14208655059337616, -0.011926676146686077, 0...",3


In [35]:
vector_data.isna().any()

0     True
0    False
dtype: bool

Dropping all the null values

In [36]:
vector_data.dropna(inplace=True)

In [37]:
vector_data.shape

(39752, 2)

In [149]:
X = np.array([feat for feat in vector_data.iloc[:,0]])
y = np.array([label for label in vector_data.iloc[:,1]])

In [150]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y, train_size = 0.93, random_state= 101 )

In [41]:
from sklearn.naive_bayes import GaussianNB

In [42]:
gnb = GaussianNB()

gnb.fit(train_X, train_y)

GaussianNB()

In [43]:
predictions = gnb.predict(test_X)

In [44]:
from sklearn.metrics import accuracy_score

In [45]:
score = accuracy_score(test_y, predictions)

In [46]:
print("And the final score is ...... ..... ...", score)

And the final score is ...... ..... ... 0.05964786201940352


In [47]:
train_X.shape

(36969, 100)

In [48]:
train_y.shape

(36969,)

Converting to categories

In [151]:
from keras.utils import to_categorical

In [152]:
train_y = to_categorical(train_y,13)

## LSTM RNN MODEL

Implementing Bi-directional Long short term Memory recurrent neural network 

In [156]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout
from keras.models import Model, Sequential

In [157]:
train_X.shape[1:]

(100,)

In [158]:
input = Input(shape=(100,1))
lstm = LSTM(100, activation="relu", return_sequences=True)(input)
dropout = Dropout(0.15)(lstm)
flatten = Flatten()(dropout)
prediction = Dense(13, activation="softmax")(flatten)



In [159]:
# Model

model = Model(inputs = input, outputs = prediction)

In [160]:
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 100, 1)]          0         
                                                                 
 lstm_18 (LSTM)              (None, 100, 100)          40800     
                                                                 
 dropout_10 (Dropout)        (None, 100, 100)          0         
                                                                 
 flatten_1 (Flatten)         (None, 10000)             0         
                                                                 
 dense_15 (Dense)            (None, 13)                130013    
                                                                 
Total params: 170,813
Trainable params: 170,813
Non-trainable params: 0
_________________________________________________________________


In [161]:
# Compiling the model

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics="accuracy")

Keras callbacks

In [162]:
from keras.callbacks import EarlyStopping

In [163]:
callbacks = EarlyStopping(patience=20)

In [164]:
# Covertiing test_y to binary 

test_y = to_categorical(test_y,13)

In [165]:
train_X.shape

(36969, 100)

In [166]:
train_y.shape

(36969, 13)

In [167]:
test_X.shape

(2783, 100)

In [168]:
test_y.shape

(2783, 13)

In [None]:
model.fit(train_X, train_y, validation_data=(test_X,test_y),epochs=50,callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50