<a href="https://colab.research.google.com/github/abhi-11nav/Text-Emotion-Detection/blob/main/Text_Emotion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the necessary libraries 

import pandas as pd
import numpy as np 

In [2]:
# Cloning the github repository 

!git clone https://github.com/abhi-11nav/Text-Emotion-Detection.git

Cloning into 'Text-Emotion-Detection'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 33 (delta 18), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


In [3]:
# Importing data

data = pd.read_csv("/content/Text-Emotion-Detection/tweet_emotions.csv")

In [4]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [5]:
# Let us drop the tweet id

data.drop("tweet_id", axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [7]:
# Let us check if the tweet has any missing values 

data.isna().any()

sentiment    False
content      False
dtype: bool

No missing values

In [8]:
# Let us check the number of categories in sentiment variable

data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

The data seems imbalanced. Let us deal with it after a bit

In [9]:
# Let us look at the sentences

data['content'][0]

'@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =['

In [10]:
data['content'][1]

'Layin n bed with a headache  ughhhh...waitin on your call...'

Text Preprocessing

In [11]:
# Importing libraries

import re 

import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
def text_preprocess(dataset,list_name):
  
  for i in range(dataset.shape[0]):
    list_name.append(re.sub('[^a-zA-Z]',' ',str(dataset.iloc[i,1])))

  print("Number and other symbols eliminated from the text")

  # String spacing 
  for x in range(len(list_name)):
    list_name[x] = " ".join(y for y in str(list_name[x]).split()).lower()

  print("Text reorganized and converted to small letter")
  
  for index in range(len(list_name)):
    temp_list= []
    # Lemmatization
    for word in list_name[index].split():
      if word not in stopwords.words('english'):
        temp_list.append(word)
    list_name[index] = " ".join(lemmatizer.lemmatize(words) for words in temp_list )

In [13]:
sentences = []

text_preprocess(data,sentences)

Number and other symbols eliminated from the text
Text reorganized and converted to small letter


In [14]:
p_data = pd.concat([pd.DataFrame(np.array(sentences), columns=["Content"]), data['sentiment']], axis=1)

**WORD2VEC MANUAL**

Applying one hot encoding '

In [None]:
from keras.preprocessing.text import one_hot

In [None]:
p_data.head()

Unnamed: 0,Content,sentiment
0,tiffanylue know listenin bad habit earlier sta...,empty
1,layin n bed headache ughhhh waitin call,sadness
2,funeral ceremony gloomy friday,sadness
3,want hang friend soon,enthusiasm
4,dannycastillo want trade someone houston ticke...,neutral


In [None]:
# List conatining sentences

sentences = [sent for sent in p_data["Content"]]

In [None]:
# Unique words

unique_word_list = []

for index in range(len(sentences)):
  [unique_word_list.append(w) for w in sentences[index].split()]


unique_words = list(set(unique_word_list))

print(len(unique_words))

42763


In [None]:
# Vocabulary_size 

vocab_size = len(unique_words)

In [None]:
one_hot_encoder = [one_hot(words, vocab_size) for words in sentences]

In [None]:
one_hot_encoder[0]

[37387, 32288, 33984, 12232, 17768, 4014, 42550, 34624, 26440]

Padding

In [None]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Finidng the sentence with maximum length

max_length = 0

for sent in sentences:
  if len(sent) > max_length:
    max_length = len(sent)

In [None]:
max_length

133

In [None]:
embedded_docs = pad_sequences(one_hot_encoder, padding='pre', maxlen=max_length)

In [None]:
embedded_docs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0, 37387, 32288,
       33984, 12232,

Converting values into vector features

In [None]:
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Model, Sequential

In [None]:
vector_dimension = 50

In [None]:
model = Sequential()

Embedding layer and LSTM RNN Model

In [None]:
model.add(Embedding(vocab_size, output_dim = vector_dimension,input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(13, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')

In [None]:
from keras.callbacks import EarlyStopping

checkpoint = EarlyStopping(patience = 20)

In [None]:
history = model.fit(embedded_docs, y, validation_split=0.15, epochs = 100,callbacks=checkpoint)

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


(40000, 133, 50)

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
model = GaussianNB()

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.1,random_state=101)

In [None]:
model.fit(train_X, train_y)

ValueError: ignored

In [None]:
train_X = np.array(train_X)

  """Entry point for launching an IPython kernel.


In [None]:
train_X = np.array([x.reshape(-1,1) for x in train_X])

  """Entry point for launching an IPython kernel.


In [None]:
predictions = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(test_y, predictions)

0.019

3

Text preprocessing done

Converting text to vectors 

Word2vec

In [15]:
# Importing necessary libraries

import gensim

from gensim.models import Word2Vec

from tqdm import tqdm

from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Words list

words_list = []

# looping through to append words
for index in range(len(sentences)):
  words_list.append(nltk.word_tokenize(sentences[index]))

print(len(words_list)," length of sentences")

40000  length of sentences


In [28]:
empty_lists = []

for i,wl in enumerate(words_list):
  if not wl:
    empty_lists.append(i)

print("The number of empty lists are: ", len(empty_lists))

The number of empty lists are:  21


Since there are 21 empty lists. We will combine them with the labels and drop the 21 rows

In [25]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

preprocessed_data = pd.concat([pd.DataFrame(np.array(words_list)),pd.DataFrame(data['sentiment'])], axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [27]:
# Checking for null values 

preprocessed_data.isna().any()

0            False
sentiment    False
dtype: bool

In [31]:
# We have empty lists that we have to get rid of and we have the indexes of those lists store in empty_lists list

# Verifying elemnts from the list

for indexes in empty_lists:
  print(preprocessed_data.iloc[indexes,0])

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


There we go, our empty lists. 

In [32]:
preprocessed_data.drop(empty_lists, axis=0, inplace=True)

In [37]:
word_lists = [lists for lists in preprocessed_data.iloc[:,0]]

In [40]:
model = gensim.models.Word2Vec(words_list, window=5, min_count = 2)

In [51]:
# Empty list 
X = []

# Looping though words
for words in tqdm(word_lists):
  X.append(np.mean([model.wv[word] for word in words if word in model.wv.index2word], axis=0))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 39979/39979 [00:21<00:00, 1871.72it/s]


In [93]:
# Coverting them to arrays

X = np.array(X)
y = preprocessed_data['sentiment']

In [98]:
labels = []
corresponding_num = []

for ind,lab in enumerate(y.unique()):
  labels.append(lab)
  corresponding_num.append(ind)

In [109]:
encodings = [val for val in y]

In [110]:
for i,value in enumerate(encodings):
  for ind,unique in enumerate(labels):
    if value==unique:
      encodings[i] = ind

In [118]:
encodings = np.array(encodings)

In [119]:
y = encodings

Checking types

In [122]:
# Converting all the arrays to same data type

X = np.array([val.astype(np.float64) for val in X])

  This is separate from the ipykernel package so we can avoid doing imports until


Checking for null values in the array

In [141]:
pd.DataFrame(X).isna().sum()

0    227
dtype: int64

Found 227 null values

In [153]:
# Let us combine the dataset and get rid of any null values that may have occured after preprocessing

vector_data = pd.concat([pd.DataFrame(X),pd.DataFrame(y)], axis=1)

In [154]:
vector_data.head()

Unnamed: 0,0,0.1
0,"[-0.18124164640903473, 0.4027554988861084, 0.0...",0
1,"[-0.07959838956594467, 0.38952603936195374, 0....",1
2,"[-0.06298115849494934, 0.17759841680526733, -0...",1
3,"[-0.03169899806380272, 0.5318045616149902, 0.0...",2
4,"[-0.111288882791996, 0.4396958649158478, 0.060...",3


In [155]:
vector_data.isna().any()

0     True
0    False
dtype: bool

Dropping all the null values

In [156]:
vector_data.dropna(inplace=True)

In [157]:
vector_data.shape

(39752, 2)

In [159]:
X = np.array([feat for feat in vector_data.iloc[:,0]])
y = np.array([label for label in vector_data.iloc[:,1]])

In [164]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y, train_size = 0.93, random_state= 101 )

In [167]:
gnb.fit(train_X, train_y)

GaussianNB()

In [169]:
predictions = gnb.predict(test_X)

In [171]:
from sklearn.metrics import accuracy_score

In [172]:
score = accuracy_score(test_y, predictions)

In [173]:
print("And the final score is ...... ..... ...", score)

And the final score is ...... ..... ... 0.056054617319439455


In [174]:
test_y

array([5, 3, 4, ..., 9, 3, 9])

In [175]:
predictions

array([ 8,  0, 10, ...,  8, 10,  6])

In [None]:
=

(39752,)

Train test split

In [None]:
import tensorflow as tf

In [None]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X,y,test_size=0.1,random_state=101)

In [None]:
train_X.shape

(35776,)

In [None]:
train_y.shape

(35776, 13)

In [None]:
test_X.shape

(3976,)

In [None]:
test_y.shape

(3976, 13)

.

## Bi-directional LSTM RNN 

Implementing Bi-directional Long short term Memory recurrent neural network 

In [None]:
# Importing the necessary libraries

import tensorflow 
from tensorflow import keras

from keras.layers import Dense, Flatten, Input, LSTM, Bidirectional, Embedding, Dropout
from keras.models import Model, Sequential

42763


Sequential API

In [None]:
classes = len(data['sentiment'].unique())

print(classes)

13


In [None]:
model = Sequential()

In [None]:
X[0].shape

(100,)

In [None]:
model.add(Embedding(input_dim = len(unique_words),output_dim = 1,input_length= 100))
model.add(LSTM(100))
model.add(Dense(classes, activation = 'softmax'))

In [None]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 35776, 100)        4276300   
                                                                 
 lstm_4 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 13)                1313      
                                                                 
 embedding_5 (Embedding)     (None, 13, 100)           4276300   
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 13)                1313      
                                                                 
 embedding_6 (Embedding)     (None, 13, 1)            

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(train_X, train_y, epochs=3)

ValueError: ignored

In [None]:
preprocessed_data["sentiment"].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [None]:
min(test_y)

0

In [None]:
train_X.shape

(35776, 100)

In [None]:
train_X.dtype

dtype('O')

In [None]:
train_X = np.array([np.array(val).astype('float64') for val in train_X])
train_y = np.array([np.array(val).astype('float64') for val in train_y])
test_X = np.array([np.array(val).astype('float64') for val in test_X])
test_y = np.array([np.array(val).astype('float64') for val in test_y])

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
model.fit(train_X, train_y, epochs=10)

ValueError: ignored

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier()

In [None]:
classifier.fit(train_X, train_y)

ValueError: ignored

In [None]:
train_X = np.array(train_X)

In [None]:
train_y.dtype

dtype('int64')

In [None]:
classifier.fit(train_X, train_y)

ValueError: ignored

## Machine learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_classifier = RandomForestClassifier()

In [None]:
# converting it into array

train_X = list(train_X)

train_X = np.array(train_X)

In [None]:
rf_classifier.fit(train_X, train_y)

RandomForestClassifier()

In [None]:
predictions = rf_classifier.predict(np.array(list(test_X)))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
score = accuracy_score(test_y, predictions)

In [None]:
score

0.2623239436619718

In [None]:
test_y

array([11,  8, 12, ..., 12,  8, 10])

In [None]:
predictions

array([ 8,  8, 12, ...,  8,  8,  8])