In [2]:
import pandas as pd
import numpy as np

In [3]:
# loading the dataset
df = pd.read_csv("/content/Corona_NLP_test.csv")
df.head()

Unnamed: 0,OriginalTweet,Sentiment
0,TRENDING: New Yorkers encounter empty supermar...,Negative
1,When I couldn't find hand sanitizer at Fred Me...,Positive
2,Find out how you can protect yourself and love...,Positive
3,#Panic buying hits #NewYork City as anxious sh...,Negative
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [4]:
df.shape

(3798, 2)

In [5]:
df.columns = ['text', 'sentiment']

My dataset is having the url at some rows in the text column.

i am creating a function to remove all the links since they will create nloise and extra dimensions in my dataset

In [6]:
def remove_link(text):
  return text.split('http')[0]

df['text'] = df['text'].apply(remove_link)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       3798 non-null   object
 1   sentiment  3798 non-null   object
dtypes: object(2)
memory usage: 59.5+ KB


In [8]:
#  checing for the duplicate entries
df.duplicated().sum()

np.int64(9)

In [9]:
# checking for the missing values
df.isna().sum()

Unnamed: 0,0
text,0
sentiment,0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       3798 non-null   object
 1   sentiment  3798 non-null   object
dtypes: object(2)
memory usage: 59.5+ KB


In [11]:
# classes distribution
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,1633
Positive,1546
Neutral,619


In [12]:
df['sentiment'].value_counts().shape

(3,)

# preprocessing

In [13]:
# encoding teh target column using label encoder
from sklearn.preprocessing import LabelEncoder
import pickle

In [14]:
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

In [15]:
df.head()

Unnamed: 0,text,sentiment,sentiment_encoded
0,TRENDING: New Yorkers encounter empty supermar...,Negative,0
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2
2,Find out how you can protect yourself and love...,Positive,2
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1


In [16]:
# saving the label encoder as pkl file for the further use cases
with open('label_encoder_sentiment.pkl', 'wb') as f:
    pickle.dump(le, f)

In [17]:
# with open('label_encoder_sentiment.pkl', 'rb') as f:
#     le_loaded = pickle.load(f)

In [18]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [19]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [20]:
# Initialize once
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

creating a preprocessing pipeline function where the text are lower casted, removing punctuations, tokenized, removed stopwords punctuations and numbers

In [21]:
import re
def preprocess_text(text):
    if pd.isnull(text):
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 3. Tokenization
    tokens = word_tokenize(text)

    # 4. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the processed string
    pre_text = ' '.join(lemmatized)

    # removeing punctuations and numbers
    no_punct = re.sub(r'[^a-zA-Z\s]', '', pre_text)

    #cleaned text
    cleaned_text = re.sub(r'\s+', ' ', no_punct).strip()

    return cleaned_text

In [22]:
# applying the function on the text data
df['processed_text'] = df['text'].apply(preprocess_text)

In [23]:
df

Unnamed: 0,text,sentiment,sentiment_encoded,processed_text
0,TRENDING: New Yorkers encounter empty supermar...,Negative,0,trending new yorkers encounter empty supermark...
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,couldnt find hand sanitizer fred meyer turned ...
2,Find out how you can protect yourself and love...,Positive,2,find protect loved one coronavirus
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0,panic buying hit newyork city anxious shopper ...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,toiletpaper dunnypaper coronavirus coronavirus...
...,...,...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive,2,meanwhile supermarket israel people dance sing...
3794,Did you panic buy a lot of non-perishable item...,Negative,0,panic buy lot nonperishable item echo need foo...
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral,1,asst prof economics cconces nbcphiladelphia ta...
3796,Gov need to do somethings instead of biar je r...,Negative,0,gov need somethings instead biar je rakyat ass...


In [24]:
# saving the preprocessed data
df.to_csv('preprocessed_data_sentiment_classification.csv', index=False)

# multinomial naive bayse

i am using multinomial naive bayse as the beggining because it handles high dimensional data (including text data) well.

In [25]:
# importing the libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [26]:
# splitting the data
X = df['processed_text'] # Features
y = df['sentiment'] # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [27]:
# vectorizng both training and testing data by CountVectorizer (bag of words)
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [28]:
# creating and training the model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [29]:
# predicting and evaluating
y_pred = model.predict(X_test_vec)

In [30]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6276315789473684
[[258   0  69]
 [ 66  10  48]
 [ 95   5 209]]

Classification Report:
               precision    recall  f1-score   support

    Negative       0.62      0.79      0.69       327
     Neutral       0.67      0.08      0.14       124
    Positive       0.64      0.68      0.66       309

    accuracy                           0.63       760
   macro avg       0.64      0.52      0.50       760
weighted avg       0.63      0.63      0.59       760



In [31]:
# Save vectorizer
with open('count_vectorizer_for_naivebayse.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save model
with open('naive_bayes_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# RNN

In [32]:
#  for rnn we are importing the required libraries
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('all')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Flatten, SimpleRNN, GRU
from tensorflow.keras import Sequential
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

In [33]:
# Number of classes
num_classes = len(le.classes_)
num_classes

3

In [34]:
df.columns

Index(['text', 'sentiment', 'sentiment_encoded', 'processed_text'], dtype='object')

In [35]:
df

Unnamed: 0,text,sentiment,sentiment_encoded,processed_text
0,TRENDING: New Yorkers encounter empty supermar...,Negative,0,trending new yorkers encounter empty supermark...
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,couldnt find hand sanitizer fred meyer turned ...
2,Find out how you can protect yourself and love...,Positive,2,find protect loved one coronavirus
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0,panic buying hit newyork city anxious shopper ...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,toiletpaper dunnypaper coronavirus coronavirus...
...,...,...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive,2,meanwhile supermarket israel people dance sing...
3794,Did you panic buy a lot of non-perishable item...,Negative,0,panic buy lot nonperishable item echo need foo...
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral,1,asst prof economics cconces nbcphiladelphia ta...
3796,Gov need to do somethings instead of biar je r...,Negative,0,gov need somethings instead biar je rakyat ass...


In [36]:
# splitting the data for RNN
X = df['processed_text']
y = df['sentiment_encoded']
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42, stratify=y)

In [37]:
for i in (x_train,x_test,y_train,y_test):
  print(i.shape)

(3038,)
(760,)
(3038,)
(760,)


In [38]:
# creatinhg the tokenixer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [39]:
# to check the unique words dictionary and their label
tokenizer.index_word

{1: 'covid',
 2: 'coronavirus',
 3: 'food',
 4: 'store',
 5: 'grocery',
 6: 'stock',
 7: 'people',
 8: 'supermarket',
 9: 'amp',
 10: 'shopping',
 11: 'online',
 12: 'price',
 13: 'panic',
 14: 'need',
 15: 'paper',
 16: 'toilet',
 17: 'get',
 18: 'like',
 19: 'dont',
 20: 'u',
 21: 'time',
 22: 'go',
 23: 'buying',
 24: 'going',
 25: 'home',
 26: 'local',
 27: 'im',
 28: 'please',
 29: 'shelf',
 30: 'supply',
 31: 'day',
 32: 'buy',
 33: 'work',
 34: 'consumer',
 35: 'retail',
 36: 'today',
 37: 'week',
 38: 'coronaviruspandemic',
 39: 'due',
 40: 'everyone',
 41: 'help',
 42: 'hand',
 43: 'good',
 44: 'one',
 45: 'demand',
 46: 'virus',
 47: 'coronaoutbreak',
 48: 'stay',
 49: 'cant',
 50: 'pandemic',
 51: 'know',
 52: 'thing',
 53: 'see',
 54: 'coronapocalypse',
 55: 'would',
 56: 'make',
 57: 'take',
 58: 'shop',
 59: 'still',
 60: 'think',
 61: 'empty',
 62: 'water',
 63: 'even',
 64: 'item',
 65: 'panicbuying',
 66: 'keep',
 67: 'went',
 68: 'many',
 69: 'market',
 70: 'business'

In [40]:
vocab_len = len(tokenizer.index_word) +1
vocab_len
# there are 9361 unique words in our training dataset

9362

In [41]:
# applying tokenizer and converting words into number in the training and tetsing data
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [43]:
# for example first training data has 23 words
len(x_train_seq[0])

23

In [44]:
df

Unnamed: 0,text,sentiment,sentiment_encoded,processed_text
0,TRENDING: New Yorkers encounter empty supermar...,Negative,0,trending new yorkers encounter empty supermark...
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,couldnt find hand sanitizer fred meyer turned ...
2,Find out how you can protect yourself and love...,Positive,2,find protect loved one coronavirus
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0,panic buying hit newyork city anxious shopper ...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,toiletpaper dunnypaper coronavirus coronavirus...
...,...,...,...,...
3793,Meanwhile In A Supermarket in Israel -- People...,Positive,2,meanwhile supermarket israel people dance sing...
3794,Did you panic buy a lot of non-perishable item...,Negative,0,panic buy lot nonperishable item echo need foo...
3795,Asst Prof of Economics @cconces was on @NBCPhi...,Neutral,1,asst prof economics cconces nbcphiladelphia ta...
3796,Gov need to do somethings instead of biar je r...,Negative,0,gov need somethings instead biar je rakyat ass...


In [45]:
# creating a new list to get the word count of each row of the data
doc_lenth = []
for doc in x_train_seq:
  doc_lenth.append(len(doc))

In [46]:
max(doc_lenth)
# the text with the highest wordcount is 40

40

In [47]:
np.average(doc_lenth)

np.float64(18.757735352205398)

In [50]:
# setting up the max_len as the 90 percentile of our training data
max_len = np.quantile(doc_lenth, 0.9)
max_len = round(max_len)
max_len

27

In [51]:
# prepadding the data (both traing and testing)
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding='pre')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding='pre')

In [52]:
print(x_train_pad.shape)
print(x_test_pad.shape)

(3038, 27)
(760, 27)


In [53]:
x_test_pad

array([[   0,    0,    0, ...,   47, 7357,  925],
       [   0,    0,    0, ...,  529,  829,  379],
       [   0,    0,    0, ...,   70,  904, 1028],
       ...,
       [   0,    0,    0, ...,  321,  460,    1],
       [   0,    0,    0, ..., 1736,  153,  349],
       [   0,    0,    0, ...,   93,    7,   31]], dtype=int32)

In [54]:
max_len

27

In [55]:
# creating the rnn model
model = Sequential()
model.add(Embedding(input_dim=vocab_len,output_dim=50,input_length=max_len))
model.add(SimpleRNN(units=32,return_sequences=False))
model.add(Dense(3,activation='softmax'))
model.summary()



In [56]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train_pad,y_train,epochs=20,batch_size=5,validation_split=0.2)

Epoch 1/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.4286 - loss: 1.0364 - val_accuracy: 0.5164 - val_loss: 0.9676
Epoch 2/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8821 - loss: 0.4821 - val_accuracy: 0.4951 - val_loss: 1.1364
Epoch 3/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9952 - loss: 0.0582 - val_accuracy: 0.4934 - val_loss: 1.3537
Epoch 4/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9976 - loss: 0.0133 - val_accuracy: 0.4819 - val_loss: 1.5333
Epoch 5/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9984 - loss: 0.0093 - val_accuracy: 0.4852 - val_loss: 1.5031
Epoch 6/20
[1m486/486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9983 - loss: 0.0060 - val_accuracy: 0.4786 - val_loss: 1.6782
Epoch 7/20
[1m486/486[0m 

<keras.src.callbacks.history.History at 0x7fc24ce23210>

In [57]:
p = model.predict(x_test_pad)
p = np.argmax(p, axis=1)
print(p)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
[0 2 0 2 2 0 0 1 0 2 1 0 0 1 0 2 2 0 2 1 2 0 0 0 1 1 0 0 1 0 2 2 0 0 2 0 2
 2 0 0 2 1 2 2 0 0 0 1 1 2 2 2 2 2 1 1 2 1 0 1 0 2 0 0 0 0 1 2 0 2 2 2 2 0
 0 2 1 1 1 0 0 1 0 2 2 0 2 1 0 0 0 2 0 2 2 2 1 1 0 2 2 1 0 0 0 2 0 1 0 1 1
 0 0 0 1 1 2 2 2 1 0 0 0 0 0 0 2 2 0 0 2 2 2 1 1 0 0 0 0 1 1 1 0 1 0 1 0 2
 0 2 2 0 2 1 2 0 2 1 0 0 0 2 0 2 0 2 1 2 2 2 0 1 2 2 0 1 2 0 1 0 1 0 0 1 0
 1 2 0 0 2 0 2 2 1 2 2 0 1 0 0 2 0 0 2 1 2 0 0 0 0 1 0 0 0 1 0 2 0 2 0 1 0
 2 1 1 0 0 1 0 1 0 0 2 0 0 1 0 0 0 0 0 2 0 2 1 2 1 2 1 0 2 2 2 2 0 0 2 1 2
 1 2 0 2 1 1 0 0 1 0 2 0 0 0 2 2 2 0 2 1 1 1 0 0 0 0 1 2 2 1 1 2 2 1 0 2 2
 2 0 2 1 2 0 2 1 0 1 0 1 2 2 2 2 1 1 1 1 0 2 1 0 1 0 2 0 1 2 2 2 2 0 2 1 0
 2 1 2 2 2 0 0 2 0 1 1 2 1 0 0 0 2 1 2 1 1 1 2 1 1 1 0 1 1 1 0 2 0 0 0 2 2
 1 2 0 1 0 0 1 2 2 0 1 1 1 0 2 0 0 0 0 2 0 1 0 2 1 2 2 0 1 2 0 2 1 0 2 0 0
 1 1 2 1 0 2 0 0 0 0 0 1 2 0 1 1 2 1 0 1 0 0 0 1 0 0 2 1 0 0 2 0 2 0 2 2 0
 1 2 1 2 0 2 2 2 0 0 1 1 2

In [58]:
from sklearn.metrics import confusion_matrix, classification_report
d = pd.DataFrame(confusion_matrix(y_test,p),columns=["Positive","Neutral","Negative"],index=["Positive","Neutral","Negative"])
print(d)
print(classification_report(y_test,p))

          Positive  Neutral  Negative
Positive       146       75       106
Neutral         43       43        38
Negative       120       88       101
              precision    recall  f1-score   support

           0       0.47      0.45      0.46       327
           1       0.21      0.35      0.26       124
           2       0.41      0.33      0.36       309

    accuracy                           0.38       760
   macro avg       0.36      0.37      0.36       760
weighted avg       0.40      0.38      0.39       760



# LSTM

In [59]:
# now lets try LSTM model
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

9362


In [60]:
x_train_pad = pad_sequences(x_train, maxlen=max_len, padding='pre')
x_test_pad = pad_sequences(x_test, maxlen=max_len, padding='pre')

In [61]:
print(x_train_pad.shape)
print(y_train.shape)

(3038, 27)
(3038,)


In [62]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=5,input_length=max_len))
model.add(LSTM(units=150,return_sequences=False))
model.add(Dense(3,activation='softmax'))
model.summary()



In [64]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train_pad,y_train,epochs=20,batch_size=4,validation_split=0.2)

Epoch 1/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.4328 - loss: 1.0193 - val_accuracy: 0.4309 - val_loss: 0.9850
Epoch 2/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.4955 - loss: 0.9128 - val_accuracy: 0.4852 - val_loss: 0.9937
Epoch 3/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.6668 - loss: 0.7086 - val_accuracy: 0.5181 - val_loss: 1.0568
Epoch 4/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8081 - loss: 0.4878 - val_accuracy: 0.5247 - val_loss: 1.1511
Epoch 5/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.8953 - loss: 0.2995 - val_accuracy: 0.5263 - val_loss: 1.2315
Epoch 6/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9503 - loss: 0.1697 - val_accuracy: 0.5674 - val_loss: 1.3225
Epoch 7/20
[1m608/608[0m 

<keras.src.callbacks.history.History at 0x7fc2a74edad0>

In [65]:
p = model.predict(x_test_pad)
p = np.argmax(p, axis=1)
print(p)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[0 0 1 2 2 0 2 0 0 2 2 2 0 0 2 1 1 2 0 0 1 2 0 0 1 2 0 0 2 0 0 2 0 2 2 0 2
 2 2 0 2 2 0 2 2 2 0 0 0 1 0 2 0 0 2 0 2 0 0 1 2 0 0 0 2 1 2 0 0 0 1 2 2 2
 0 0 2 2 2 0 1 1 0 0 0 0 2 0 0 1 1 2 0 2 0 2 2 2 2 1 0 2 0 2 2 0 0 0 0 1 0
 2 0 2 2 2 0 0 2 0 0 0 2 0 2 1 2 2 2 0 0 2 0 2 2 0 0 0 0 0 0 2 2 0 0 2 2 0
 0 1 0 2 0 0 2 2 2 0 0 1 2 2 2 2 0 0 1 0 2 0 0 2 2 2 0 1 2 2 2 2 2 2 0 2 2
 2 0 0 2 0 2 1 2 2 0 0 1 0 2 0 1 1 0 0 1 2 2 1 2 0 0 0 2 2 1 0 0 0 0 0 2 2
 2 0 0 2 2 0 0 2 1 0 2 2 0 2 2 2 0 0 0 1 0 0 1 0 2 2 2 2 2 2 2 2 0 2 0 0 0
 2 2 2 2 2 2 2 2 2 2 0 2 1 2 0 2 2 0 0 0 2 2 0 0 2 0 2 0 0 0 2 2 2 1 0 2 2
 0 2 2 2 0 0 0 1 2 2 0 0 2 2 0 0 0 0 2 2 2 0 1 0 0 2 2 2 0 2 1 0 1 0 0 2 2
 2 2 2 2 1 0 0 2 2 2 0 2 1 2 2 0 2 1 0 2 0 2 2 0 0 0 2 0 2 1 2 2 2 1 0 0 2
 2 1 0 1 0 0 0 0 1 0 0 2 0 0 2 2 2 0 2 0 2 0 0 2 2 0 2 0 2 0 2 0 2 0 0 0 0
 2 2 2 2 2 1 0 0 2 2 2 0 0 0 2 0 0 1 0 1 1 2 0 2 2 0 2 1 0 2 2 2 2 0 1 0 1
 0 2 1 2 0 2 1 2 2 1 2 0 2 

In [66]:
from sklearn.metrics import confusion_matrix, classification_report
d = pd.DataFrame(confusion_matrix(y_test,p),columns=["Positive","Neutral","Negative"],index=["Positive","Neutral","Negative"])
print(d)
print(classification_report(y_test,p))

          Positive  Neutral  Negative
Positive       230       10        87
Neutral         26       49        49
Negative        66       34       209
              precision    recall  f1-score   support

           0       0.71      0.70      0.71       327
           1       0.53      0.40      0.45       124
           2       0.61      0.68      0.64       309

    accuracy                           0.64       760
   macro avg       0.62      0.59      0.60       760
weighted avg       0.64      0.64      0.64       760



# LSTM 2

In [67]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=5,input_length=max_len))
model.add(LSTM(units=100,return_sequences=True))

# 2nd LSTM Layer
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Fully Connected Layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3,activation='softmax'))
model.summary()



In [68]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train_pad,y_train,epochs=20,batch_size=4,validation_split=0.2)

Epoch 1/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.4225 - loss: 1.0254 - val_accuracy: 0.4424 - val_loss: 0.9685
Epoch 2/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.4795 - loss: 0.9148 - val_accuracy: 0.5115 - val_loss: 0.9518
Epoch 3/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.7175 - loss: 0.6513 - val_accuracy: 0.6036 - val_loss: 0.8972
Epoch 4/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8905 - loss: 0.3324 - val_accuracy: 0.6266 - val_loss: 1.1667
Epoch 5/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.9529 - loss: 0.1447 - val_accuracy: 0.6382 - val_loss: 1.2200
Epoch 6/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 9ms/step - accuracy: 0.9801 - loss: 0.0663 - val_accuracy: 0.5954 - val_loss: 1.6242
Epoch 7/20
[1m608/60

<keras.src.callbacks.history.History at 0x7fc22397c0d0>

In [69]:
p = model.predict(x_test_pad)
p = np.argmax(p, axis=1)
print(p)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 1 1 2 0 0 2 0 0 0 1 2 0 0 1 2 0 0 0 2 2 0 2
 2 2 0 1 2 2 2 2 0 2 0 0 1 0 2 0 0 2 0 2 0 0 1 0 0 0 0 2 1 0 0 0 0 1 0 2 0
 2 0 2 2 2 0 1 1 0 0 0 0 0 0 0 1 1 2 2 2 0 2 0 2 2 2 0 0 0 2 2 2 0 0 0 1 2
 2 0 2 2 1 0 0 2 2 0 0 2 2 0 2 0 2 2 2 0 0 2 2 2 0 0 0 0 0 0 0 2 0 0 0 2 0
 0 1 0 2 0 0 2 1 1 0 0 1 2 1 2 0 0 2 1 0 2 0 2 2 2 2 0 1 1 2 2 1 1 0 0 2 0
 2 0 0 0 0 2 1 2 2 2 2 2 2 2 0 0 2 2 0 2 0 2 1 0 0 0 0 1 2 1 0 0 0 0 2 2 2
 0 0 0 0 2 0 0 0 1 0 2 2 0 0 2 0 0 0 0 2 0 0 1 0 2 2 2 0 0 2 2 0 2 1 0 0 0
 2 2 2 2 1 2 2 2 2 2 0 2 1 2 0 2 0 0 0 0 2 2 0 0 0 0 2 0 0 0 2 2 2 1 0 0 1
 2 2 2 2 0 0 0 1 0 0 0 2 1 2 0 0 0 0 2 2 2 0 1 0 0 1 2 1 2 2 1 0 1 0 0 2 2
 2 2 2 2 1 0 0 2 2 0 0 2 0 2 2 0 1 2 0 2 0 0 2 0 2 0 2 0 2 1 2 0 1 2 0 0 0
 2 2 0 1 0 0 0 0 2 0 0 2 0 0 2 2 2 0 2 0 2 0 0 2 2 2 2 0 2 0 2 0 1 0 0 0 0
 2 0 2 2 2 1 2 2 1 2 1 0 2 0 2 2 0 1 0 1 1 1 0 2 2 0 2 0 0 2 2 2 2 0 0 0 0
 0 2 2 0 0 2 2 2 0 1 2 0 2

In [70]:
from sklearn.metrics import confusion_matrix, classification_report
d = pd.DataFrame(confusion_matrix(y_test,p),columns=["Positive","Neutral","Negative"],index=["Positive","Neutral","Negative"])
print(d)
print(classification_report(y_test,p))

          Positive  Neutral  Negative
Positive       245       14        68
Neutral         37       50        37
Negative        74       36       199
              precision    recall  f1-score   support

           0       0.69      0.75      0.72       327
           1       0.50      0.40      0.45       124
           2       0.65      0.64      0.65       309

    accuracy                           0.65       760
   macro avg       0.61      0.60      0.60       760
weighted avg       0.64      0.65      0.65       760



# LSTM 3

In [71]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=5,input_length=max_len))
model.add(LSTM(units=150,return_sequences=True))

# 2nd LSTM Layer
model.add(LSTM(units=70, return_sequences=False))
model.add(Dropout(0.2))

# Fully Connected Layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3,activation='softmax'))
model.summary()



In [72]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train_pad,y_train,epochs=20,batch_size=4,validation_split=0.2)

Epoch 1/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.4322 - loss: 1.0236 - val_accuracy: 0.4260 - val_loss: 0.9753
Epoch 2/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - accuracy: 0.4821 - loss: 0.9165 - val_accuracy: 0.4457 - val_loss: 1.0019
Epoch 3/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.6368 - loss: 0.7233 - val_accuracy: 0.4967 - val_loss: 1.0743
Epoch 4/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.7784 - loss: 0.5245 - val_accuracy: 0.5214 - val_loss: 1.3210
Epoch 5/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8841 - loss: 0.3342 - val_accuracy: 0.5148 - val_loss: 1.4215
Epoch 6/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.9398 - loss: 0.1910 - val_accuracy: 0.5378 - val_loss: 1.7567
Epoch 7/20
[1m608

<keras.src.callbacks.history.History at 0x7fc1d8b64950>

In [73]:
p = model.predict(x_test_pad)
p = np.argmax(p, axis=1)
print(p)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
[0 2 2 1 2 0 2 0 0 0 2 2 0 0 2 0 1 2 0 0 1 2 0 0 1 2 0 0 1 0 0 1 0 2 0 0 2
 2 2 0 2 2 2 2 2 0 2 0 2 1 0 1 0 2 2 0 0 0 0 1 0 0 0 0 2 1 0 0 0 0 1 1 2 0
 2 0 1 2 2 0 1 1 0 0 0 0 2 2 0 1 1 2 2 2 0 2 2 2 2 2 0 2 0 1 2 0 0 0 0 1 0
 0 0 0 2 2 0 0 2 1 0 0 0 0 2 2 0 2 2 2 0 1 0 2 2 0 0 0 0 2 0 0 2 0 0 0 2 0
 0 1 2 2 0 0 2 2 2 0 0 1 1 2 2 2 0 0 0 1 2 0 0 2 1 0 0 1 1 0 2 1 1 2 2 1 0
 0 0 2 2 0 2 1 2 0 2 0 1 2 2 0 0 1 0 0 1 0 1 1 0 0 0 0 2 2 1 0 0 0 0 2 2 0
 0 0 0 2 0 2 0 1 2 0 0 2 0 0 2 1 0 0 0 1 0 0 0 0 2 2 2 0 0 2 2 0 0 1 0 0 2
 0 2 0 1 2 0 0 2 2 2 0 2 1 1 2 0 0 0 0 0 2 2 0 0 2 0 2 2 0 0 2 2 1 1 0 2 1
 2 0 2 2 0 0 0 0 2 0 0 2 1 0 0 1 0 2 0 2 2 0 1 0 0 2 2 2 0 0 1 0 2 0 0 2 2
 2 1 0 2 1 0 0 2 2 2 0 2 0 2 2 0 2 1 0 2 0 2 2 0 0 2 2 0 2 1 1 2 1 2 0 0 2
 0 1 0 1 2 0 0 0 1 0 0 2 0 0 2 2 0 0 2 0 2 0 0 1 2 2 2 0 2 2 0 0 0 0 0 0 0
 2 2 1 0 2 1 0 2 2 2 2 0 0 0 2 2 0 1 0 1 1 2 0 2 2 0 2 1 0 2 0 2 2 0 1 0 0
 0 2 0 0 0 2 2 2 2 1 2 0 2

In [74]:
from sklearn.metrics import confusion_matrix, classification_report
d = pd.DataFrame(confusion_matrix(y_test,p),columns=["Positive","Neutral","Negative"],index=["Positive","Neutral","Negative"])
print(d)
print(classification_report(y_test,p))

          Positive  Neutral  Negative
Positive       228       24        75
Neutral         38       51        35
Negative        87       49       173
              precision    recall  f1-score   support

           0       0.65      0.70      0.67       327
           1       0.41      0.41      0.41       124
           2       0.61      0.56      0.58       309

    accuracy                           0.59       760
   macro avg       0.56      0.56      0.56       760
weighted avg       0.59      0.59      0.59       760



# GRU

In [77]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=5,input_length=max_len))
model.add(GRU(units=100,return_sequences=True))

# 2nd LSTM Layer
model.add(GRU(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Fully Connected Layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3,activation='softmax'))
model.summary()

In [80]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train_pad,y_train,epochs=20,batch_size=4,validation_split=0.2)

Epoch 1/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.3898 - loss: 1.0330 - val_accuracy: 0.4326 - val_loss: 0.9537
Epoch 2/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 10ms/step - accuracy: 0.4794 - loss: 0.8837 - val_accuracy: 0.4260 - val_loss: 1.0962
Epoch 3/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.5705 - loss: 0.7192 - val_accuracy: 0.4688 - val_loss: 1.1893
Epoch 4/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.5581 - loss: 0.6534 - val_accuracy: 0.4507 - val_loss: 1.1703
Epoch 5/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.6384 - loss: 0.6078 - val_accuracy: 0.4951 - val_loss: 1.3244
Epoch 6/20
[1m608/608[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.8422 - loss: 0.3763 - val_accuracy: 0.5296 - val_loss: 1.4948
Epoch 7/20
[1m608/608

<keras.src.callbacks.history.History at 0x7fc1b24b1690>

In [81]:
p = model.predict(x_test_pad)
p = np.argmax(p, axis=1)
print(p)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[0 0 2 2 2 0 1 0 2 2 2 0 2 2 0 1 1 2 0 0 0 2 0 0 1 2 0 0 0 2 0 2 0 2 2 0 2
 2 0 0 0 2 2 1 2 0 2 0 0 1 0 2 0 2 2 0 0 2 0 1 0 0 0 0 2 1 1 0 0 0 1 2 2 1
 0 0 1 2 2 0 1 1 0 0 0 0 0 2 0 1 2 1 0 2 0 0 0 2 0 2 0 2 2 1 0 2 0 2 0 1 0
 2 0 2 2 2 0 0 2 2 0 0 0 0 2 2 1 0 2 0 0 2 0 2 2 0 2 0 0 0 0 2 2 0 0 2 2 0
 0 1 0 1 0 0 2 2 2 2 0 1 0 1 2 0 0 0 1 0 0 0 0 2 2 0 0 0 2 2 0 2 1 0 2 1 2
 2 0 0 0 2 0 1 2 2 0 0 1 2 2 1 2 2 2 2 1 0 2 1 0 0 0 0 2 0 1 0 1 0 0 0 2 2
 0 0 0 2 0 0 1 0 2 0 2 0 0 2 2 2 2 0 0 1 0 0 2 0 2 0 2 0 2 2 2 0 0 1 0 0 0
 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 0 0 0 0 0 2 2 0 0 1 2 2 2 0 0 0 2 1 1 0 1 1
 2 0 2 2 0 0 0 1 2 2 2 2 1 2 2 0 0 0 2 2 2 0 1 0 0 0 2 0 0 2 2 0 1 0 0 2 2
 2 2 2 2 1 0 0 2 1 2 0 1 0 2 2 0 1 1 0 2 0 2 2 0 0 2 0 0 2 1 2 2 2 2 0 0 2
 1 2 0 2 0 0 0 0 2 0 0 0 0 0 2 2 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 1 0 0 0 2
 2 0 2 0 2 0 0 2 2 2 0 2 2 2 2 2 0 2 2 2 2 2 0 2 2 2 2 1 0 0 1 2 2 2 1 0 1
 0 2 1 0 2 2 2 2 1 0 2 0 0 

In [82]:
from sklearn.metrics import confusion_matrix, classification_report
d = pd.DataFrame(confusion_matrix(y_test,p),columns=["Positive","Neutral","Negative"],index=["Positive","Neutral","Negative"])
print(d)
print(classification_report(y_test,p))

          Positive  Neutral  Negative
Positive       227       13        87
Neutral         38       49        37
Negative        80       32       197
              precision    recall  f1-score   support

           0       0.66      0.69      0.68       327
           1       0.52      0.40      0.45       124
           2       0.61      0.64      0.63       309

    accuracy                           0.62       760
   macro avg       0.60      0.58      0.58       760
weighted avg       0.62      0.62      0.62       760



In [84]:
# saving the model
model.save("GRUmodel.h5")

