In [None]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import tensorflow
from tensorflow import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Embedding,Dropout,LSTM

from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

In [None]:
data= pd.read_csv ("/content/drive/MyDrive/Datasets/data_tweet_analysis.csv")

In [None]:
data.head(3)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive


In [None]:
data.OriginalTweet[0]

'TRENDING: New Yorkers encounter empty supermarket shelves (pictured, Wegmans in Brooklyn), sold-out online grocers (FoodKick, MaxDelivery) as #coronavirus-fearing shoppers stock up https://t.co/Gr76pcrLWh https://t.co/ivMKMsqdT1'

In [None]:
data.Sentiment.value_counts()

Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: Sentiment, dtype: int64

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       2964 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


In [None]:
data.Location.value_counts()

United States               75
London, England             48
Washington, DC              38
New York, NY                34
Los Angeles, CA             33
                            ..
Distrito Federal, México     1
Plymouth UK                  1
Wakefield, UK                1
Corpus Christi, TX           1
SF Native/Peninsula          1
Name: Location, Length: 1717, dtype: int64

In [None]:
data.Location.describe()

count              2964
unique             1717
top       United States
freq                 75
Name: Location, dtype: object

In [None]:
# filling in missing values by the most frequent
data = data.fillna (data.Location.describe()[2])

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3798 entries, 0 to 3797
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       3798 non-null   int64 
 1   ScreenName     3798 non-null   int64 
 2   Location       3798 non-null   object
 3   TweetAt        3798 non-null   object
 4   OriginalTweet  3798 non-null   object
 5   Sentiment      3798 non-null   object
dtypes: int64(2), object(4)
memory usage: 178.2+ KB


In [None]:
data.Location.value_counts()

United States               909
London, England              48
Washington, DC               38
New York, NY                 34
Los Angeles, CA              33
                           ... 
Distrito Federal, México      1
Plymouth UK                   1
Wakefield, UK                 1
Corpus Christi, TX            1
SF Native/Peninsula           1
Name: Location, Length: 1717, dtype: int64

In [None]:
# label encoding the target
data['Sentiment'].replace('Extremely Negative',0,inplace=True)
data['Sentiment'].replace('Negative',1,inplace=True)
data['Sentiment'].replace('Neutral',2,inplace=True)
data['Sentiment'].replace('Positive',3,inplace=True)
data['Sentiment'].replace('Extremely Positive',4,inplace=True)

print (data.Sentiment.value_counts())

1    1041
3     947
2     619
4     599
0     592
Name: Sentiment, dtype: int64


In [None]:
x=data.drop('Sentiment',axis=1)
y=data['Sentiment'].values
print(y)
y=y.reshape(-1,1)
print(y)

[0 3 4 ... 2 0 4]
[[0]
 [3]
 [4]
 ...
 [2]
 [0]
 [4]]


In [None]:
# Text Preprocessing
nltk.download('stopwords')
message=x.copy()
message.reset_index(inplace=True)

ps=PorterStemmer()
corpus=[]
for i in range(len(x)):
  review=re.sub('[^a-zA-Z]',' ',message['OriginalTweet'][i])
  review=review.lower()
  review=review.split()
  review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
  review=' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
print(corpus)

['trend new yorker encount empti supermarket shelv pictur wegman brooklyn sold onlin grocer foodkick maxdeliveri coronaviru fear shopper stock http co gr pcrlwh http co ivmkmsqdt', 'find hand sanit fred meyer turn amazon pack purel check coronaviru concern drive price http co ygbipbflmi', 'find protect love one coronaviru', 'panic buy hit newyork citi anxiou shopper stock food amp medic suppli healthcar worker becom bigappl st confirm coronaviru patient bloomberg stage event http co iasiregpc qanon qanon qanon elect cdc http co iszoewxu', 'toiletpap dunnypap coronaviru coronavirusaustralia coronavirusupd covid news corvid newsmelb dunnypaperg costco one week everyon buy babi milk powder next everyon buy toilet paper http co sczryvvsih', 'rememb last time paid gallon regular ga lo angel price pump go look coronaviru impact price pm abc http co pyzq ymuv', 'vote age coronaviru hand sanit supertuesday http co z bel dk', 'drtedro stop covid without protect healthwork price surgic mask incr

In [None]:
voc_size=50000
one_hot_r=[one_hot(word,voc_size) for word in corpus]
print(one_hot_r)

[[36475, 41747, 28655, 18335, 42228, 49498, 10892, 34237, 17017, 38700, 48655, 27677, 38914, 27367, 12960, 49371, 28760, 31832, 17998, 20827, 16178, 15037, 4254, 20827, 16178, 4611], [32230, 287, 33225, 12583, 45895, 40137, 25184, 9923, 17154, 31489, 49371, 31847, 24649, 40277, 20827, 16178, 6524], [32230, 26806, 11388, 11230, 49371], [24600, 25222, 35291, 7950, 42025, 35377, 31832, 17998, 40663, 28451, 19710, 5194, 28047, 42397, 11310, 14481, 13084, 10557, 49371, 13495, 36354, 28875, 28378, 20827, 16178, 11454, 17609, 17609, 17609, 3346, 14762, 20827, 16178, 30553], [37345, 31484, 49371, 2990, 18210, 35413, 8265, 5725, 13059, 13406, 30932, 11230, 31038, 40187, 25222, 16150, 1551, 150, 27702, 40187, 25222, 23831, 11585, 20827, 16178, 18418], [3953, 10662, 17133, 1620, 1607, 19218, 44119, 43413, 6203, 40277, 45673, 7711, 49468, 49371, 8416, 40277, 44955, 46461, 20827, 16178, 25160, 1627], [30117, 30830, 49371, 287, 33225, 21017, 20827, 16178, 18022, 2009, 12928], [22031, 35641, 35413, 4

In [None]:
sent_length=30
input=pad_sequences(one_hot_r,padding='pre',maxlen=sent_length)

final_x=np.array(input)
final_y=np.array(y)

In [None]:
# test-train split
x_train, x_test, y_train, y_test = train_test_split(final_x,final_y, test_size=0.33, random_state=42)
print(x_train.shape)
print(y_train.shape)

(2544, 30)
(2544, 1)


## MODEL BUILDING: MODEL #1

In [None]:
dim=40
model=Sequential()
model.add(Embedding(voc_size,dim,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(80))
model.add(Dropout(0.3))
model.add(Dense(1,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 40)            2000000   
_________________________________________________________________
dropout (Dropout)            (None, 30, 40)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 80)                38720     
_________________________________________________________________
dropout_1 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 81        
Total params: 2,038,801
Trainable params: 2,038,801
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train,y_train,batch_size=50,epochs=10,validation_data=(x_test,y_test), verbose = 2)


Epoch 1/120
51/51 - 6s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 2/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 3/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 4/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 5/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 6/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 7/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 8/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 9/120
51/51 - 3s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 10/120
51/51 - 3s - loss: 0.000

<keras.callbacks.History at 0x7f4458ae5510>

In [None]:
y_pred=model.predict(x_test)

val=metrics.accuracy_score(y_test,y_pred)
print("accuracy is =",str(val*100)+" %")

accuracy is = 26.236044657097292 %


## MODEL #2

In [None]:
from keras.layers import Bidirectional
dim=120
model=Sequential()
model.add(Embedding(voc_size,dim,input_length=sent_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1,activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 120)           6000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 128)           94720     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 6,136,001
Trainable params: 6,136,001
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train,y_train,batch_size=50,epochs=10,validation_data=(x_test,y_test), verbose = 2)

Epoch 1/10
51/51 - 17s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 2/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 3/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 4/10
51/51 - 9s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 5/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 6/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 7/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 8/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 9/10
51/51 - 8s - loss: 0.0000e+00 - accuracy: 0.2799 - val_loss: 0.0000e+00 - val_accuracy: 0.2624
Epoch 10/10
51/51 - 8s - loss: 0.0000e+00 - a

<keras.callbacks.History at 0x7f44586946d0>

In [None]:
y_pred1=model.predict(x_test)

val=metrics.accuracy_score(y_test,y_pred1)
print("accuracy is =",str(val*100)+" %")

accuracy is = 26.236044657097292 %
