# Tweet Sentiment Analysis

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Dense

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Import Data and Preprocess

In [2]:
ts = pd.read_csv('text_emotion.csv')

In [3]:
ts.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [4]:
ts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   author     40000 non-null  object
 3   content    40000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [5]:
ts = ts[['sentiment','content']]

In [6]:
ts['content'] = ts['content'].apply(lambda x: x.lower())
ts['content'] = ts['content'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

for idx,row in ts.iterrows():
    row[0] = row[0].replace('rt',' ')

In [7]:
tokenizer = Tokenizer(split=' ')
tokenizer.fit_on_texts(ts['content'].values)
x = tokenizer.texts_to_sequences(ts['content'].values)
x = pad_sequences(x)
vocabulary_size = len(tokenizer.word_index) + 1
print(vocabulary_size)
y = pd.get_dummies(ts['sentiment']).values

53229


In [8]:
# Split the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

(28000, 33) (28000, 13)
(12000, 33) (12000, 13)


### Model

In [9]:
# RNN
model = Sequential()
model.add(Embedding(vocabulary_size,10,input_length=x.shape[1]))
model.add(LSTM(128,return_sequences=True))
model.add(LSTM(64))
model.add(Dense(units=100,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=13,activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 10)            532290    
_________________________________________________________________
lstm_1 (LSTM)                (None, 33, 128)           71168     
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               6500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 13)                1313      
Total params: 660,679
Trainable params: 660,679
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Compile Model
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [12]:
# Train Model
model.fit(x_train,y_train,epochs=10,batch_size=256)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x26e822eec88>

In [13]:
# Test the Model
loss, accuracy = model.evaluate(x_test,y_test)
print('Accuracy: '+str(accuracy*100)+'%')

Accuracy: 25.266666666666666%
