# Sentiment Analysis

## Connect to Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


##Import libraries

In [2]:
import time
import re
import pandas as pd
import numpy as np
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import SimpleRNN
import tensorflow as tf
import datetime
%load_ext tensorboard

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

##Import and explore data

In [None]:
data = pd.read_csv('gdrive/MyDrive/Colab Notebooks/Sentiment-Analysis/data/sentiment.csv', encoding = "latin-1")
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


We need to rename the columns and we can get rid of all the useless data. To build our sentiment analysis model, we only need the text and the sentiment columns.

In [None]:
data.columns = ['Sentiment', 'ids', 'Date', 'Flag', 'User', 'Text']
data.drop(['ids', 'Flag', 'Date', 'User'], axis=1, inplace=True)
data = data.sample(frac=1) # shuffle the data
data.head()

Unnamed: 0,Sentiment,Text
1406450,4,@Alan_Wilbourn That's what we had for breakfas...
1225324,4,Its a beautiful day
824245,4,@kcmpls I'm glad someone thinks the maple syru...
1270128,4,@shamara99 your background is STILL there on m...
343476,0,Isn't online business great... What a cool gen...


In [None]:
data['Sentiment'].value_counts()

4    800000
0    799999
Name: Sentiment, dtype: int64

##Preprocess the data

The sentiment column has 2 different values : 0 for negative and 4 for positive. Let's replace them by 'positive' and 'negative' so that it is easier to read and we'll use the factorize method later to encode it for our model.


In [None]:
data.Sentiment.replace(4, 'positive', inplace=True)
data.Sentiment.replace(0, 'negative', inplace=True)
data.head()

Unnamed: 0,Sentiment,Text
1406450,positive,@Alan_Wilbourn That's what we had for breakfas...
1225324,positive,Its a beautiful day
824245,positive,@kcmpls I'm glad someone thinks the maple syru...
1270128,positive,@shamara99 your background is STILL there on m...
343476,negative,Isn't online business great... What a cool gen...


Now we need to preprocess the text. We need to lower it, get rid of usernames, numbers and emojis.

In [41]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

# Defining regex patterns.
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
alphaPattern = "\w*\d{1,}\w*"
sequencePattern = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

In [None]:
t = time.time()

data['Preprocessed text'] = pd.Series(dtype='object')

for index, row in data.iterrows():
  text = row.Text.lower()
  # Replace all URls with 'URL'
  text = re.sub(urlPattern,' URL',text)
  # Replace all emojis.
  for emoji in emojis.keys():
      text = text.replace(emoji, "EMOJI" + emojis[emoji])        
  # Replace @USERNAME to 'USER'.
  text = re.sub(userPattern,' USER', text)        
  # Replace all non alphabets.
  text = re.sub(alphaPattern, " ", text)
  # Replace 3 or more consecutive letters by 2 letter.
  text = re.sub(sequencePattern, seqReplacePattern, text)

  preprocessed = ""
  for word in text.split():
    if not word in list(nltk.corpus.stopwords.words('english')):
      preprocessed += (word + ' ')

  row['Preprocessed text'] = preprocessed

print(f'Time Taken: {round(time.time()-t)} seconds')
data.head()

Time Taken: 2770 seconds


Unnamed: 0,Sentiment,Text,Preprocessed text
1406450,positive,@Alan_Wilbourn That's what we had for breakfas...,USER that's breakfast
1225324,positive,Its a beautiful day,beautiful day
824245,positive,@kcmpls I'm glad someone thinks the maple syru...,USER i'm glad someone thinks maple syrup liquo...
1270128,positive,@shamara99 your background is STILL there on m...,USER background still monitor : . go get sleep...
343476,negative,Isn't online business great... What a cool gen...,online business great.. cool generation part o...


In [None]:
data.to_csv('gdrive/MyDrive/Colab Notebooks/Sentiment-Analysis/data/processed_sentiment.csv')

##Load processed data

In [3]:
data = pd.read_csv('gdrive/MyDrive/Colab Notebooks/Sentiment-Analysis/data/processed_sentiment.csv')
data.index = data['Unnamed: 0']
data.drop(['Unnamed: 0'], axis=1, inplace=True)
data.index.name = None
data['Preprocessed text'] = data['Preprocessed text'].astype(str)
data.head()

Unnamed: 0,Sentiment,Text,Preprocessed text
1406450,positive,@Alan_Wilbourn That's what we had for breakfas...,USER that's breakfast
1225324,positive,Its a beautiful day,beautiful day
824245,positive,@kcmpls I'm glad someone thinks the maple syru...,USER i'm glad someone thinks maple syrup liquo...
1270128,positive,@shamara99 your background is STILL there on m...,USER background still monitor : . go get sleep...
343476,negative,Isn't online business great... What a cool gen...,online business great.. cool generation part o...


## Test on smaller dataset

In order to make several test and try different models, we're going to work on a smaller part of the dataset so that the processing doesn't take too long.

In [25]:
dev_data = data[:100000]

Now let's tokenize the texts so that we can use it to train our model.

In [73]:
max_features = 500
tokenizer = Tokenizer(
    num_words= max_features,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=" "
)
tokenizer.fit_on_texts(dev_data['Preprocessed text'].values)
X = tokenizer.texts_to_sequences(dev_data['Preprocessed text'].values)
X = pad_sequences(X)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

52534


Now that we vectorized the texts, by turning each text into a sequence of integers (each integer being the index of a token in a dictionary), we can create our training data by splitting the data in two part : a training set and a test set.

In [74]:
sentiment_label = dev_data.Sentiment.factorize()
print(sentiment_label[1])
y = sentiment_label[0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=False)

Index(['positive', 'negative'], dtype='object')


## First model : Simple RNN

In [8]:
expanded_X_train = np.expand_dims(X_train,axis=2)
expanded_X_test = np.expand_dims(X_test,axis=2)

In [9]:
input_shape = expanded_X_train[0].shape
input_shape

(58, 1)

In [None]:
model = Sequential()
model.add(SimpleRNN(32, input_shape=input_shape))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 32)                1088      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,121
Trainable params: 1,121
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(expanded_X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f78fd66aa90>

In [None]:
loss,acc = model.evaluate(expanded_X_test, y_test)
print("loss: %.2f" % (loss))
print("accuracy: %.2f" % (acc))

loss: 0.69
accuracy: 0.51


We have a simple RNN model with a sentiment prediction accuracy of about 50%. We can definitely improve this model. Let's try to add an Embedding layer before the SimpleRNN layer.

## Second model : Simple RNN with Embedding

In [28]:
model_2 = Sequential()
model_2.add(Embedding(max_features, 32, input_length=X.shape[1]))
model_2.add(SimpleRNN(32, input_shape=input_shape))
model_2.add(Dropout(0.3))
model_2.add(Dense(1, activation='sigmoid'))
model_2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model_2.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 35, 32)            16000     
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 18,113
Trainable params: 18,113
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
model_2.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f22d508eb10>

In [30]:
loss,acc = model_2.evaluate(X_test, y_test)
print("loss: %.2f" % (loss))
print("accuracy: %.2f" % (acc))

loss: 0.55
accuracy: 0.71


## Third model : LSTM

In [34]:
model_3 = Sequential()
model_3.add(Embedding(max_features, 32, input_length=X.shape[1]))
model_3.add(SpatialDropout1D(0.4))
model_3.add(LSTM(64, recurrent_dropout=0.2, dropout=0.2))
model_3.add(Dense(64, activation='relu'))
model_3.add(Dropout(0.3))
model_3.add(Dense(1, activation='sigmoid'))
model_3.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
print(model_3.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 35, 32)            16000     
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 35, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_7 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 65        
Total params: 45,057
Trainable params: 45,057
Non-trainable params: 0
__________________________________________________

In [35]:
model_3.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f22d050c950>

In [37]:
loss,acc = model_3.evaluate(X_test, y_test)
print("loss: %.2f" % (loss))
print("accuracy: %.2f" % (acc))

loss: 0.53
accuracy: 0.73


##Prediction Pipeline

In [75]:
def preprocess(text):
  text = text.lower()
  text = re.sub(urlPattern,' URL',text)
  for emoji in emojis.keys():
      text = text.replace(emoji, "EMOJI" + emojis[emoji])        
  text = re.sub(userPattern,' USER', text)        
  text = re.sub(alphaPattern, " ", text)
  text = re.sub(sequencePattern, seqReplacePattern, text)

  preprocessed = ""
  for word in text.split():
    if not word in list(nltk.corpus.stopwords.words('english')):
      preprocessed += (word + ' ')

  return preprocessed

In [89]:
def prediction(text):
  preprocessed = preprocess(text)
  preprocessed = tokenizer.texts_to_sequences([preprocessed])
  preprocessed = pad_sequences(preprocessed, maxlen=X.shape[1])
  prediction = model_3.predict(preprocessed)[0][0]
  if prediction > 0.5:
    print(sentiment_label[1][1])
  else:
    print(sentiment_label[1][0])

In [91]:
prediction('I love Mondays and Wednesdays !')

positive
