# Sentiment Analysis on the Sentiment140 dataset using Multinomial Naive Bayes and LSTMs

In [1]:
import warnings
warnings.filterwarnings("ignore")

The dataset was acquired from the [Sentiment140](http://help.sentiment140.com/for-students) website.

## 1. Naive Bayes

In [94]:
from sklearn.naive_bayes import MultinomialNB # we need this for our Naive Bayes model

# These next two are about processing the data. We'll look into this more later in the semester.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.metrics
import pandas as pd
import numpy as np

# Determine class names
class_names = [0,2,4]

# Read the data from the CSV file provided.
data = pd.read_csv("/Users/bracho/Downloads/trainingandtestdata/training.1600000.processed.noemoticon.csv",
                   encoding="ISO-8859-1",
                   names=["polarity", "id", "date", "query", "user", "text"])

data = data.sample(frac=1).reset_index(drop=True)

data.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,2243077421,Fri Jun 19 12:59:52 PDT 2009,NO_QUERY,urbanko,I just screamed 'daffodils!!! Ahhhh!!!' and mo...
1,4,2070002870,Sun Jun 07 16:55:37 PDT 2009,NO_QUERY,butterbeckafly,petted the stingrays at the zoo and ate delici...
2,0,2321629940,Wed Jun 24 21:20:00 PDT 2009,NO_QUERY,davidlian,@nikicheong The only way I've done it is the m...
3,0,1979916623,Sun May 31 05:00:54 PDT 2009,NO_QUERY,DigitalJedi007,I STILL can't believe it...why do I even attem...
4,0,1972805441,Sat May 30 09:55:19 PDT 2009,NO_QUERY,hawkins_boi,ohhhhhhhhhhh i hope i make it home in time fo...


In [3]:
# Convert the text into numbers that represent each word (bag of words method)
word_vector = CountVectorizer()
word_vector_counts = word_vector.fit_transform(list(data["text"]))

# Account for the length of the documents:
#   get the frequency with which the word occurs instead of the raw number of times
term_freq_transformer = TfidfTransformer()
term_freq = term_freq_transformer.fit_transform(word_vector_counts)

In [7]:
# Train the Naive Bayes model
model = MultinomialNB().fit(term_freq, list(data["polarity"]))

We can now test the training dataset.

In [8]:
test = pd.read_csv("/Users/bracho/Downloads/trainingandtestdata/testdata.manual.2009.06.14.csv",
                   encoding="ISO-8859-1",
                   names=["polarity", "id", "date", "query", "user", "text"])

test_counts = word_vector.transform(list(test["text"]))
test_term_freq = term_freq_transformer.transform(test_counts)
    
test_pred = model.predict(test_term_freq)
test_actual = list(test["polarity"])
test_actual

print("RECALL:", sklearn.metrics.recall_score(test_actual, test_pred, average=None))
print("PRECISION:", sklearn.metrics.precision_score(test_actual, test_pred, average=None))
print("F1 SCORE:", sklearn.metrics.f1_score(test_actual, test_pred, average=None))

RECALL: [0.83050847 0.         0.76373626]
PRECISION: [0.66818182 0.         0.5       ]
F1 SCORE: [0.74055416 0.         0.60434783]


In [9]:
fd_input = [
    'the sentiment140 dataset is a great tool']

def predictions(fake_docs):
    fake_counts = word_vector.transform(fake_docs)
    fake_term_freq = term_freq_transformer.transform(fake_counts)

    predicted = model.predict_proba(fake_term_freq)
    print(predicted)

predictions(fd_input)

[[0.43264136 0.56735864]]


## 2. LSTMs

In [95]:
import keras
from keras.models import Sequential, load_model, Model
from keras.layers import Conv2D, BatchNormalization, Activation, Dropout, MaxPooling2D, Conv2DTranspose, UpSampling2D, Flatten, Dense, Reshape, Conv1D, MaxPooling1D, PReLU, Input, TimeDistributed, LSTM, Embedding
from keras.optimizers import Adam
from keras import regularizers
import keras.backend as K
from scipy.sparse import csr_matrix
import re
import random

In [96]:
def text_convert(tx):
    tx = re.sub(r'[^0-9a-zA-Z ]+', '', tx).split()
    v = csr_matrix(w_vector.transform(tx)).todense()
    v = np.where(v==1)[1]+1
    v = np.pad(v, (0, 100-len(v)), 'constant')
    return v

In [97]:
# vectorize the words in the dataset
# Convert the text into numbers that represent each word (bag of words method)
w_vector = CountVectorizer(max_features=10000)
w_vector_counts = w_vector.fit_transform(list(data["text"]))

In [109]:
data_vectorized = []
for text in list(data["text"])[:10000]:
    data_vectorized.append(text_convert(text))

data_vectorized = np.array(data_vectorized)
data_vectorized.shape

(100, 100)

In [110]:
lstm_model = Sequential()
inputs = Input(shape=(100,))
embedding = Embedding(10001, 100, input_length=100)(inputs)
lstm1, state_h1, state_c1 = LSTM(128, return_state=True, return_sequences=True)(embedding)
lstm2, state_h2, state_c2 = LSTM(128, return_state=True)(lstm1)
dense_1 = Dense(1024)(state_h2)
dense_output = Dense(1)(dense_1)
output_tanh = Activation("tanh")(dense_output)
lstm_model = Model(inputs=inputs, outputs=[output_tanh])

In [111]:
optimizer = keras.optimizers.Adadelta()

lstm_model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["mse"])
lstm_model.fit(data_vectorized, list((data["polarity"]-2)/2)[:10000], epochs=1, shuffle=True)

Epoch 1/1


<keras.callbacks.History at 0x1906bd690>

In [104]:
lstm_model.predict(np.array([text_convert("I hated the movie I watched yesterday it was so bad")]))

array([[-0.10848051]], dtype=float32)