# Sentiment Analysis on the Sentiment140 dataset using Multinomial Naive Bayes

The dataset was acquired from the [Sentiment140](http://help.sentiment140.com/for-students) website.

In [90]:
from sklearn.naive_bayes import MultinomialNB # we need this for our Naive Bayes model

# These next two are about processing the data. We'll look into this more later in the semester.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.metrics
import pandas as pd

# Determine class names
class_names = [0,2,4]

# Read the data from the CSV file provided.
data = pd.read_csv("/Users/bracho/Downloads/trainingandtestdata/training.1600000.processed.noemoticon.csv",
                   encoding="ISO-8859-1",
                   names=["polarity", "id", "date", "query", "user", "text"])

data.head()

Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [91]:
# Convert the text into numbers that represent each word (bag of words method)
word_vector = CountVectorizer()
word_vector_counts = word_vector.fit_transform(list(data["text"]))

# Account for the length of the documents:
#   get the frequency with which the word occurs instead of the raw number of times
term_freq_transformer = TfidfTransformer()
term_freq = term_freq_transformer.fit_transform(word_vector_counts)

In [92]:
# Train the Naive Bayes model
model = MultinomialNB().fit(term_freq, list(data["polarity"]))

In [93]:
# We can now test on the provided training dataset.

test = pd.read_csv("/Users/bracho/Downloads/trainingandtestdata/testdata.manual.2009.06.14.csv",
                   encoding="ISO-8859-1",
                   names=["polarity", "id", "date", "query", "user", "text"])

test_counts = word_vector.transform(list(test["text"]))
test_term_freq = term_freq_transformer.transform(test_counts)
    
test_pred = model.predict(test_term_freq)
test_actual = list(test["polarity"])
test_actual

print("RECALL:", sklearn.metrics.recall_score(test_actual, test_pred, average=None))
print("PRECISION:", sklearn.metrics.precision_score(test_actual, test_pred, average=None))
print("F1 SCORE:", sklearn.metrics.f1_score(test_actual, test_pred, average=None))

RECALL: [0.83050847 0.         0.76373626]
PRECISION: [0.66818182 0.         0.5       ]
F1 SCORE: [0.74055416 0.         0.60434783]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [97]:
fd_input = [
    'the sentiment140 dataset is a great tool']

def predictions(fake_docs):
    fake_counts = word_vector.transform(fake_docs)
    fake_term_freq = term_freq_transformer.transform(fake_counts)

    predicted = model.predict_proba(fake_term_freq)
    print(predicted)

predictions(fd_input)

[[0.43264136 0.56735864]]
