In [18]:
# get all of the data
# and format it in a workable way

import util
import os

# Read the raw tweet rows from tweets.csv (resolved relative to the current
# working directory) and hand them to util.parse.
tweets_csv = os.path.join('.', 'tweets.csv')
data = util.read_csv(tweets_csv)
parsed_data = util.parse(data)

# Derive the two sub-problem datasets from the parsed rows: one produced by
# util.polar_neutral_split and one by util.pos_neg_split.
polar_neutral_data = util.polar_neutral_split(parsed_data)
pos_neg_data = util.pos_neg_split(parsed_data)

# Build one (X, y) pair per dataset. util.generate_x_y is called in the order
# whole set, polar/neutral, pos/neg, and each result is unpacked into its
# own pair of names.
(all_X, all_y), (polar_neutral_X, polar_neutral_y), (pos_neg_X, pos_neg_y) = (
    util.generate_x_y(dataset)
    for dataset in (parsed_data, polar_neutral_data, pos_neg_data)
)

In [3]:
# find baseline accuracy based on existing modules
# more specifically textblob and vaderSentiment
# start with textblob

from textblob import TextBlob

# Score every tweet with TextBlob. Only the `polarity` field of each
# sentiment result is used below.
textblob_results = [TextBlob(x).sentiment for x in all_X]

# Cut-off that splits the polarity scale into three bands. It is written with
# a float literal on purpose: the integer expressions `1/3` and `-1/3` floor
# to 0 and -1 under Python 2 division, which would silently move both
# thresholds. With a float the cell gives the same labels on either
# interpreter.
TEXTBLOB_POLARITY_CUTOFF = 1.0 / 3


def textblob_label(polarity, cutoff=TEXTBLOB_POLARITY_CUTOFF):
    """Map a polarity score to a class label.

    Args:
        polarity: numeric polarity score of one text.
        cutoff: positive threshold; the neutral band is
            ``-cutoff <= polarity < cutoff``.

    Returns:
        -1 if ``polarity < -cutoff``, 0 if ``polarity < cutoff``,
        otherwise 1.
    """
    if polarity < -cutoff:
        return -1
    if polarity < cutoff:
        return 0
    return 1


# One predicted label per tweet, in the same order as all_X / all_y.
textblob_y = [textblob_label(result.polarity) for result in textblob_results]

textblob_acc = util.get_accuracy(all_y, textblob_y)
print('textblob accuracy: %f' % textblob_acc)

textblob accuracy: 0.333538


In [4]:
# now move on to vader

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# One polarity_scores result per tweet; only its 'compound' entry is read.
vader_results = [analyzer.polarity_scores(x) for x in all_X]


def vader_label(compound):
    """Map a compound score to a label.

    Returns -1 when ``compound <= -0.05``, 0 when ``compound < 0.05``,
    and 1 otherwise. The comparisons are evaluated in that order.
    """
    return -1 if compound <= -0.05 else (0 if compound < 0.05 else 1)


# Predicted label for every tweet, aligned with all_X / all_y.
vader_y = [vader_label(scores['compound']) for scores in vader_results]

vader_acc = util.get_accuracy(all_y, vader_y)
print('vaderSentiment accuracy: %f' % vader_acc)

vaderSentiment accuracy: 0.542828


In [19]:
# Build our own sentiment classifier and evaluate it, starting with the
# complete dataset (all_X / all_y).

classifier = util.generate_classifier()

# The leading 10 is passed straight to util.cross_validate -- presumably the
# number of folds; confirm against util.
train_acc, test_acc = util.cross_validate(
    10, classifier, all_X, all_y
)

# NOTE(review): the output saved with this cell reads "Training accuracy: ...",
# which this format string cannot produce -- it may be printed inside
# util.cross_validate or be stale; confirm on a fresh run.
report = 'all_x train acc: %f, all_x test acc: %f' % (train_acc, test_acc)
print(report)

Training accuracy: 0.795128, testing accuracy: 0.789617


In [None]:
# Second experiment: the same evaluation restricted to the polar-vs-neutral
# data. Reuses the `classifier` object created by util.generate_classifier()
# in the whole-dataset cell, so that cell has to run first.

train_acc, test_acc = util.cross_validate(
    10, classifier, polar_neutral_X, polar_neutral_y
)

report = 'polar-neutral train acc: %f, polar-neutral test acc: %f' % (train_acc, test_acc)
print(report)

In [None]:
# Third experiment: the same evaluation restricted to the positive-vs-negative
# data. Reuses the `classifier` object created by util.generate_classifier()
# in the whole-dataset cell, so that cell has to run first.

train_acc, test_acc = util.cross_validate(
    10, classifier, pos_neg_X, pos_neg_y
)

report = 'pos-neg train acc: %f, pos-neg test acc: %f' % (train_acc, test_acc)
print(report)