In [None]:
### Improved sentiment analysis classifier ###
# Uses k-fold cross validation and Naive Bayes, Decision Tree, and Bernoulli ML models #
# Outputs the average accuracy and precision of each model #
import tarfile
import collections
import nltk
import sys
import random
import string
import re
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk.classify import SklearnClassifier
from nltk import NaiveBayesClassifier, DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics.scores import precision, recall
from nltk.stem import WordNetLemmatizer
import os

In [None]:
# only run this cell once for download if necessary
# Fetches the 'wordnet' package through nltk's downloader; presumably this is
# the data backing the WordNetLemmatizer imported above -- confirm if the
# lemmatizer ever raises a missing-resource error.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# training/testing data files
# Each archive is opened in a `with` block so its file handle is closed as
# soon as extraction finishes, even if extraction raises.
with tarfile.open("../Data/review_polarity.tar.gz", "r") as polaritytar:
    polaritytar.extractall('../Data/Polarity_Data')

# NOTE(review): this archive is read from, and extracted into, the current
# working directory ('NRC_Data'), while the lexicon is later opened from
# '../Data/NRC_Data/...'. Confirm which location is intended.
with tarfile.open("NRC-Sentiment-Emotion-Lexicons.tar.gz", 'r') as nrctar:
    nrctar.extractall('NRC_Data')

In [None]:
# get all the lines from all the reviews
# Lines are accumulated across every file in a directory (extend, not
# reassign), so the result holds the whole corpus rather than just the last
# file listed. File names are sorted because os.listdir returns them in
# arbitrary order, which would otherwise make the seeded k-fold split differ
# between machines.

# lines from negative reviews
neglines = []
for nfilename in sorted(os.listdir('../Data/Polarity_Data/txt_sentoken/neg')):
    with open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename), "r") as review_file:
        neglines.extend(review_file.readlines())

# lines from positive reviews
poslines = []
for pfilename in sorted(os.listdir('../Data/Polarity_Data/txt_sentoken/pos')):
    with open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename), "r") as review_file:
        poslines.extend(review_file.readlines())

# raw lines of the NRC word-level emotion lexicon (parsed in a later cell)
with open('../Data/NRC_Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt') as intensity_file:
    intensity_lines = intensity_file.readlines()

In [None]:
lemmatizer = WordNetLemmatizer()
# maps a lemmatized lexicon word to one emotion label it is associated with
word_emotions = dict()


# use the data from the emotion lexicon file (its first line is skipped)
for line in intensity_lines[1:]:
    features = line.split()
    # features[0]: the word
    # features[1]: the emotion label (compared later against the
    #              positive/negative emotion lists)
    # features[2]: '1' when the word is associated with that emotion
    if len(features) < 3:
        # blank or malformed line: nothing to record, and indexing would fail
        continue
    if features[2] == '1':
        # A word flagged for several emotions keeps only the last one seen,
        # because each assignment overwrites the previous value for that key.
        word_emotions[lemmatizer.lemmatize(features[0])] = features[1]

In [None]:
# Split every review line on whitespace; each line becomes its own token list.

# tokens for positive reviews
poslines_tokens = [pos_line.split() for pos_line in poslines]
# tokens for negative reviews
neglines_tokens = [neg_line.split() for neg_line in neglines]

In [None]:
# helper function to drop punctuation tokens, then lowercase and lemmatize the rest
def clean_tokens(tokens):
    """Return the lowercased, lemmatized form of every token that is kept.

    A token is skipped when it is empty or when it occurs as a substring of
    ``string.punctuation`` (so single punctuation marks are dropped, as are
    runs such as "()" that happen to appear contiguously in that constant).
    Lemmatization uses the module-level ``lemmatizer``.
    """
    # removing stop words was tried here and decreased performance significantly;
    # pos-tagging the cleaned tokens was also tried and was not useful
    return [
        lemmatizer.lemmatize(raw.lower())
        for raw in tokens
        if raw and raw not in string.punctuation
    ]

In [None]:
# clean up the tokens list: run clean_tokens over every tokenized line
positive_cleaned_tokens = list(map(clean_tokens, poslines_tokens))
negative_cleaned_tokens = list(map(clean_tokens, neglines_tokens))

In [None]:
# helper function to turn each token list into a feature dict for the classifiers
def create_model(cleaned_tokens_list):
    """Lazily yield one feature dict per token list.

    Every token of an entry becomes a key mapped to True; a token that occurs
    more than once collapses into a single key.
    """
    for token_group in cleaned_tokens_list:
        yield dict.fromkeys(token_group, True)
# generators of feature dicts, one dict per cleaned review line
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

# categorize the tokens in each review: pair every feature dict with its label
positive_dataset = []
for positive_features in positive_tokens_for_model:
    positive_dataset.append((positive_features, "Positive"))

negative_dataset = []
for negative_features in negative_tokens_for_model:
    negative_dataset.append((negative_features, "Negative"))

# lexicon emotion labels, grouped by the polarity they are treated as
positive_emotions = ['positive', 'anticipation', 'joy', 'surprise', 'trust']
negative_emotions = ['anger', 'disgust', 'fear', 'negative', 'sadness']
# here we remove words from "conflicting" sentiments from the reviews
# i.e. if there is a word in a review marked as positive that has a "sadness" label, that word will be removed

In [None]:
# remove negative words from the set of positive reviews
# NOTE(review): this filter relies on each review's label and runs on the full
# dataset before the cross-validation split, so held-out folds are filtered
# with knowledge of their labels -- this likely inflates the reported scores.

# Unique conflicting words in first-seen order. Deduplicating keeps the
# removal pass from repeating the same lookup once per occurrence.
pos_to_remove = list(dict.fromkeys(
    word
    for (review, sentiment) in positive_dataset
    for word in review
    if word in word_emotions and word_emotions[word] in negative_emotions
))
pos_removal_set = set(pos_to_remove)

for (review, sentiment) in positive_dataset:
    # Snapshot the matching keys first: a dict must not change size while it
    # is being iterated.
    for neg_word in [word for word in review if word in pos_removal_set]:
        review.pop(neg_word)

In [None]:
# remove positive words from the set of negative reviews
# NOTE(review): this filter relies on each review's label and runs on the full
# dataset before the cross-validation split, so held-out folds are filtered
# with knowledge of their labels -- this likely inflates the reported scores.

# Unique conflicting words in first-seen order. Deduplicating keeps the
# removal pass from repeating the same lookup once per occurrence.
neg_to_remove = list(dict.fromkeys(
    word
    for (review, sentiment) in negative_dataset
    for word in review
    if word in word_emotions and word_emotions[word] in positive_emotions
))
neg_removal_set = set(neg_to_remove)

for (review, sentiment) in negative_dataset:
    # Snapshot the matching keys first: a dict must not change size while it
    # is being iterated.
    for pos_word in [word for word in review if word in neg_removal_set]:
        review.pop(pos_word)

# full labeled dataset: positive entries followed by negative entries
dataset = positive_dataset + negative_dataset

In [None]:
# Array form of the (featureset, label) pairs so the index arrays produced by
# KFold can select the rows of each fold directly.
np_dataset = np.array(dataset)
# use k-fold cross validation with k = 10 to train and test
kfold = KFold(n_splits=10, shuffle=True, random_state=35)
# per-fold scores for each classifier (averaged when the results are printed)
nb_mean_accuracy, dt_mean_accuracy, bern_mean_accuracy = list(), list(), list()
nb_mean_precision, dt_mean_precision, bern_mean_precision = list(), list(), list()

for train, test in kfold.split(np_dataset):
    train_data, test_data = np_dataset[train], np_dataset[test]

    # naive bayes classifier
    nb_classifier = NaiveBayesClassifier.train(train_data)
    # decision tree classifier
    dt_classifier = DecisionTreeClassifier.train(train_data)
    # bernoulli classifier
    bern_classifier = SklearnClassifier(BernoulliNB()).train(train_data)

    # each classifier paired with the score lists it reports into
    fold_results = [
        (nb_classifier, nb_mean_accuracy, nb_mean_precision),
        (dt_classifier, dt_mean_accuracy, dt_mean_precision),
        (bern_classifier, bern_mean_accuracy, bern_mean_precision),
    ]

    for classifier, accuracies, precisions in fold_results:
        accuracies.append(classify.accuracy(classifier, test_data))

        # Fresh index sets for every classifier: sharing them would mix one
        # classifier's predictions into the next classifier's precision.
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)
        for i, (feats, label) in enumerate(test_data):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        for label in ('Positive', 'Negative'):
            score = precision(refsets[label], testsets[label])
            # precision() yields None when the label was never predicted;
            # skip only that case so a genuine 0.0 still counts in the mean.
            if score is not None:
                precisions.append(score)

In [None]:
# print the mean accuracy and precision across all the folds for each classifier
score_report = [
    ("Naive Bayes accuracy:", nb_mean_accuracy, "Naive Bayes precision:", nb_mean_precision),
    ("Decision Tree accuracy:", dt_mean_accuracy, "Decision Tree precision:", dt_mean_precision),
    ("Bernoulli accuracy:", bern_mean_accuracy, "Bernoulli precision:", bern_mean_precision),
]

for position, (accuracy_label, accuracies, precision_label, precisions) in enumerate(score_report):
    if position > 0:
        # blank separator between classifiers, none after the last one
        print('\n')
    print(accuracy_label, np.mean(accuracies))
    print(precision_label, np.mean(precisions))

Naive Bayes accuracy: 0.8066666666666666
Naive Bayes precision: 0.8888888888888888


Decision Tree accuracy: 0.8066666666666666
Decision Tree precision: 0.8157407407407408


Bernoulli accuracy: 0.7166666666666666
Bernoulli precision: 0.7222222222222222
