# Sentiment Analysis - NLTK - Vader

#### Some background : http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html; Paper for reference : http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf

In [None]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Example inputs for the analyzer; the comment above each entry names the cue
# (punctuation, capitalization, booster words, negation, slang, ...) it illustrates.
sentences = [
    # plain positive sentence
    "VADER is smart, handsome, and funny.",
    # punctuation emphasis
    "VADER is smart, handsome, and funny!",
    # booster word ("very")
    "VADER is very smart, handsome, and funny.",
    # ALLCAPS emphasis
    "VADER is VERY SMART, handsome, and FUNNY.",
    # combination of signals: ALLCAPS plus repeated punctuation
    "VADER is VERY SMART, handsome, and FUNNY!!!",
    # booster words, ALLCAPS and punctuation together
    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",
    # plain positive sentence
    "The book was good.",
    # qualified positive sentence ("kind of")
    "The book was kind of good.",
    # mixed sentence with negation
    "The plot was good, but the characters are uncompelling and the dialog is not great.",
    # negative sentence with booster words
    "A really bad, horrible book.",
    # negated negative sentence with a contraction
    "At least it isn't a horrible book.",
    # emoticons
    ":) and :D",
    # empty string
    "",
    # negative slang
    "Today sux",
    # negative slang with punctuation emphasis
    "Today sux!",
    # negative slang with capitalization emphasis
    "Today SUX!",
    # mixed sentiment with slang and the contrastive conjunction "but"
    "Today kinda sux! But I'll get by, lol",
]

In [None]:
# Longer passages can be scored too: split the text into sentences with
# NLTK's sentence tokenizer and append them to `sentences`.
#
# The pieces are joined by implicit string-literal concatenation and every
# piece but the last ends with a space. The previous backslash continuation
# had no space after "despite good reviews.", so the text contained
# "reviews.Unbelievably" and the tokenizer had no whitespace boundary between
# the first two sentences.
paragraph = (
    "It was one of the worst movies I've seen, despite good reviews. "
    "Unbelievably bad acting!! Poor direction. VERY poor production. "
    "The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
)

from nltk import tokenize
lines_list = tokenize.sent_tokenize(paragraph)
sentences.extend(lines_list)

In [None]:
# Score every example sentence and print it followed by a single line holding
# all of its polarity scores, with the score names in sorted order.
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        # end='' keeps all scores of one sentence on the same output line.
        print('{0}: {1}, '.format(k, ss[k]), end='')
    # End the score line once per sentence. This call used to sit inside the
    # inner loop, where it cancelled end='' and put each score on its own line.
    print()

## Our review data

#### We hand-annotated a few sentences; ideally we should have at least 100 sentences (which shouldn't be too hard!).

In [None]:
import pandas as pd

# Read the annotated review sentences from the CSV next to the notebook.
reviews = pd.read_csv(filepath_or_buffer="sentiment_test.csv")


In [None]:
def get_polarity(sentence):
    """Return the polarity scores of ``sentence``.

    Delegates to the module-level analyzer ``sid`` and hands back whatever
    ``sid.polarity_scores`` returns, unchanged.
    """
    return sid.polarity_scores(sentence)


# Score each review sentence; the new column stores the result for every row.
reviews['vader_polarity'] = reviews['sentence'].apply(get_polarity)

#### The compound score is normalized and ranges from -1 to +1. We label scores in [-1, -0.25] as negative, scores in (-0.25, 0.25) as neutral, and scores in [0.25, 1] as positive.

In [None]:
def _label_from_compound(score):
    """Map a compound score to a label using the +/-0.25 cut-offs.

    Scores >= 0.25 are "positive", scores <= -0.25 are "negative", and
    everything else is "neutral".
    """
    if score >= 0.25:
        return "positive"
    if score <= -0.25:
        return "negative"
    return "neutral"


# Spread the per-row score mapping over one column per score, append those
# columns to the review data, and drop the raw mapping column.
pol_cols_df = reviews['vader_polarity'].apply(pd.Series)
reviews_final = pd.concat(
    [reviews, pol_cols_df],
    axis=1,
).drop(columns='vader_polarity')
reviews_final['vader_sentiment'] = reviews_final['compound'].apply(_label_from_compound)

In [None]:
# Percentage of rows whose VADER label equals the annotated `sentiment` label.
agreement_mask = reviews_final["sentiment"] == reviews_final["vader_sentiment"]
len(reviews_final[agreement_mask]) / len(reviews_final) * 100

In [None]:
# Load predictions_sentence.csv and preview its first five rows.
annotations = pd.read_csv(filepath_or_buffer="predictions_sentence.csv")
annotations.head(n=5)

In [None]:
# Keep the rows whose `entities` value is not the literal string "[]", draw 70
# of them at random, and write the sample to sentiment_test_1.csv without the index.
has_entities = annotations["entities"] != "[]"
sampled_rows = annotations[has_entities].sample(n=70)
sampled_rows.to_csv("sentiment_test_1.csv", index=False)

In [None]:
# Cast the sentences to str, score each one with the analyzer `sid`, expand the
# per-row scores into separate columns, and label rows with the +/-0.25 cut-offs.
annotations['sentence'] = annotations['sentence'].astype('str')
annotations['vader_polarity'] = annotations['sentence'].apply(sid.polarity_scores)
pol_cols_df = annotations['vader_polarity'].apply(pd.Series)
annotations_final = pd.concat(
    [annotations, pol_cols_df],
    axis=1,
).drop(columns='vader_polarity')
annotations_final['vader_sentiment'] = annotations_final['compound'].apply(
    lambda compound: "negative" if compound <= -0.25
    else ("positive" if compound >= 0.25 else "neutral")
)

In [None]:
# Display the annotations together with the score columns and the
# `vader_sentiment` label added in the previous cell.
annotations_final

## LR Sentiment Analysis

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
# Number of sentences taken from each category of the subjectivity corpus.
n_instances = 100

# Each document is a (sentence, label) tuple. The sentence is tokenized, so it
# is represented by a list of strings.
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
print(subj_docs)
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
print(len(subj_docs), len(obj_docs))
# Output noted in the original cell: (100, 100)

print(subj_docs[0])
# Output noted in the original cell:
# (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
# 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')

# Subjective and objective instances are split separately (first 80 for
# training, next 20 for testing) to keep a balanced class distribution in
# both the train and the test set.
train_subj_docs, test_subj_docs = subj_docs[:80], subj_docs[80:100]
train_obj_docs, test_obj_docs = obj_docs[:80], obj_docs[80:100]
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

# Simple unigram word features, with negation marked on the training documents.
sentim_analyzer = SentimentAnalyzer()
negation_marked_docs = [mark_negation(doc) for doc in training_docs]
all_words_neg = sentim_analyzer.all_words(negation_marked_docs)
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
print(len(unigram_feats))
# Output noted in the original cell: 83
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Apply the feature extractor to get a feature-value representation of both sets.
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)

# Train the classifier on the training set, then print the evaluation results
# on the test set in sorted metric-name order.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
evaluation = sentim_analyzer.evaluate(test_set)
for key in sorted(evaluation):
    print('{0}: {1}'.format(key, evaluation[key]))
# Output noted in the original cell:
# Training classifier
# Evaluating NaiveBayesClassifier results...
# Accuracy: 0.8
# F-measure [obj]: 0.8
# F-measure [subj]: 0.8
# Precision [obj]: 0.8
# Precision [subj]: 0.8
# Recall [obj]: 0.8
# Recall [subj]: 0.8

In [None]:
import nltk
from nltk.classify import NaiveBayesClassifier
import os
from random import shuffle
import csv
from nltk.classify.util import accuracy
from nltk.tokenize import word_tokenize
from nltk.corpus import movie_reviews



def format_sentence(sent):
    """Convert raw text into a ``{token: True}`` feature dict.

    The text is split with ``nltk.word_tokenize`` and every resulting token
    becomes a key mapped to ``True``.
    """
    tokens = nltk.word_tokenize(sent)
    return dict.fromkeys(tokens, True)
def create_word_features(words):
    """Build a bag-of-words feature dict from an iterable of tokens.

    Args:
        words: Iterable of hashable tokens, e.g. the words of one review.

    Returns:
        A dict mapping every distinct token to ``True``, in first-seen order.
        Empty input gives an empty dict.
    """
    # One pass over the tokens; no intermediate list copy or list of pairs.
    return dict.fromkeys(words, True)


# Build one (features, label) pair per file of the movie review corpus.
neg_reviews = [
    (create_word_features(movie_reviews.words(fileid)), "negative")
    for fileid in movie_reviews.fileids('neg')
]
print(len(neg_reviews))

pos_reviews = [
    (create_word_features(movie_reviews.words(fileid)), "positive")
    for fileid in movie_reviews.fileids('pos')
]
print(len(pos_reviews))

# The first 750 reviews of each class form the training set; the rest is the test set.
train_set = neg_reviews[:750] + pos_reviews[:750]
test_set = neg_reviews[750:] + pos_reviews[750:]
print(len(train_set), len(test_set))



# PATH_TO_DATA = '/Users/mithramuthukrishnan/Documents/CS585/aclImdb/'
# TRAIN_DIR = os.path.join(PATH_TO_DATA, "train")
# TEST_DIR = os.path.join(PATH_TO_DATA, "test")

# train = []
# for x in os.listdir(TRAIN_DIR + "/" + 'pos'):
#     f = open(TRAIN_DIR + "/pos/" + x).read()
#     train.append((format_sentence(f),'pos'))
# for x in os.listdir(TRAIN_DIR + "/" + 'neg'):
#     f = open(TRAIN_DIR + "/neg/" + x).read()
#     train.append((format_sentence(f),'neg'))
    
# shuffle(train)

# Train a Naive Bayes classifier on the movie-review features, show its most
# informative features, and report accuracy on the held-out movie reviews.
classifier = NaiveBayesClassifier.train(train_set)

classifier.show_most_informative_features()
print("Accuracy of movie",accuracy(classifier, test_set))

# Evaluate the same classifier on the annotated review sentences.
# The file is opened by the same relative name that the pandas cell uses
# ("sentiment_test.csv") rather than an absolute path into one user's home
# directory, so the notebook runs wherever that CSV sits next to it.
# newline='' is what the csv module expects for files handed to csv.reader.
test = []
with open('sentiment_test.csv', newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip exactly the one header row. The earlier counter check
    # (`header > 1`) also discarded the first data row.
    next(csv_reader, None)
    for row in csv_reader:
        # Blank or truncated lines have no label column; ignore them.
        if len(row) < 2:
            continue
        text, label = row[0], row[1]
        # The classifier only knows these two labels, so rows with any other
        # label (e.g. neutral) are left out of the evaluation.
        if label in ('positive', 'negative'):
            test.append((format_sentence(text), label))

print("Accuracy of yelp dataset",accuracy(classifier,test))






# vocab = set()
# for passage in train:
#     review = passage[0].lower()
#     for word in word_tokenize(review):
#         vocab.add(word)
# #     print(word_tokenize(review))
# vocab

# dictionary = set(word.lower() for passage in train for word in word_tokenize(passage[0]))



#t = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in train]


        