# Sentiment Analysis - NLTK - Vader

#### Some background: http://t-redactyl.io/blog/2017/04/using-vader-to-handle-sentiment-analysis-with-social-media-text.html

In [26]:
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [19]:
# Demo inputs for VADER. The trailing comment on each entry names the cue
# (punctuation, capitalisation, booster word, slang, ...) it is meant to exercise.
sentences = [
    # -- positive statement with increasing emphasis --
    "VADER is smart, handsome, and funny.",   # plain positive sentence
    "VADER is smart, handsome, and funny!",   # punctuation emphasis (intensity adjusted)
    "VADER is very smart, handsome, and funny.",   # booster word (intensity adjusted)
    "VADER is VERY SMART, handsome, and FUNNY.",   # ALLCAPS emphasis
    "VADER is VERY SMART, handsome, and FUNNY!!!",  # several signals combined
    "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",  # boosters + punctuation, close to the score ceiling

    # -- book reviews: qualification, mixed sentiment, negation --
    "The book was good.",           # positive sentence
    "The book was kind of good.",   # qualified positive (intensity adjusted)
    "The plot was good, but the characters are uncompelling and the dialog is not great.",  # mixed sentence with negation
    "A really bad, horrible book.",         # negative sentence with booster words
    "At least it isn't a horrible book.",   # negated negative sentence with a contraction

    # -- emoticons and the empty string --
    ":) and :D",   # emoticons
    "",            # empty string

    # -- slang --
    "Today sux",    # negative slang
    "Today sux!",   # negative slang + punctuation emphasis
    "Today SUX!",   # negative slang + capitalisation emphasis
    "Today kinda sux! But I'll get by, lol",  # mixed sentiment: slang plus the contrastive conjunction "but"
]

In [20]:
# VADER can also score longer passages once they are split into sentences.
#
# The passage is assembled from adjacent string literals, each ending in an
# explicit space. The earlier backslash-continued literal had no space after
# "reviews.", which produced "reviews.Unbelievably" and left the sentence
# tokenizer no whitespace boundary between the first two sentences.
paragraph = (
    "It was one of the worst movies I've seen, despite good reviews. "
    "Unbelievably bad acting!! Poor direction. VERY poor production. "
    "The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
)

from nltk import tokenize

# Split the passage into sentences and append them to the demo inputs.
# NOTE: extend() mutates `sentences` in place, so re-running this cell without
# re-running the cell that defines `sentences` appends the same lines again.
lines_list = tokenize.sent_tokenize(paragraph)
sentences.extend(lines_list)

In [25]:
# Score every demo sentence with VADER: print the sentence, then a single line
# holding all of its scores.
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    # Iterate the score keys in sorted order so the printed order is stable.
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    # End the line once per sentence. This call used to sit inside the inner
    # loop, which cancelled the effect of end='' and put each score on its
    # own line.
    print()

VADER is smart, handsome, and funny.
compound: 0.8316, 
neg: 0.0, 
neu: 0.254, 
pos: 0.746, 
VADER is smart, handsome, and funny!
compound: 0.8439, 
neg: 0.0, 
neu: 0.248, 
pos: 0.752, 
VADER is very smart, handsome, and funny.
compound: 0.8545, 
neg: 0.0, 
neu: 0.299, 
pos: 0.701, 
VADER is VERY SMART, handsome, and FUNNY.
compound: 0.9227, 
neg: 0.0, 
neu: 0.246, 
pos: 0.754, 
VADER is VERY SMART, handsome, and FUNNY!!!
compound: 0.9342, 
neg: 0.0, 
neu: 0.233, 
pos: 0.767, 
VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
compound: 0.9469, 
neg: 0.0, 
neu: 0.294, 
pos: 0.706, 
The book was good.
compound: 0.4404, 
neg: 0.0, 
neu: 0.508, 
pos: 0.492, 
The book was kind of good.
compound: 0.3832, 
neg: 0.0, 
neu: 0.657, 
pos: 0.343, 
The plot was good, but the characters are uncompelling and the dialog is not great.
compound: -0.7042, 
neg: 0.327, 
neu: 0.579, 
pos: 0.094, 
A really bad, horrible book.
compound: -0.8211, 
neg: 0.791, 
neu: 0.209, 
pos: 0.0, 
At least it i

## Our review data

#### We hand-annotated a few sentences; ideally we should have at least 100 (which shouldn't be too hard!).

In [27]:
import pandas as pd
# Read the annotated review sentences from a CSV in the working directory.
REVIEWS_CSV = "sentiment_train.csv"
reviews = pd.read_csv(REVIEWS_CSV)


In [49]:
def get_polarity(sentence, analyzer=None):
    """Return the polarity scores of one sentence.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)``. When omitted, the
        notebook-level ``sid`` analyzer is used, so existing one-argument
        calls (e.g. ``Series.apply(get_polarity)``) behave as before.

    Returns
    -------
    The value returned by ``analyzer.polarity_scores(sentence)``, unchanged.
    No thresholding into sentiment labels happens here; only raw scores
    are returned.
    """
    if analyzer is None:
        # Fall back to the analyzer created in an earlier cell.
        analyzer = sid
    return analyzer.polarity_scores(sentence)
    
# Score each review sentence and keep the raw per-sentence result in a new column.
sentence_scores = reviews['sentence'].apply(get_polarity)
reviews['vader_polarity'] = sentence_scores

#### The compound score is normalized and ranges from -1 to +1. We label scores in [-1, -0.25] as negative, (-0.25, 0.25) as neutral, and [0.25, 1] as positive.

In [55]:
# Cut-offs applied to the compound score: >= POS_THRESHOLD is "positive",
# <= NEG_THRESHOLD is "negative", everything in between is "neutral".
POS_THRESHOLD = 0.25
NEG_THRESHOLD = -0.25


def _label_sentiment(compound):
    """Map a compound score to "positive", "negative" or "neutral"."""
    if compound >= POS_THRESHOLD:
        return "positive"
    if compound <= NEG_THRESHOLD:
        return "negative"
    return "neutral"


# Expand the per-sentence score dicts into one column per score. The frame is
# built in a single call from the list of dicts; `.apply(pd.Series)` would
# instead construct a separate Series for every row. Passing the original
# index keeps the rows aligned for the concat below.
pol_cols_df = pd.DataFrame(reviews['vader_polarity'].tolist(), index=reviews.index)

# Put the score columns next to the review columns and drop the now-redundant
# dict column, then derive the label from the compound score.
reviews_final = pd.concat([reviews, pol_cols_df], axis=1).drop('vader_polarity', axis=1)
reviews_final['vader_sentiment'] = reviews_final['compound'].apply(_label_sentiment)

   compound    neg    neu    pos
0    0.0000  0.000  1.000  0.000
1   -0.0772  0.056  0.944  0.000
2    0.6300  0.000  0.863  0.137
3    0.5574  0.114  0.702  0.184
4    0.0000  0.000  1.000  0.000


In [56]:
# Show the reviews together with their VADER scores and derived label
# (a bare last expression, so the notebook renders the DataFrame).
reviews_final

Unnamed: 0,sentence,sentiment,compound,neg,neu,pos,vader_sentiment
0,their chicken wings is a must get,positive,0.0,0.0,1.0,0.0,neutral
1,"we also ordered the hot and sour soup, shrimp ...",negative,-0.0772,0.056,0.944,0.0,neutral
2,there was only of eating and had plenty of lef...,positive,0.63,0.0,0.863,0.137,positive
3,"the home made kim chee is the best, then have ...",positive,0.5574,0.114,0.702,0.184,positive
4,they are home made fillings and noodles,negative,0.0,0.0,1.0,0.0,neutral
5,"be careful, you have them once and you will ha...",positive,0.2244,0.0,0.927,0.073,neutral
6,the yellow curries are awesome,positive,0.6249,0.0,0.494,0.506,positive
7,kung pao is amazing,positive,0.5859,0.0,0.441,0.559,positive
8,try the black bean homemade noodles or any noo...,negative,0.0,0.0,1.0,0.0,neutral
9,"chicken wings, garlic chicken, mao pao, dry fr...",positive,0.541,0.0,0.821,0.179,positive
