In [1]:
import nltk

In [2]:
from nltk.sentiment import vader



In [3]:
# Shared VADER analyzer instance; vaderSentiment() reads it as a global.
sia = vader.SentimentIntensityAnalyzer()

In [4]:
# NOTE(review): absolute, machine-specific path -- point this at your local copy of rt-polaritydata.
positiveReviewsFileName = "/home/bhavana/Desktop/rt-polaritydata/rt-polarity.pos"

In [5]:
# Read the positive reviews, one list entry per line (trailing newlines kept).
# errors="ignore" silently drops any bytes that cannot be decoded.
with open(positiveReviewsFileName, mode='r', errors="ignore") as posFile:
    positiveReviews = list(posFile)

In [6]:
len(positiveReviews)

5331

In [7]:
positiveReviews[20]

'a thoughtful , provocative , insistently humanizing film . \n'

Doing the same with the negative reviews file:

In [8]:
# NOTE(review): absolute, machine-specific path -- point this at your local copy of rt-polaritydata.
negativeReviewsFileName = "/home/bhavana/Desktop/rt-polaritydata/rt-polarity.neg"

In [9]:
# Read the negative reviews, one list entry per line (trailing newlines kept).
# errors="ignore" silently drops any bytes that cannot be decoded.
with open(negativeReviewsFileName, mode='r', errors="ignore") as negFile:
    negativeReviews = list(negFile)

In [10]:
len(negativeReviews)

5331

In [11]:
negativeReviews[1]

"it's so laddish and juvenile , only teenage boys could possibly find it funny . \n"

#### Using VADER to classify the reviews

Create a function that returns VADER's compound score for a review:

In [12]:
def vaderSentiment(review):
    """Return VADER's compound polarity score for ``review``.

    Scores the text with the module-level ``sia`` analyzer and keeps only
    the 'compound' entry of the scores it returns.
    """
    scores = sia.polarity_scores(review)
    return scores['compound']

In [13]:
vaderSentiment("this is the best restaurant in the city")

0.6369

In [14]:
def getReviewSentiments(sentimentCalculator, posReviews=None, negReviews=None):
    """Apply ``sentimentCalculator`` to every positive and negative review.

    Args:
        sentimentCalculator: callable taking one review string and returning
            its score.
        posReviews: optional iterable of positive reviews; when None the
            module-level ``positiveReviews`` list is used.
        negReviews: optional iterable of negative reviews; when None the
            module-level ``negativeReviews`` list is used.

    Returns:
        dict with the keys 'results-on-positive' and 'results-on-negative',
        each mapping to the list of scores in input order.
    """
    # Fall back to the notebook globals only when no data is passed in, so
    # existing one-argument calls behave exactly as before.
    if posReviews is None:
        posReviews = positiveReviews
    if negReviews is None:
        negReviews = negativeReviews

    # Negative reviews are scored first, matching the original call order.
    negReviewResult = [sentimentCalculator(oneNegativeReview) for oneNegativeReview in negReviews]
    posReviewResult = [sentimentCalculator(onePositiveReview) for onePositiveReview in posReviews]
    return {'results-on-positive': posReviewResult, 'results-on-negative': negReviewResult}

In [15]:
vaderResult = getReviewSentiments(vaderSentiment)

In [16]:
vaderResult.keys()

dict_keys(['results-on-positive', 'results-on-negative'])

In [17]:
vaderResult

{'results-on-negative': [0.0258,
  0.4404,
  0.0,
  -0.25,
  0.0,
  0.4939,
  0.0,
  0.0,
  -0.34,
  -0.3612,
  -0.3678,
  0.397,
  -0.0384,
  -0.836,
  0.3818,
  -0.2565,
  0.4404,
  0.4199,
  0.0772,
  0.0,
  0.7346,
  -0.3559,
  0.2732,
  -0.0516,
  0.4939,
  0.4019,
  -0.5423,
  -0.8887,
  0.6068,
  -0.296,
  0.0772,
  0.0,
  0.5267,
  0.4939,
  -0.7845,
  -0.5865,
  0.0258,
  -0.2457,
  -0.5789,
  0.0,
  -0.25,
  -0.6808,
  0.4588,
  0.5574,
  0.802,
  -0.4767,
  0.6124,
  -0.4767,
  -0.7579,
  0.0,
  -0.5562,
  0.0516,
  0.6369,
  -0.4767,
  -0.5574,
  0.4404,
  0.8658,
  0.0,
  0.3477,
  0.5574,
  -0.8591,
  -0.5574,
  -0.5994,
  0.128,
  0.1154,
  0.34,
  0.2509,
  0.4404,
  -0.4767,
  0.0,
  0.0,
  -0.144,
  0.4215,
  0.0,
  0.2846,
  -0.5267,
  0.0,
  -0.0258,
  -0.2235,
  -0.4824,
  0.5095,
  -0.4215,
  0.4402,
  0.4019,
  -0.7269,
  0.0,
  0.875,
  0.4767,
  -0.3239,
  0.1779,
  0.8497,
  0.4404,
  0.0,
  0.3612,
  0.802,
  -0.1263,
  -0.3612,
  0.5994,
  0.0,
  0.0,
  -0.4

#### Calculating the accuracy

In [18]:
def runDiagnostics(reviewResult):
    """Print per-class and overall accuracy for a set of sentiment scores.

    Args:
        reviewResult: dict with the keys 'results-on-positive' and
            'results-on-negative', each a sequence of numeric scores (the
            shape returned by getReviewSentiments).

    A positive review counts as correct when its score is > 0 and a negative
    review when its score is < 0; a score of exactly 0 is wrong for both.
    An empty score list is reported as 0.00% instead of raising
    ZeroDivisionError. Nothing is returned; the three lines are printed.
    """
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']

    # Count each class once and reuse the counts for the overall figure.
    correctPositive = float(sum(x > 0 for x in positiveReviewsResult))
    correctNegative = float(sum(x < 0 for x in negativeReviewsResult))
    numPositive = len(positiveReviewsResult)
    numNegative = len(negativeReviewsResult)
    total = numPositive + numNegative

    # Guard every division so empty inputs do not crash the report.
    TruePositive = correctPositive / numPositive if numPositive else 0.0
    TrueNegative = correctNegative / numNegative if numNegative else 0.0
    overall = (correctPositive + correctNegative) * 100 / total if total else 0.0

    print("Accuracy on positive reviews: %.2f%%" % (TruePositive * 100))
    print("Accuracy on negative reviews: %.2f%%" % (TrueNegative * 100))
    print("Overall accuracy: %.2f%%" % overall)

In [19]:
# Reuse the scores already stored in vaderResult instead of re-scoring every review with VADER.
runDiagnostics(vaderResult)

Accuracy on positive reviews: 69.48%
Accuracy on negative reviews: 40.16%
Overall accuracy: 54.82%


#### Using Sentiwordnet

In [20]:
from nltk.corpus import sentiwordnet as swn

In [21]:
list(swn.senti_synsets('dog'))  # Python 2 returns a list here; Python 3 does not, so wrap the result in list() to display or index it

[SentiSynset('dog.n.01'),
 SentiSynset('frump.n.01'),
 SentiSynset('dog.n.03'),
 SentiSynset('cad.n.01'),
 SentiSynset('frank.n.02'),
 SentiSynset('pawl.n.01'),
 SentiSynset('andiron.n.01'),
 SentiSynset('chase.v.01')]

In [22]:
list(swn.senti_synsets('dog'))[3]

SentiSynset('cad.n.01')

In [23]:
list(swn.senti_synsets('dog'))[3].pos_score()

0.0

In [24]:
list(swn.senti_synsets('dog'))[3].neg_score()

1.0

In [25]:
def superNaiveSentiment(review):
    """Score ``review`` using only the first SentiWordNet synset of each word.

    The review is lower-cased and split on whitespace. For every token the
    first synset returned by ``swn.senti_synsets`` is consulted: its positive
    score is added when it exceeds the negative score, its negative score is
    subtracted when that one is larger, and ties contribute nothing. Tokens
    without any synset are skipped.

    Args:
        review: the review text.

    Returns:
        float: the summed polarity (0.0 when no token contributes).
    """
    reviewPolarity = 0.0
    for word in review.lower().split():
        try:
            # next(..., None) gives None for a word without synsets instead
            # of relying on an IndexError for control flow.
            common_meaning = next(iter(swn.senti_synsets(word)), None)
        except Exception:
            # Best-effort: a failed lookup contributes nothing. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate,
            # so a long scoring run can still be interrupted.
            continue
        if common_meaning is None:
            continue

        pos = common_meaning.pos_score()
        neg = common_meaning.neg_score()
        if pos > neg:
            reviewPolarity += pos
        elif neg > pos:
            reviewPolarity -= neg
    return reviewPolarity

In [26]:
runDiagnostics(getReviewSentiments(superNaiveSentiment))

Accuracy on positive reviews: 65.11%
Accuracy on negative reviews: 42.86%
Overall accuracy: 53.99%


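#### Now we try to improve the performance in two ways

1. Skip stop words and punctuation, which carry no sentiment of their own.
2. Instead of looking only at the first meaning of a word, average the polarity over all of its meanings.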

In [27]:
from string import punctuation

In [28]:
from nltk.corpus import stopwords

In [29]:
# Read the corpus through nltk.corpus.stopwords rather than the bare name:
# this assignment rebinds ``stopwords`` from the corpus reader to a set, so
# the original form raised AttributeError when the cell was run a second time.
stopwords = set(nltk.corpus.stopwords.words('english') + list(punctuation))

In [30]:
list(punctuation)

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~']

In [31]:
stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'need

In [32]:
def naiveSentiment(review):
    """Score ``review`` by averaging SentiWordNet polarity over each word's meanings.

    The review is lower-cased and split on whitespace; tokens found in the
    module-level ``stopwords`` collection are skipped. For each remaining
    token, every synset from ``swn.senti_synsets`` whose positive and
    negative scores differ contributes ``pos_score - neg_score``; the token's
    weight is the mean of those contributions. Synsets with equal scores are
    left out of both the sum and the count.

    Args:
        review: the review text.

    Returns:
        float: the sum of the per-token mean polarities (0.0 when no token
        contributes).
    """
    reviewPolarity = 0.0
    for word in review.lower().split():
        if word in stopwords:
            continue
        try:
            meanings = list(swn.senti_synsets(word))
        except Exception:
            # Best-effort: a failed lookup contributes nothing. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate,
            # so a long scoring run can still be interrupted.
            continue

        weight = 0.0
        numMeanings = 0
        for meaning in meanings:
            pos = meaning.pos_score()
            neg = meaning.neg_score()
            if pos == neg:
                # Neutral meanings are excluded from the average.
                continue
            weight += pos - neg
            numMeanings += 1

        if numMeanings > 0:
            reviewPolarity += weight / numMeanings
    return reviewPolarity

In [33]:
runDiagnostics(getReviewSentiments(naiveSentiment))

Accuracy on positive reviews: 75.58%
Accuracy on negative reviews: 42.79%
Overall accuracy: 59.18%


The procedure above removes stop words, and negation words such as "not" are among them. Negation is therefore ignored, and a negated sentence gets exactly the same score as the original one. For example:

In [34]:
review = "This is a best restaurant in the city"
naiveSentiment(review)

0.5630810810810811

In [35]:
review = "This is not a best restaurant in the city"
naiveSentiment(review)

0.5630810810810811

Since these rule-based approaches do not improve the accuracy on negative reviews, we move on to an ML-based approach.