In [129]:
import os
import nltk
import pickle

In [None]:
# Locations of the positive and negative review files.
# NOTE(review): these are hard-coded absolute paths — adjust them (or derive
# them from a configurable data directory) before running elsewhere.
posReviewFileName = "/data/rt-polaritydata/rtpolarity.pos"
negReviewFileName = "/data/rt-polaritydata/rtpolarity.neg"

In [5]:
# Load the raw reviews, one list entry per line of each file (trailing
# newlines are kept by readlines()).
# An explicit encoding is passed so that decoding does not depend on the
# platform default. latin-1 maps every byte value to a character, so reading
# cannot raise a decode error.
# NOTE(review): latin-1 is an assumption — confirm the real encoding of the
# data files.
with open(posReviewFileName, 'r', encoding='latin-1') as f:
    positiveReviews = f.readlines()
with open(negReviewFileName, 'r', encoding='latin-1') as f:
    negativeReviews = f.readlines()

### Train Test Split

In [6]:
# Index at which each class's review list is cut: entries before it are used
# for training, entries from it onwards for testing.
testTrainingSplitIndex = 2500

In [8]:
# Training set: the first `testTrainingSplitIndex` reviews of each class.
# NOTE(review): the split is positional (no shuffling) — confirm the files are
# not ordered in a way that biases it.
trainPositiveReviews = positiveReviews[:testTrainingSplitIndex]
trainNegativeReviews = negativeReviews[:testTrainingSplitIndex]

In [66]:
len(trainNegativeReviews)

2500

In [9]:
# Test set: every review of each class from index `testTrainingSplitIndex`
# onwards, i.e. the reviews not used for training.
testPositiveReviews = positiveReviews[testTrainingSplitIndex:]
testNegativeReviews = negativeReviews[testTrainingSplitIndex:]

### Build Vocabulary

In [12]:
# Flatten each training set into one list of whitespace-separated tokens.
# Joining the reviews with a space and splitting once yields the same tokens,
# in the same order, as splitting every review on its own.
PositiveWordList = " ".join(trainPositiveReviews).split()
NegativeWordList = " ".join(trainNegativeReviews).split()

In [15]:
# Every training token, positive reviews first and then negative ones,
# duplicates included.
AllWordList = PositiveWordList + NegativeWordList

In [17]:
# Vocabulary = the distinct training tokens. Sorting makes the order
# reproducible: the iteration order of a set of strings can differ between
# interpreter runs because string hashing is randomised per process.
vocabulary = sorted(set(AllWordList))

### Extract Features Function

In [67]:
# Extract Feature Vector from Reviews

def extractFeatures(review, vocab=None):
    """Build a word-presence feature dict for one tokenised review.

    Args:
        review: iterable of tokens (for example the result of
            ``text.split()``). Passing a plain string would be treated as
            an iterable of single characters.
        vocab: optional iterable of words to use as feature names. When
            omitted, the module-level ``vocabulary`` is used.

    Returns:
        dict mapping each vocabulary word to ``True`` if that word occurs
        in ``review`` and ``False`` otherwise.
    """
    if vocab is None:
        # Resolved at call time so the function follows the current vocabulary.
        vocab = vocabulary
    # A set gives constant-time membership tests for the per-word lookups.
    review_words = set(review)
    return {word: (word in review_words) for word in vocab}

### Transform Data For Training

In [68]:
# Build the labelled training set: a list of (tokens, label) tuples, with all
# negative training reviews first, followed by the positive ones.

# Each review is tokenised on whitespace and tagged with its class label.
negTaggedRevTrainList = [
    {'review': text.split(), 'label': 'negative'}
    for text in trainNegativeReviews
]
posTaggedRevTrainList = [
    {'review': text.split(), 'label': 'positive'}
    for text in trainPositiveReviews
]
fullTaggedRevTrainList = negTaggedRevTrainList + posTaggedRevTrainList
trainingData = [
    (entry['review'], entry['label'])
    for entry in fullTaggedRevTrainList
]

In [127]:
# Preview the first five (tokens, label) pairs. The loop variable is called
# `sample` so that it does not reuse the name of the list being iterated.
[print(sample) for sample in trainingData[0:5]]

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'negative')
(["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], 'negative')
(['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.'], 'negative')
(['[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study', ',', 'exhuming', 'instead', ',', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation', '.'], 'negative')
(['a', 'visually', 'flashy', 'but', 'narratively', 'opaque', 'and', 'emotionally', 'vapid', 'exercise', 'in', 'style', 'and', 'mystification', '.'], 'negative')


[None, None, None, None, None]

### Feature Extraction Using nltk

In [69]:
# Input: the feature-extraction function and the labelled training data.
# Output: the training data in the feature-set form that is passed to the
# classifier's train() call.
# NOTE(review): NLTK documents apply_features as building this mapping lazily
# rather than materialising every feature dict up front — confirm for the
# installed NLTK version.

trainingFeatures = nltk.classify.apply_features(extractFeatures,trainingData)

### Train model

In [73]:
# Fit an NLTK Naive Bayes classifier on the labelled training features.
trainedNBClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)

In [74]:
trainedNBClassifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x217ee871288>

In [75]:
def naiveBayesSentimentCalculator(review):
    """Classify the sentiment of a single raw review string.

    Args:
        review: review text as one string.

    Returns:
        The label predicted by ``trainedNBClassifier`` for the review's
        word-presence features.
    """
    # Lower-case before tokenising so that capitalised input such as "Movie"
    # can match a lower-case vocabulary entry such as "movie".
    # NOTE(review): assumes the training corpus is entirely lower-case, as the
    # training samples printed in this notebook are — confirm.
    # Tokens are split on whitespace only, so punctuation attached to a word
    # (e.g. "movie,") stays part of that token.
    problemInstance = review.lower().split()
    problemFeatures = extractFeatures(problemInstance)
    return trainedNBClassifier.classify(problemFeatures)

In [103]:
def getTestReviewSentiments(naiveBayesSentimentCalculator):
    """Run a sentiment function over the held-out test reviews.

    Args:
        naiveBayesSentimentCalculator: callable that takes one raw review
            string and returns the label 'positive' or 'negative'.

    Returns:
        dict with the keys 'positive_results' and 'negative_results'. Each
        holds one number per test review of that true class: 1 when the
        prediction was 'positive', -1 when it was 'negative'.
    """
    polarity = {'positive': 1, 'negative':-1}

    # Classify the negative test reviews first, then the positive ones.
    negLabels = list(map(naiveBayesSentimentCalculator, testNegativeReviews))
    posLabels = list(map(naiveBayesSentimentCalculator, testPositiveReviews))

    # Translate the textual labels into signed numbers.
    negScores = [polarity[label] for label in negLabels]
    posScores = [polarity[label] for label in posLabels]

    return { 'positive_results' : posScores, 'negative_results' : negScores}

In [104]:
# To test accuracy
def runDiagnostics(reviewResult):
    """Print per-class and overall accuracy for a set of test predictions.

    Args:
        reviewResult: dict with the keys 'positive_results' and
            'negative_results', each a non-empty sequence of numeric
            predictions for reviews whose true class is positive /
            negative. A value > 0 counts as a positive prediction, a
            value < 0 as a negative one.

    Returns:
        None. The three accuracy figures are printed as percentages with
        two decimals.

    Raises:
        ZeroDivisionError: if either result sequence is empty.
    """
    positives = reviewResult['positive_results']
    negatives = reviewResult['negative_results']

    # Correct predictions and accuracy fraction for the positive class.
    hitsPositive = sum(1 for score in positives if score > 0)
    fracPositive = hitsPositive / len(positives)

    # Same for the negative class.
    hitsNegative = sum(1 for score in negatives if score < 0)
    fracNegative = hitsNegative / len(negatives)

    # Overall accuracy across both classes, already expressed in percent.
    sampleCount = len(positives) + len(negatives)
    overallPct = ((hitsPositive + hitsNegative) * 100) / sampleCount

    report = (
        ("Accuracy of positive reviews : ", fracPositive * 100),
        ("Accuracy of negative reviews : ", fracNegative * 100),
        ("Overall Accuracy : ", overallPct),
    )
    for caption, pct in report:
        print(caption + "%.2f" % pct + "%")

In [105]:
runDiagnostics(getTestReviewSentiments(naiveBayesSentimentCalculator))

Accuracy of positive reviews : 73.40%
Accuracy of negative reviews : 77.08%
Overall Accuracy : 75.24%


In [108]:
naiveBayesSentimentCalculator("What an awesome movie")

'positive'

In [111]:
naiveBayesSentimentCalculator("Movie was really bad")

'negative'

In [113]:
naiveBayesSentimentCalculator("awesome movie, great direction")

'positive'

In [114]:
naiveBayesSentimentCalculator("awesome movie, bad direction")

'negative'

In [117]:
naiveBayesSentimentCalculator("above average movie, wonderful")

'positive'

In [118]:
naiveBayesSentimentCalculator("above average movie, good")

'negative'

Note: "above average movie, good" is classified as negative even though "good" is a positive word — a misclassification.

In [120]:
naiveBayesSentimentCalculator("One of the best movies ever. Props to Coppola for directing this amazing masterpiece . Marlon Brando and Al Pacino are the standout stars of this movie that make it interesting. The plot is suspenseful and interesting. There are many scenes in this movie that show you just how powerful the Corleones are and reminds you not to mess with the Mafia. It also ties up the importance of family , love while also being a gangster film.")

'positive'

In [121]:
naiveBayesSentimentCalculator("This movie was gravely disappointing. If you’re scrolling through Apple TV during quarantine wondering whether to buy it or not? Don’t! It’s a let down with a poor story line and jokes that fall flat and waste of money. The trailer paints a witty, intelligently strung stylised story but what you get instead is vulgar sexual jokes (which aren’t funny) and an array of side characters with out of context one liners. We turned it off when one of the characters tells another “you smell so nice and fresh! Almost like a fresh thermometer before it enters your butt”. Plain stupid, vulgar and tasteless. Save your money and watch one of the older chick flicks if you wish. I truly believe that having men write, produce and direct this movie is where it went wrong! No woman would ever agree to such a vulgar and tasteless script.")

'negative'

The classifier works well on lengthy reviews but is unreliable on one-liners with very few words.

In [131]:
# Save Classifier
# The context manager closes (and therefore flushes) the file as soon as the
# dump has finished, even if pickling raises.
with open("nbSentiClassifier.pickle", "wb") as movRevNBClassifier:
    pickle.dump(trainedNBClassifier, movRevNBClassifier)