#### Reading the reviews from files

In [1]:
negativeReviewFile = '/home/bhavana/Desktop/rt-polaritydata/rt-polarity.neg'

In [2]:
with open(negativeReviewFile,'r',errors='ignore') as f:
    negativeReviews = f.readlines()

In [3]:
positiveReviewFile = '/home/bhavana/Desktop/rt-polaritydata/rt-polarity.pos'

In [4]:
with open(positiveReviewFile,'r',errors='ignore') as f:
    positiveReviews = f.readlines()

In [5]:
positiveReviews[0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n'

We now split the data into training set and test data. First 2500 records are used for training and next 2500 records are used for testing. 

In [6]:
split = 2500

In [7]:
trainPositiveReviews = positiveReviews[:split]
trainNegativeReviews = negativeReviews[:split]

In [8]:
trainPositiveReviews[0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . \n'

In [9]:
testPositiveReviews = positiveReviews[split+1:]
testNegativeReviews = negativeReviews[split+1:]

#### Defining vocabulary

In [10]:
def getVocabulary():
    positiveWordList = [word for line in trainPositiveReviews for word in line.split()]
    negativeWordList = [word for line in trainNegativeReviews for word in line.split()]
    allWordList = [item for sublist in [positiveWordList,negativeWordList] for item in sublist]
    allWordSet = list(set(allWordList))
    vocabulary = allWordSet
    return vocabulary

In [11]:
vocabulary = getVocabulary()
len(vocabulary)

14089

In [49]:
vocabulary[0]

'disingenuous'

In [12]:
'jean' in vocabulary

True

In [13]:
# This function is used as function object in the training step
def extract_features(review):
    review_words = set(review)
    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
    return features

#### Setting up training data

In [14]:
def getTrainingData():
    negTaggedNegativeReviews = [{'review': oneReview.split(),'label': 'negative'} for oneReview in trainNegativeReviews]
    posTaggedPositiveReviews = [{'review': oneReview.split(),'label': 'positive'} for oneReview in trainPositiveReviews]
    fullTaggedReviews = [item for sublist in [posTaggedPositiveReviews,negTaggedNegativeReviews] for item in sublist]
    trainingData = [(review['review'],review['label']) for review in fullTaggedReviews]
    return trainingData

In [15]:
trainingData = getTrainingData()

In [16]:
trainingData[2500][0] #Here we get the review and label

['simplistic', ',', 'silly', 'and', 'tedious', '.']

In [17]:
trainingData[2800][1] #Here we get only label

'negative'

In [18]:
len(trainingData)

5000

In [19]:
import nltk

In [20]:
def getTrainedNBClassifier(extract_features, trainingData):
    trainingFeatures = nltk.classify.apply_features(extract_features,trainingData) # Training data is converted into feature vector 
    Classifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
    return Classifier

In [21]:
Classifier = getTrainedNBClassifier(extract_features, trainingData)

In [22]:
def naiveBayesSentimentAnalyzer(review):
    problemInstance = list(review.split())
    problemFeatures = extract_features(problemInstance)
    return Classifier.classify(problemFeatures)

In [23]:
naiveBayesSentimentAnalyzer("What an amazing movie!!")

'positive'

In [24]:
naiveBayesSentimentAnalyzer("What a terrible movie")

'negative'

In [25]:
def getTestReviewSentiments(naiveBayesSentimentAnalyzer):
    testNegResults = [naiveBayesSentimentAnalyzer(review) for review in testNegativeReviews]
    testPosResults = [naiveBayesSentimentAnalyzer(review) for review in testPositiveReviews]
    labelToNum = {'positive':1,'negative':-1}
    numericNegResults = [labelToNum[x] for x in testNegResults]
    numericPosResults = [labelToNum[x] for x in testPosResults]
    return {'results-on-positive':numericPosResults, 'results-on-negative':numericNegResults}


In [27]:
def runDiagnostics(reviewResult):
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    numTruePositive = sum(x > 0 for x in positiveReviewsResult)
    numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
    TruePositive = float(numTruePositive)/len(positiveReviewsResult)
    TrueNegative = float(numTrueNegative)/len(negativeReviewsResult)
    totalAccurate = numTrueNegative + numTruePositive
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    print("Accuracy on positive reviews: "+"%.2f" % (TruePositive*100)+"%")
    print("Accuracy on negative reviews: "+"%.2f" % (TrueNegative*100)+"%")
    print("Overall accuracy: "+"%.2f" % (totalAccurate*100/total) + "%")

In [28]:
runDiagnostics(getTestReviewSentiments(naiveBayesSentimentAnalyzer))

Accuracy on positive reviews: 73.53%
Accuracy on negative reviews: 77.07%
Overall accuracy: 75.30%
