In [1]:
# Sentiment Analysis
import nltk
from nltk.corpus import movie_reviews

In [None]:
dir(movie_reviews) #categories, words and fileids

In [2]:
# list of words for all files in movie_reviews with category
documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

In [3]:
import random

random.shuffle(documents)
# print(documents[1][0])

In [4]:
from nltk.probability import FreqDist #FreqDist arranges the words in descendng order of frequently appeared words.
all_words = []

for word in movie_reviews.words():
    all_words.append(word.lower())
    
all_words = FreqDist(all_words)
word_features = list(all_words)[:3000] #Most frequent words in movie_words

In [5]:
# Relation of documents with features(if present or not)
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [7]:
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

#Prepare dataset
featuresets=[]
for (rev, category) in documents:
    featuresets.append((find_features(rev), category))
    
# print(len(featuresets))

In [8]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:] #100 sets

In [9]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [10]:
import pickle

#Dump the classifier
classifier_dump = open('dump/classifier.pickle', 'wb')
new_classifier = pickle.dump(classifier, classifier_dump)
classifier_dump.close()

#Load the classifier
classifier_f = open('dump/classifier.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

In [11]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
# classifier.show_most_informative_features(15)

Classifier accuracy percent: 82.0


In [12]:
#SklearnClassifier API
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from nltk.classify.scikitlearn import SklearnClassifier

In [13]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set)*100)

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set)*100)

MultinomialNB accuracy percent: 84.0
BernoulliNB accuracy percent: 82.0


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 85.0
SVC_classifier accuracy percent: 81.0
LinearSVC_classifier accuracy percent: 81.0
NuSVC_classifier accuracy percent: 86.0


In [20]:
from nltk.classify import ClassifierI
from statistics import mode #check for most frequent

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        print("Total votes:" , votes)
        print("Votes: ", mode(votes))    
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
    
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BNB_classifier,
                                  LogisticRegression_classifier)

# print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
# print(testing_set[0][0])
print("Classification:", voted_classifier.classify(testing_set[0][0]))
# print("Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)

Total votes: ['neg', 'pos', 'neg', 'neg', 'neg', 'neg']
Votes:  neg
Classification: neg
