In [2]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import ClassifierI, accuracy, SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, NuSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier

from statistics import mode

In [3]:
class VotedClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self.classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        count = votes.count(mode(votes))
        conf = count/len(votes)
        return conf
        

In [4]:
documents = [(list(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [6]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)    

In [7]:
feature_words = list(all_words.keys())[:3000]

def featurize(document):
    words = set(document)
    features = {}
    for w in feature_words:
        features[w] = (w in words)
    return features

featureset = [(featurize(doc),category)
             for (doc, category) in documents]

In [18]:
# Comment out the one not being used

# Positive Testing Set
# training_set = featureset[:1900]
# testing_set = featureset[1900:]

# Negative Testing Set
training_set = featureset[100:]
testing_set = featureset[:100]

In [19]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print('Classifier acc:', accuracy(classifier,testing_set)*100)

Classifier acc: 79.0


In [20]:
LogisticRegression_Classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_Classifier.train(training_set)
print('LogisticRegression_Classifier accuracy:', accuracy(LogisticRegression_Classifier, testing_set)*100)

SGDClassifier_Classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_Classifier.train(training_set)
print('SGDClassifier_Classifier accuracy:', accuracy(SGDClassifier_Classifier, testing_set)*100)

BernoulliNB_Classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_Classifier.train(training_set)
print('BernoulliNB_Classifier accuracy:', accuracy(BernoulliNB_Classifier, testing_set)*100)

MultinomialNB_Classifier = SklearnClassifier(MultinomialNB())
MultinomialNB_Classifier.train(training_set)
print('MultinomialNB_Classifier accuracy:', accuracy(MultinomialNB_Classifier, testing_set)*100)

LinearSVC_Classifier = SklearnClassifier(LinearSVC())
LinearSVC_Classifier.train(training_set)
print('LinearSVC_Classifier accuracy:', accuracy(LinearSVC_Classifier, testing_set)*100)

NuSVC_Classifier = SklearnClassifier(NuSVC())
NuSVC_Classifier.train(training_set)
print('NuSVC_Classifier accuracy:', accuracy(NuSVC_Classifier, testing_set)*100)

LogisticRegression_Classifier accuracy: 73.0




SGDClassifier_Classifier accuracy: 87.0
BernoulliNB_Classifier accuracy: 80.0
MultinomialNB_Classifier accuracy: 80.0
LinearSVC_Classifier accuracy: 72.0
NuSVC_Classifier accuracy: 79.0


In [21]:
voted_classifier = VotedClassifier(classifier,
                                 LogisticRegression_Classifier,
                                 SGDClassifier_Classifier,
                                 BernoulliNB_Classifier,
                                 MultinomialNB_Classifier,
                                 LinearSVC_Classifier,
                                 NuSVC_Classifier)
print('Voted_classifier accuracy:', accuracy(voted_classifier, testing_set)*100)

Voted_classifier accuracy: 86.0


Well, here both the positive as well as negative testing set is giving good performance.