In [1]:
import nltk
from nltk.corpus import movie_reviews
from nltk.classify import ClassifierI, SklearnClassifier, accuracy

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC

import random
import pickle 

from statistics import mode

In [2]:
random.seed(42)

Let's define a class to combine the algorithms to have a vote

In [3]:
class VoteClassifier(ClassifierI): #Inherit properties 
    
    def __init__(self, *classifiers): 
        self.classifiers = classifiers
        
    def classify(self, features):
        votes = []  # Votes from different classifiers will be stored
        for c in self.classifiers: # Iterating through each classifier
            v = c.classify(features) # Getting the vote for current classifier
            votes.append(v) # Appending the current vote to list
        return mode(votes) # Returning the category with most votes
    
    def confidence(self, features):
        votes = []
        for c in self.classifiers:
            v = c.classify(features)
            votes.append(v)
        vote_count = votes.count(mode(votes)) # Counting the number of occurances of mode in the votes list
        conf = vote_count/len(votes) # Calculating confiednce
        return conf 

In [4]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [5]:
random.shuffle(documents)

In [6]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

feature_words = list(all_words.keys())[:4000]

In [7]:
def create_features(document):
    words = set(document)
    features = {}
    for w in feature_words:
        features[w] = (w in words)
    return features    

In [8]:
featureset = [(create_features(doc), category) 
              for (doc, category) in documents ]

In [9]:
training_set = featureset[:1900]
testing_set = featureset[1900:]

In [10]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print('acc:', nltk.classify.accuracy(classifier, testing_set)*100)

acc: 75.0


In [11]:
LogisticRegression_Classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_Classifier.train(training_set)
print('LogisticRegression_Classifier accuracy:', accuracy(LogisticRegression_Classifier, testing_set)*100)

LogisticRegression_Classifier accuracy: 81.0


In [12]:
SGDClassifier_Classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_Classifier.train(training_set)
print('SGDClassifier_Classifier accuracy:', accuracy(SGDClassifier_Classifier, testing_set)*100)



SGDClassifier_Classifier accuracy: 79.0


In [13]:
BernoulliNB_Classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_Classifier.train(training_set)
print('BernoulliNB_Classifier accuracy:', accuracy(BernoulliNB_Classifier, testing_set)*100)

BernoulliNB_Classifier accuracy: 76.0


In [14]:
MultinomialNB_Classifier = SklearnClassifier(MultinomialNB())
MultinomialNB_Classifier.train(training_set)
print('MultinomialNB_Classifier accuracy:', accuracy(MultinomialNB_Classifier, testing_set)*100)

MultinomialNB_Classifier accuracy: 77.0


In [15]:
LinearSVC_Classifier = SklearnClassifier(LinearSVC())
LinearSVC_Classifier.train(training_set)
print('LinearSVC_Classifier accuracy:', accuracy(LinearSVC_Classifier, testing_set)*100)

LinearSVC_Classifier accuracy: 77.0


In [16]:
NuSVC_Classifier = SklearnClassifier(NuSVC())
NuSVC_Classifier.train(training_set)
print('NuSVC_Classifier accuracy:', accuracy(NuSVC_Classifier, testing_set)*100)

NuSVC_Classifier accuracy: 83.0


Next, let's use the voting class VoteClassifier as defined above.

In [17]:
voted_classifier = VoteClassifier(classifier,
                                 LogisticRegression_Classifier,
                                 SGDClassifier_Classifier,
                                 BernoulliNB_Classifier,
                                 MultinomialNB_Classifier,
                                 LinearSVC_Classifier,
                                 NuSVC_Classifier)

In [18]:
print('Voted_classifier accuracy:', accuracy(voted_classifier, testing_set)*100)

Voted_classifier accuracy: 75.0


In [19]:
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[0][0]),
                                                  voted_classifier.confidence(testing_set[0][0])*100))

Classification: neg, Confidence %: 100.0


In [20]:
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[1][0]),
                                                  voted_classifier.confidence(testing_set[1][0])*100))
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[2][0]),
                                                  voted_classifier.confidence(testing_set[2][0])*100))
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[3][0]),
                                                  voted_classifier.confidence(testing_set[3][0])*100))
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[4][0]),
                                                  voted_classifier.confidence(testing_set[4][0])*100))
print('Classification: {}, Confidence %: {}'.format(voted_classifier.classify(testing_set[5][0]),
                                                  voted_classifier.confidence(testing_set[5][0])*100))

Classification: pos, Confidence %: 57.14285714285714
Classification: neg, Confidence %: 100.0
Classification: pos, Confidence %: 100.0
Classification: pos, Confidence %: 85.71428571428571
Classification: pos, Confidence %: 100.0
