In [59]:
import nltk
import pickle
import random

from statistics import mode

from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
# documents = [(list(movie_reviews.words(fileid)), category)
#             for category in movie_reviews.categories()
#             for fileid in movie_reviews.fileids(category)]
# dir(movie_reviews)
# movie_reviews.fileids('neg')

## Creating Training Datasets 

In [32]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))

In [33]:
random.shuffle(documents)

In [34]:
len(documents)

2000

In [80]:
all_words = []
for w in movie_reviews.words():
    # use this if condition as a filter to get rid of stop words and punctuation for better accuracy
    if len(w.lower()) > 3: 
        all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[('that', 15924), ('with', 10792), ('this', 9578), ('film', 9517), ('movie', 5771), ('from', 4999), ('have', 4901), ('they', 4825), ('there', 3770), ('like', 3690), ('about', 3523), ('more', 3347), ('what', 3322), ('when', 3258), ('which', 3161)]


In [85]:
print(all_words['shit'])
print(len(all_words))

26
38191


In [86]:
word_features = list(all_words.keys())[:3000]
# word_features

In [87]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words) # boolean
    return features

In [88]:
# find_features(movie_reviews.words('neg/cv000_29416.txt'))

In [89]:
featuresets = [(find_features(rev), category) 
               for (rev, category) in documents]

In [90]:
featuresets[0]

({'plot': False,
  'teen': False,
  'couples': False,
  'church': False,
  'party': False,
  'drink': False,
  'then': False,
  'drive': False,
  'they': True,
  'into': True,
  'accident': False,
  'guys': False,
  'dies': False,
  'girlfriend': False,
  'continues': False,
  'life': False,
  'nightmares': False,
  'what': True,
  'deal': False,
  'watch': False,
  'movie': True,
  'sorta': False,
  'find': False,
  'critique': False,
  'mind': False,
  'fuck': False,
  'generation': True,
  'that': True,
  'touches': False,
  'very': False,
  'cool': False,
  'idea': True,
  'presents': False,
  'package': False,
  'which': True,
  'makes': False,
  'this': True,
  'review': False,
  'even': True,
  'harder': False,
  'write': False,
  'since': False,
  'generally': False,
  'applaud': False,
  'films': True,
  'attempt': False,
  'break': False,
  'mold': False,
  'mess': False,
  'with': True,
  'your': False,
  'head': False,
  'such': True,
  'lost': False,
  'highway': False,
  

In [91]:
train_set = featuresets[:1900]
test_set = featuresets[1900:]

## Training Process 

In [103]:
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print('accuracy: ', nltk.classify.accuracy(classifier, test_set))

In [107]:
# classifier.show_most_informative_features(10)

## Save Trained Classifier

In [108]:
# save_classifier = open('NaiveBayes.pickle','wb')# write in bytes
# pickle.dump(classifier, save_classifier)
# save_classifier.close()

In [109]:
classifier_exist = open('NaiveBayes.pickle','rb')
ori_classifier = pickle.load(classifier_exist)
classifier_exist.close()

In [110]:
print('Ori_NB: ', nltk.classify.accuracy(ori_classifier, test_set))

Ori_NB:  0.77


## Scikit-learn + NLTK 

In [111]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print('Multi_NB:',nltk.classify.accuracy(MNB_classifier, test_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(train_set)
print('Ber_NB:',nltk.classify.accuracy(BNB_classifier, test_set))

LogReg_classifier = SklearnClassifier(LogisticRegression())
LogReg_classifier.train(training_set)
print("Log_Regression:", (nltk.classify.accuracy(LogReg_classifier, test_set)))

SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(training_set)
print("SGD_classifier:", (nltk.classify.accuracy(SGD_classifier, test_set)))

Multi_NB: 0.78
Ber_NB: 0.77
Log_Regression: 0.98




SGD_classifier: 0.95


## Combined Classifier with Voting System

In [112]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf_score = choice_votes/len(votes)
        return conf_score

In [113]:
# the no. of classifiers should be odd number, otherwise 
# the statistics.mode might return error since it only return one value.
voted_classifier = VoteClassifier(ori_classifier,
                                 MNB_classifier,
                                 BNB_classifier,
                                 LogReg_classifier,
                                 SGD_classifier)

In [114]:
print('votes_classifier accuracy:', nltk.classify.accuracy(voted_classifier, test_set))

votes_classifier accuracy: 0.8


In [115]:
print("Classification:", voted_classifier.classify(test_set[23][0]), "Confidence %:",voted_classifier.confidence(test_set[23][0]))

Classification: pos Confidence %: 1.0
