# Text Classification

In [6]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = []

# documents = [(list(movie_reviews.words(fileid)), category)
#              for category in movie_reviews.categories()
#              for fileid in movie_reviews.fileids(category)]

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)

print(documents[1])

(['i', 'find', 'most', 'of', 'television', 'so', 'intensely', 'boring', 'that', 'i', 'simply', 'never', 'turn', 'on', 'my', 'set', ',', 'unless', 'i', "'", 'm', 'watching', 'a', 'movie', '.', 'i', 'don', "'", 't', 'even', 'have', 'cable', ',', 'so', 'i', 'went', 'to', 'radio', 'shack', 'to', 'buy', 'an', 'antenna', 'specifically', 'for', 'the', 'purpose', 'of', 'watching', '"', 'the', 'x', '-', 'files', '"', 'every', 'sunday', 'night', '.', 'it', "'", 's', 'the', 'only', 'show', 'that', "'", 's', 'worth', 'an', 'hour', 'of', 'my', 'time', 'each', 'week', '(', 'though', ',', 'since', 'i', 'don', "'", 't', 'watch', 'reruns', ',', 'i', "'", 'm', 'glad', 'that', 'i', 'have', 'six', 'months', 'of', 'the', 'year', 'to', 'avoid', 'television', 'altogether', ')', '.', 'i', 'am', 'an', 'avid', 'fan', 'of', 'the', 'show', ',', 'and', 'have', 'been', 'for', 'about', 'three', 'years', 'now', '.', 'and', 'i', 'love', 'gillian', 'anderson', '.', 'the', 'x', '-', 'files', 'is', 'the', 'film', 'that',

In [7]:
all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print(all_words.most_common(15)) # 15 most common words in all movie reviews

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [8]:
print(all_words["stupid"])

253


# Words as Feature for Learning

In [12]:
import nltk
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [17]:
all_words

FreqDist({',': 77717, 'the': 76529, '.': 65876, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, "'": 30585, 'is': 25195, 'in': 21822, ...})

In [20]:
word_features[0:20]

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an']

In [13]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) # Example printing

featuresets = [(find_features(rev), category) for (rev, category) in documents]



In [27]:
len(featuresets[0][0])

3000

# Naive Bayes

In [29]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

Naive Bayes Algo Accuracy:  78.0
Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                 idiotic = True              neg : pos    =      9.2 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                  turkey = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      7.7 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
                obstacle = True              pos : neg    =      6.4 : 1.0
                 singers = True              pos : neg    =      6.4 : 1.0
                 kidding = True              neg : pos    =      6.3 : 1.0
                    mena = True          

# Saving Classifier with Pickle

In [30]:
import pickle

save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [31]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f) # We are loading our classifier with pickle, rather than training it again.
classifier_f.close()

print("Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

Naive Bayes Algo Accuracy:  78.0
Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                 idiotic = True              neg : pos    =      9.2 : 1.0
                  annual = True              pos : neg    =      9.0 : 1.0
                  turkey = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      7.7 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
                obstacle = True              pos : neg    =      6.4 : 1.0
                 singers = True              pos : neg    =      6.4 : 1.0
                 kidding = True              neg : pos    =      6.3 : 1.0
                    mena = True          

# Sci-kit Learn Incorporation

In [5]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# Original

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

# Multinomial

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# BernoulliNB

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

print("BernoulliNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

# LogisticRegression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_classifier Algo Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

# SGDClassifier

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier Algo Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

# SVC

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)

print("SVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(SVC_classifier, testing_set)) * 100)

# LinearSVC

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# NuSVC

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

Original Naive Bayes Algo Accuracy:  76.0
Most Informative Features
              schumacher = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =      9.8 : 1.0
                  annual = True              pos : neg    =      9.7 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  sexist = True              neg : pos    =      7.0 : 1.0
             silverstone = True              neg : pos    =      7.0 : 1.0
               atrocious = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                 kidding = True              neg : pos    =      6.3 : 1.0
                    mena = True 

# Combining algos with a vote

In [7]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            return mode(votes)

    def confidence(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# Original

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

# Multinomial

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# BernoulliNB

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

print("BernoulliNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

# LogisticRegression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_classifier Algo Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

# SGDClassifier

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier Algo Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

# LinearSVC

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# NuSVC

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

## Voted Classifier

voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier)

print("Voted_classifier Algo Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

print("Classfication: ",voted_classifier.classify(testing_set[0][0]), "Confidence %: ", voted_classifier.confidence(testing_set[0][0])*100)
print("Classfication: ",voted_classifier.classify(testing_set[1][0]), "Confidence %: ", voted_classifier.confidence(testing_set[1][0])*100)
print("Classfication: ",voted_classifier.classify(testing_set[2][0]), "Confidence %: ", voted_classifier.confidence(testing_set[2][0])*100)
print("Classfication: ",voted_classifier.classify(testing_set[3][0]), "Confidence %: ", voted_classifier.confidence(testing_set[3][0])*100)
print("Classfication: ",voted_classifier.classify(testing_set[4][0]), "Confidence %: ", voted_classifier.confidence(testing_set[4][0])*100)
print("Classfication: ",voted_classifier.classify(testing_set[5][0]), "Confidence %: ", voted_classifier.confidence(testing_set[5][0])*100)

Original Naive Bayes Algo Accuracy:  76.0
MNB_classifier Algo Accuracy:  78.0
BernoulliNB_classifier Algo Accuracy:  75.0
LogisticRegression_classifier Algo Accuracy:  71.0
SGDClassifier_classifier Algo Accuracy:  72.0
LinearSVC_classifier Algo Accuracy:  72.0
NuSVC_classifier Algo Accuracy:  75.0
Voted_classifier Algo Accuracy:  76.0
Classfication:  neg Confidence %:  71.42857142857143
Classfication:  neg Confidence %:  100.0
Classfication:  neg Confidence %:  57.14285714285714
Classfication:  pos Confidence %:  100.0
Classfication:  neg Confidence %:  100.0
Classfication:  pos Confidence %:  100.0


# Investigating Bias

In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            return mode(votes)

    def confidence(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# positive data --------------------- Checking result for Positive Reviews Data
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# Original

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

# Multinomial

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# BernoulliNB

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

print("BernoulliNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

# LogisticRegression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_classifier Algo Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

# SGDClassifier

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier Algo Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

# LinearSVC

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# NuSVC

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

## Voted Classifier

voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier)

print("Voted_classifier Algo Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

Original Naive Bayes Algo Accuracy:  77.0
MNB_classifier Algo Accuracy:  77.0
BernoulliNB_classifier Algo Accuracy:  77.0
LogisticRegression_classifier Algo Accuracy:  80.0
SGDClassifier_classifier Algo Accuracy:  82.0
LinearSVC_classifier Algo Accuracy:  81.0
NuSVC_classifier Algo Accuracy:  83.0
Voted_classifier Algo Accuracy:  77.0


In [4]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            return mode(votes)

    def confidence(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

# negative data --------------------- Checking result for Negative Reviews Data
training_set = featuresets[100:]
testing_set = featuresets[:100]

# Original

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

# Multinomial

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# BernoulliNB

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

print("BernoulliNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

# LogisticRegression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_classifier Algo Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

# SGDClassifier

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier Algo Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

# LinearSVC

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# NuSVC

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

## Voted Classifier

voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier)

print("Voted_classifier Algo Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

Original Naive Bayes Algo Accuracy:  79.0
MNB_classifier Algo Accuracy:  80.0
BernoulliNB_classifier Algo Accuracy:  80.0
LogisticRegression_classifier Algo Accuracy:  73.0
SGDClassifier_classifier Algo Accuracy:  70.0
LinearSVC_classifier Algo Accuracy:  72.0
NuSVC_classifier Algo Accuracy:  77.0
Voted_classifier Algo Accuracy:  79.0


# Better Training Data

In [16]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            return mode(votes)

    def confidence(self, features):
        votes = []

        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

short_pos = open("short_reviews/positive.txt","r").read()
short_neg = open("short_reviews/negative.txt","r").read()

documents = []

for r in short_pos.split('\n'):
    documents.append( (r, "pos") )

for r in short_neg.split('\n'):
    documents.append( (r, "neg") )

all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())

for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)

training_set = featuresets[:10000]
testing_set = featuresets[10000:]

# Original

classifier = nltk.NaiveBayesClassifier.train(training_set)

print("Original Naive Bayes Algo Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

# Multinomial

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

print("MNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)) * 100)

# BernoulliNB

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

print("BernoulliNB_classifier Algo Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)) * 100)

# LogisticRegression

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)

print("LogisticRegression_classifier Algo Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)) * 100)

# SGDClassifier

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)

print("SGDClassifier_classifier Algo Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)) * 100)

# LinearSVC

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)

print("LinearSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)) * 100)

# NuSVC

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)

print("NuSVC_classifier Algo Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)) * 100)

## Voted Classifier

voted_classifier = VoteClassifier(classifier, MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier, SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier)

print("Voted_classifier Algo Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf3 in position 4645: invalid continuation byte