In [1]:
import nltk
from nltk.tokenize import word_tokenize
import random

In [2]:
positive_reviews=open('./reviews_dataset/positive.txt','r').read()
negative_reviews=open('./reviews_dataset/negative.txt','r').read()

In [3]:
documents=[]
for r in positive_reviews.split('\n'):
    documents.append((r,'pos'))
for r in negative_reviews.split('\n'):
    documents.append((r,'neg'))

In [4]:
all_words=[]
for w in word_tokenize(positive_reviews):
    all_words.append(w.lower())
for w in word_tokenize(negative_reviews):
    all_words.append(w.lower())
    
all_words=nltk.FreqDist(all_words)

# Taking the most common words
word_features=list(all_words.keys())[:3000]

def features_set(document):
    words=word_tokenize(document)
    features={}
    for w in word_features:
        features[w]=(w in words)
    return features
featuresets=[(features_set(rev),category) for (rev, category) in documents]
random.shuffle(featuresets)

In [5]:
len(documents)

10664

In [6]:
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode, mean

class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self._classifiers=classifiers
    
    def classify(self,features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        return mode(votes)
            
        
    def confidence(self,features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
            
        choice_votes=votes.count(mode(votes))
        conf=choice_votes/len(votes)

In [7]:
training_set=featuresets[:10000]
testing_set=featuresets[10000:]

MNB_classifier=SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


voted_classifier = VoteClassifier(
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier, SGDClassifier_classifier)

MNB_classifier accuracy percent: 73.79518072289156
BernoulliNB_classifier accuracy percent: 73.94578313253012


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression_classifier accuracy percent: 75.75301204819277
SGDClassifier_classifier accuracy percent: 73.19277108433735
LinearSVC_classifier accuracy percent: 72.13855421686746
NuSVC_classifier accuracy percent: 75.30120481927712


In [10]:
classifier=nltk.NaiveBayesClassifier.train(training_set)
print('Original Naive Bayes classifier:',(nltk.classify.accuracy(classifier,testing_set))*100)

Original Naive Bayes classifier: 73.64457831325302


In [9]:
tex='hello how are you'
s=word_tokenize(tex)
ps=nltk.pos_tag(s)
print(ps)

[('hello', 'VB'), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP')]


In [11]:
import pickle

save_word_features = open("./NLP_prac4_pickles/word_features5k.pickle","wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

save_featuresets = open("./NLP_prac4_pickles/featuresets5k.pickle","wb")
pickle.dump(featuresets, save_featuresets)
save_word_features.close()

save_documents = open("./NLP_prac4_pickles/documents.pickle","wb")
pickle.dump(documents, save_documents)
save_documents.close()

save_classifier = open("./NLP_prac4_pickles/originalnaivebayes5k.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/MNB_classifier5k.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/BernoulliNB_classifier5k.pickle","wb")
pickle.dump(BernoulliNB_classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/LogisticRegression_classifier5k.pickle","wb")
pickle.dump(LogisticRegression_classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/LinearSVC_classifier5k.pickle","wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/SGDClassifier_classifier5k.pickle","wb")
pickle.dump(SGDClassifier_classifier, save_classifier)
save_classifier.close()

save_classifier = open("./NLP_prac4_pickles/NuSVC_classifier5k.pickle","wb")
pickle.dump(NuSVC_classifier, save_classifier)
save_classifier.close()