# Text Classification using NLTK 

In [16]:
import nltk
from nltk.corpus import movie_reviews
import random

In [6]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [101]:
random.shuffle(documents)

In [23]:
len(documents)

2000

In [24]:
movie_reviews.categories()

['neg', 'pos']

In [37]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words_list = all_words

In [41]:
all_words = nltk.FreqDist(all_words)
all_words.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [42]:
type(all_words)

nltk.probability.FreqDist

In [43]:
len(all_words)

39768

In [44]:
len(list(all_words.keys()))

39768

In [49]:
word_features = list(all_words.keys())[:3000] #train for only top 3000 words

In [47]:
list(all_words.keys())[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [46]:
all_words_list[:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [51]:
def find_features(document):
    words = set(document)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features
        

In [None]:
print((find_features(movie_reviews.words('neg\cv000_29416.txt'))))

In [102]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [55]:
len(featuresets)

2000

In [103]:
training_set = featuresets[:1800]
testing_set = featuresets[1800:]

In [104]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [105]:
'accuracy',nltk.classify.accuracy(classifier, testing_set)*100 
# accuracies vary by running the code again and again as randon.shuffle changes the everytime

('accuracy', 80.5)

In [106]:
'most informative features'
classifier.show_most_informative_features(15)

Most Informative Features
               atrocious = True              neg : pos    =     11.7 : 1.0
                   sucks = True              neg : pos    =     10.3 : 1.0
                     ugh = True              neg : pos    =      9.7 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                bothered = True              neg : pos    =      8.4 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
                 frances = True              pos : neg    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.4 : 1.0
                  shoddy = True              neg : pos    =      6.4 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0
                   groan = True              neg : pos    =      6.4 : 1.0

# Using Scikit Learn for above problem

In [107]:
from nltk.classify.scikitlearn import SklearnClassifier # wrapper to include scikit learn classifier witin NLTK

In [108]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [110]:
MN_classifier = SklearnClassifier(MultinomialNB())
MN_classifier.train(training_set)
'MN accuracy',nltk.classify.accuracy(MN_classifier, testing_set)*100

('MN accuracy', 81.0)

In [111]:
BR_classifier = SklearnClassifier(BernoulliNB())
BR_classifier.train(training_set)
'BR accuracy',nltk.classify.accuracy(BR_classifier, testing_set)*100

('BR accuracy', 80.5)

In [70]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [112]:
LR_classifier = SklearnClassifier(LogisticRegression())
LR_classifier.train(training_set)
'LR accuracy',nltk.classify.accuracy(LR_classifier, testing_set)*100

('LR accuracy', 80.0)

In [141]:
# SGD_classifier = SklearnClassifier(SGDClassifier())
# SGD_classifier.train(training_set)
# 'SGD accuracy',nltk.classify.accuracy(SGD_classifier, testing_set)*100

In [114]:
NSV_classifier = SklearnClassifier(NuSVC())
NSV_classifier.train(training_set)
'NSV accuracy',nltk.classify.accuracy(NSV_classifier, testing_set)*100

('NSV accuracy', 81.0)

    Combining algos with a voting system

In [116]:
from nltk.classify import ClassifierI
from statistics import mode

In [135]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            votes.append(c.classify(features))
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            votes.append(c.classify(features))
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
    
        

In [142]:
voted_classifier = VoteClassifier(classifier, MN_classifier, LR_classifier, BR_classifier, NSV_classifier)

In [143]:
'voted accuracy',nltk.classify.accuracy(voted_classifier, testing_set)*100

('voted accuracy', 81.5)

        In case the above 2 cells seem confusing (see the below link)
        as we can see we have to define our own classify function and rest all the things will work as before .. 
        i still didnt get it completely

### [link](http://www.nltk.org/api/nltk.classify.html)

In [139]:
'classified for training set[0][0]',voted_classifier.classify(testing_set[50][0])

('classified for training set[0][0]', 'neg')

In [140]:
'confidence for training set[0][0]',voted_classifier.confidence(testing_set[50][0])

('confidence for training set[0][0]', 0.6)