# In this chapter, Bya will cover the following recipes:
1. Bag of words feature extraction
2. Training a Naive Bayes classifier
3. Training a decision tree classifier
4. Training a maximum entropy classifier 
5. Training scikit-learn classifiers
6. Measuring precision and recall of a classifier 
7. Calculating high information words
8. Combining classifiers with voting
9. Classifying with multiple binary classifiers
10. Training a classifier with NLTK-Trainer

# 1. Bag of words feature extraction

### `bag_of_words(words)`

In [1]:
# inside 'featx.py'

def bag_of_words(words):
    return dict([(word, True) for word in words])

In [4]:
# from featx import bag_of_words
bag_of_words(['the', 'quick', 'brown', 'fox'])

{'brown': True, 'fox': True, 'quick': True, 'the': True}

### `bag_of_words_not_in_set(words, badwords)`

In [5]:
# inside 'featx.py'

def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

In [6]:
# example

# from featx import bag_of_words_not_in_set
bag_of_words_not_in_set(['the', 'quick', 'brown', 'fox'], ['the'])

{'brown': True, 'fox': True, 'quick': True}

### Filtering Stopwords

In [3]:
# inside 'featx.py'

from nltk.corpus import stopwords

def bag_of_non_stopwords(words, stopfile='english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)

In [8]:
# example

# from featx import bag_of_non_stopwords
bag_of_non_stopwords(['the', 'quick', 'brown', 'fox'])

{'brown': True, 'fox': True, 'quick': True}

### Including significant bigrams

In [10]:
# inside 'featx.py'

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bag_of_words(words + bigrams)

In [11]:
# example

# from featx import bag_of_bigrams_words
bag_of_bigrams_words(['the', 'quick', 'brown', 'fox'])

{'quick': True,
 'fox': True,
 'the': True,
 'brown': True,
 ('quick', 'brown'): True,
 ('the', 'quick'): True,
 ('brown', 'fox'): True}

# 2. Training a Naive Bayes classifier

### `label_feats_from_corpus`

In [12]:
# inside 'featx.py'
import collections

def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats

In [4]:
# example:
from nltk.corpus import movie_reviews
from nltk.classify.util import accuracy
from featx import label_feats_from_corpus, split_label_feats
from featx import bag_of_words, bag_of_non_stopwords, bag_of_words_not_in_set


print(movie_reviews.categories())

lfeats = label_feats_from_corpus(movie_reviews, feature_detector=bag_of_non_stopwords)
lfeats.keys()

['neg', 'pos']


dict_keys(['neg', 'pos'])

### `split_label_feats`

In [15]:
# inside 'featx.py'
def split_label_feats(lfeats, split=0.75):
    train_feats = []
    test_feats = []
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend([(feat, label) for feat in feats[:cutoff]])
        test_feats.extend([(feat, label) for feat in feats[cutoff:]])
    return train_feats, test_feats

In [2]:
# Spliting data features

train_feats, test_feats = split_label_feats(lfeats, split=0.75)
print(len(train_feats))
print(len(test_feats))

1500
500


### Train NaiveBayesClassifier

In [16]:
# import NaiveBayesClassifier
from nltk.classify import NaiveBayesClassifier

# train NaiveBayesClassifier
nb_classifier = NaiveBayesClassifier.train(train_feats)
nb_classifier.labels()

['neg', 'pos']

### `classify()`

In [64]:
from featx import bag_of_words

negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
nb_classifier.classify(negfeat)

'neg'

In [65]:
posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
nb_classifier.classify(posfeat)

'pos'

### `accuracy()`

In [46]:
from nltk.classify.util import accuracy
accuracy(nb_classifier, test_feats)

0.728

In [66]:
from nltk.classify.util import accuracy
accuracy(nb_classifier, test_feats)

0.726

### Classification Probability

In [48]:
probs = nb_classifier.prob_classify(test_feats[0][0])
probs.samples()

dict_keys(['pos', 'neg'])

In [49]:
probs.max()

'pos'

In [51]:
probs.prob('pos')

1.0

In [50]:
probs.prob('neg')

1.744195869102063e-21

### Most Informative features

In [54]:
nb_classifier.most_informative_features(n=5)

[('magnificent', True),
 ('outstanding', True),
 ('insulting', True),
 ('vulnerable', True),
 ('ludicrous', True)]

In [56]:
print(nb_classifier.show_most_informative_features(n=5))

Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
None


### Training estimator

In [57]:
from nltk.probability import LaplaceProbDist
nb_classifier = NaiveBayesClassifier.train(train_feats, 
estimator=LaplaceProbDist)
accuracy(nb_classifier, test_feats)

0.716

### Manual Training

In [60]:
from nltk.probability import DictionaryProbDist
label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})
true_probdist = DictionaryProbDist({True: 1})
feature_probdist = {('pos', 'yes'): true_probdist, 
                    ('neg', 'no'): true_probdist}

classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

print(classifier.classify({'yes': True}))

print(classifier.classify({'no': True}))

pos
neg


# 4. Training a maximum entropy classifier

In [7]:
from nltk.classify import MaxentClassifier
me_classifier = MaxentClassifier.train(train_feats, trace=0, max_iter=1, min_lldelta=0.5)
accuracy(me_classifier, test_feats)

      Training stopped: keyboard interrupt


NameError: name 'accuracy' is not defined

### `gis`

In [None]:
me_classifier = MaxentClassifier.train(train_feats,
algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)
accuracy(me_classifier, test_feats)

In [None]:
me_classifier.show_most_informative_features(n=4)

# 5. Training `scikit-learn` classifiers

### MultinomialNB

In [5]:
# import scikitlearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
sk_classifier = SklearnClassifier(MultinomialNB())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.826

### BernoulliNB

In [6]:
from sklearn.naive_bayes import BernoulliNB
sk_classifier = SklearnClassifier(BernoulliNB())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.808

### LogisticRegression

In [7]:
from sklearn.linear_model import LogisticRegression
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.874

### LinearSVC

In [10]:
from sklearn.svm import SVC
sk_classifier = SklearnClassifier(SVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.632

In [11]:
from sklearn.svm import LinearSVC
sk_classifier = SklearnClassifier(LinearSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.872

### NuSVC

In [13]:
from sklearn.svm import NuSVC
sk_classifier = SklearnClassifier(NuSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.872

# 6. Measuring precision and recall of a classifier

* PRECISION: False Positive
* RECALL: False Negative

In [19]:
from classification import precision_recall
nb_precisions, nb_recalls = precision_recall(nb_classifier, test_feats)

AttributeError: 'module' object has no attribute 'precision'