# Natural Language Processing with Python
## Chapter 5 Learning to Classify Text
### 1 Supervised Classification
#### 1.1 Gender Identification

In [12]:
from nltk.corpus import names
import nltk 
import random
def ExtractNameFeature(name):
    feature = {
        'first letter':name[0],
        'last letter:':name[-1]
    }
    return feature
LabeledNames = [(i,'male') for i in names.words('male.txt')] + [(i,'female') for i in names.words('female.txt')]
random.shuffle(LabeledNames)
FeatureList = [(ExtractNameFeature(i[0]),i[1]) for i in LabeledNames]
TrainSet,TestSet = FeatureList[:7000],FeatureList[7000:]
GenderClassifier = nltk.NaiveBayesClassifier.train(TrainSet)
print(nltk.classify.accuracy(GenderClassifier,TestSet))
GenderClassifier.show_most_informative_features()

0.7680084745762712
Most Informative Features
            last letter: = 'a'            female : male   =     40.5 : 1.0
            last letter: = 'k'              male : female =     28.9 : 1.0
            last letter: = 'f'              male : female =     14.6 : 1.0
            last letter: = 'd'              male : female =     10.0 : 1.0
            last letter: = 'p'              male : female =      9.9 : 1.0
            last letter: = 'v'              male : female =      9.2 : 1.0
            last letter: = 'm'              male : female =      8.5 : 1.0
            last letter: = 'o'              male : female =      8.2 : 1.0
            last letter: = 'r'              male : female =      6.4 : 1.0
            last letter: = 'w'              male : female =      5.9 : 1.0


#### 1.2 Choosing The Right Features

In [13]:
TrainSet,DevTestSet,TestSet = LabeledNames[:6000],LabeledNames[6000:7000],LabeledNames[7000:]
GenderClassifier = nltk.NaiveBayesClassifier.train([(ExtractNameFeature(i[0]),i[1]) for i in TrainSet])
print(nltk.classify.accuracy(GenderClassifier,[(ExtractNameFeature(i[0]),i[1]) for i in DevTestSet]))
for name in DevTestSet:
    result = GenderClassifier.classify(ExtractNameFeature(name[0])) 
    if result != name[1] :
        print(name[0],'correct:',name[1],'wrong',result)

0.766
Bess correct: female wrong male
Marsh correct: male wrong female
Elizabet correct: female wrong male
Vivian correct: female wrong male
Granville correct: male wrong female
Arvin correct: male wrong female
Hedwig correct: female wrong male
Orville correct: male wrong female
Arne correct: male wrong female
Tyrone correct: male wrong female
Maurice correct: male wrong female
Kip correct: female wrong male
Sully correct: male wrong female
Clemente correct: male wrong female
Garey correct: male wrong female
Duffie correct: male wrong female
Larry correct: male wrong female
Bo correct: female wrong male
Tansy correct: female wrong male
Trixy correct: female wrong male
Robinett correct: female wrong male
Riannon correct: female wrong male
Cammy correct: male wrong female
Sinead correct: female wrong male
Addie correct: male wrong female
Lynn correct: male wrong female
Chris correct: female wrong male
Lonny correct: male wrong female
Virge correct: male wrong female
Wynn correct: female 

#### 1.3 Document Classification

In [17]:
from nltk.corpus import movie_reviews
docs = [(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(docs)
WordFeature = list(nltk.FreqDist([w.lower() for w in movie_reviews.words()]))[:2000]
def ExDocFeature(doc):
    words = set(doc)
    features = {}
    for i in WordFeature:
        features['contains {}'.format(i)] = i in words
    return features
DocFeaList = [(ExDocFeature(doc),c) for (doc,c) in docs]
TrainSet,TestSet = DocFeaList[:1800],DocFeaList[1800:]
DocClassifier = nltk.NaiveBayesClassifier.train(TrainSet)
print(nltk.classify.accuracy(DocClassifier,TestSet))
DocClassifier.most_informative_features()

0.81


[('contains outstanding', True),
 ('contains mulan', True),
 ('contains wonderfully', True),
 ('contains seagal', True),
 ('contains damon', True),
 ('contains poorly', True),
 ('contains flynt', True),
 ('contains ridiculous', True),
 ('contains lame', True),
 ('contains wasted', True),
 ('contains awful', True),
 ('contains waste', True),
 ('contains era', True),
 ('contains pointless', True),
 ('contains sandler', True),
 ('contains unfunny', True),
 ('contains laughable', True),
 ('contains dull', True),
 ('contains worst', True),
 ('contains fantastic', True),
 ('contains allows', True),
 ('contains mess', True),
 ('contains boring', True),
 ('contains bland', True),
 ('contains jedi', True),
 ('contains superb', True),
 ('contains stupid', True),
 ('contains terrific', True),
 ('contains portrayal', True),
 ('contains memorable', True),
 ('contains zero', True),
 ('contains snake', True),
 ('contains masterpiece', True),
 ('contains terrible', True),
 ('contains badly', True),
 (

#### 1.4 Part-of-Speech Tagging

In [40]:
from nltk.corpus import brown
SuffixFD = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    SuffixFD[word[-1]] += 1
    SuffixFD[word[-2:]] += 1
    SuffixFD[word[-3:]] += 1
CommonSuffix = [i[0] for i in SuffixFD.most_common(100)]
def ExPosFea(word):
    feature = {}
    for i in CommonSuffix:
        feature['endswith {}'.format(i)] = word.lower().endswith(i)
    return feature
TaggedWords = brown.tagged_words(categories='news')
PosFeaList = [(ExPosFea(n),g) for (n,g) in TaggedWords]
TrainSet,TestSet = PosFeaList[:90000],PosFeaList[90000:]
PoSClassifier = nltk.DecisionTreeClassifier.train(TrainSet)
print(nltk.classify.accuracy(PoSClassifier,TestSet))
PoSClassifier.pseudocode(depth=4)

0.6262080727686186


"if endswith the == False: \n  if endswith , == False: \n    if endswith s == False: \n      if endswith . == False: return 'AT'\n      if endswith . == True: return '.'\n    if endswith s == True: \n      if endswith was == False: return 'VBZ'\n      if endswith was == True: return 'BEDZ'\n  if endswith , == True: return ','\nif endswith the == True: return 'AT'\n"

#### 1.5 Exploiting Context

In [49]:
def ExPosConFeature(sentence,i):
    feature = {
        'suffix1':sentence[i][-1],
        'suffix2':sentence[i][-2:],
        'suffix3':sentence[i][-3:]
    }
    if i == 0:
        feature['PrevWord'] = '<START>'
    else:
        feature['PrevWord'] = sentence[i-1]
    return feature
BTaggedSent = brown.tagged_sents(categories='news')
SentFeaList = []
for sent in BTaggedSent:
    untag = nltk.tag.untag(sent)
    for i,(word,tag) in enumerate(sent):
        SentFeaList.append((ExPosConFeature(untag,i),tag))
TrainSet,TestSet = SentFeaList[:90000],SentFeaList[90000:]
PosConClassifier = nltk.NaiveBayesClassifier.train(TrainSet)
nltk.classify.accuracy(PosConClassifier,TestSet)

0.7774303581580443

#### 1.6 Sequence Classification

In [51]:
def ExPosHisFea(sentence,i,history):
    feature = {
        'suffix1':sentence[i][-1],
        'suffix2':sentence[i][-2:],
        'suffix3':sentence[i][-3:]
    }
    if i == 0:
        feature['PrevWord'] = '<START>'
        feature['PrevTag'] = '<START>'
    else:
        feature['PrevWord'] = sentence[i-1]
        feature['PrevTag'] = history[i-1]
    return feature 
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self,TrainSent):
        TrainList = []
        for sent in TrainSet:
            untag = nltk.tag.untag(sent)
            history = []
            for i,(word,tag) in enumerate(sent):
                feature = ExPosHisFea(untag,i,history)
                TrainList.append((feature,tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(TrainList)
    def tag(self,sentence):
        history = []
        for i,word in enumerate(sentence):
            feature = ExPosHisFea(sentence,i,history)
            tag = self.classifier.classify(feature)
            history.append(tag)
        return zip(sentence,history)
TrainSet,TestSet = BTaggedSent[:3800],BTaggedSent[3800:]
ConsecutiveTagger = ConsecutivePosTagger(TrainSet)
ConsecutiveTagger.evaluate(TestSet)

0.7782902322516884

#### 1.7 Other Methods for Sequence Classification
### 2. Further Examples of Supervised Classification
#### 2.1 Sentence Segmentation

In [55]:
sents = nltk.corpus.treebank_raw.sents()
tokens = list()
boundary = set()
offset = 0
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundary.add(offset-1)
def ExPunctFea(tokens,i):
    feature = {
        'next word capitalized':tokens[i+1][0].isupper(),
        'prev-word':tokens[i-1].lower(),
        'punct':tokens[i], 
        'prev word one char': len(tokens[i-1]) == 1
    }
    return feature
PunctFeaList = [(ExPunctFea(tokens,i),(i in boundary)) for i in range(1,len(tokens)-1) if tokens[i] in '.?!' ]
TrainSet,TestSet = PunctFeaList[:5000],PunctFeaList[5000:]
PunctClassifier = nltk.NaiveBayesClassifier.train(TrainSet)
nltk.classify.accuracy(PunctClassifier,TestSet)

0.9578059071729957

#### 2.2 Identifying Dialogue Act Types

In [65]:
posts = nltk.corpus.nps_chat.xml_posts()[:3000]
def ExDialogFea(post):
    feature = {}
    for i in nltk.word_tokenize(post):
        feature['contain {}'.format(i.lower())] = True
    return feature
DialogueFeaList = [(ExDialogFea(post.text),post.get('class')) for post in posts]
TrainSet,TestSet = DialogueFeaList[:2700],DialogueFeaList[2700:]
DialogueClassifier = nltk.NaiveBayesClassifier.train(TrainSet)
nltk.classify.accuracy(DialogueClassifier,TestSet)

0.6433333333333333

#### 2.3 Recognizing Textual Entailment

In [67]:
def ExRTEFea(pair):
    extractor = nltk.RTEFeatureExtractor(pair)
    feature = {}
    feature['word overlap'] = extractor.overlap('word')
    feature['word hyp extra'] = extractor.hyp_extra('word')
    feature['ne overlap'] = extractor.overlap('ne')
    feature['ne hyp extra'] = extractor.hyp_extra('ne')
    return feature

#### 2.4 Scaling Up to Large Datasets
### 3. Evaluation
#### 3.1 The Test Set
#### 3.2 Accuracy
#### 3.3 Precision and Recall
#### 3.4 Confusion Matrices
#### 3.5 Cross-Validation
### 4. Decision Trees
#### 4.1 Entropy and Information Gain
### 5. Naive Bayes Classifiers
#### 5.1 Underlying Probabilistic Model
#### 5.2 Zero Counts and Smoothing
#### 5.3 Non-Binary Features
#### 5.4 The Naivete of Independence
#### 5.5 The Cause of Double-Counting
### 6. Maximum Entropy Classifiers
#### 6.1 The Maximum Entropy Model
#### 6.2 Maximizing Entropy
#### 6.3 Generative vs Conditional Classifiers
### 7. Modeling Linguistic Patterns
#### 7.1 What do models tell us?
### 8. Summary
### 9. Further Reading
### 10. Exercises

#### 1. Read up on one of the language technologies mentioned in this section, such as word sense disambiguation, semantic role labeling, question answering, machine translation, named entity detection. Find out what type and quantity of annotated data is required for developing such systems. Why do you think a large amount of data is required?
#### 2. Using any of the three classifiers described in this chapter, and any features you can think of, build the best name gender classifier you can. Begin by splitting the Names Corpus into three subsets: 500 words for the test set, 500 words for the dev-test set, and the remaining 6900 words for the training set. Then, starting with the example name gender classifier, make incremental improvements. Use the dev-test set to check your progress. Once you are satisfied with your classifier, check its final performance on the test set. How does the performance on the test set compare to the performance on the dev-test set? Is this what you'd expect?
#### 3. The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. It contains data for four words: hard, interest, line, and serve. Choose one of these four words, and load the corresponding data:
```
from nltk.corpus import senseval
instances = senseval.instances('hard.pos')
size = int(len(instances) * 0.1)
train_set, test_set = instances[size:], instances[:size]
```
Using this dataset, build a classifier that predicts the correct sense tag for a given instance. See the corpus HOWTO at `http://nltk.org/howto` for information on using the instance objects returned by the Senseval 2 Corpus.

In [95]:
from nltk.corpus import senseval
instances = senseval.instances('hard.pos')
def ExInsFea(instance):
    feature = {
        'tag':instance.context[instance.position][1],
        #'position':instance.position
    }
    if instance.position == 0:
        feature['prev-tag'] = '<START>',
        feature['prev-word'] = '<START>',
        feature['prev-2-tag'] = '<START>',
        feature['prev-2-word'] = '<START>'
    elif instance.position == 1:
        feature['prev-tag'] = instance.context[instance.position-1][1],
        feature['prev-word'] = instance.context[instance.position-1][0],
        feature['prev-2-tag'] = '<START>',
        feature['prev-2-word'] = '<START>'
    else:
        feature['prev-tag'] = instance.context[instance.position-1][1]
        feature['prev-word'] = instance.context[instance.position-1][0]
    return feature
InsFeaList = [(ExInsFea(instance),instance.senses) for instance in instances]
random.shuffle(InsFeaList)
TrainSet,TestSet = InsFeaList[:3800],InsFeaList[3800:]
InsClassifier = nltk.MaxentClassifier.train(TrainSet)
nltk.classify.accuracy(InsClassifier,TestSet)

==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.087
             2          -0.33702        0.800
             3          -0.31773        0.822
             4          -0.30403        0.833
             5          -0.29450        0.841
             6          -0.28737        0.847
             7          -0.28178        0.854
             8          -0.27724        0.854
             9          -0.27349        0.856
            10          -0.27034        0.856
            11          -0.26765        0.856
            12          -0.26533        0.856
            13          -0.26331        0.856
            14          -0.26153        0.856
            15          -0.25995        0.856
            16          -0.25855        0.856
            17          -0.25728        0.856
            18          -0.25613        0.856
            19          -0.25509        0.856
   

0.8142589118198874

#### 4. Using the movie review document classifier discussed in this chapter, generate a list of the 30 features that the classifier finds to be most informative. Can you explain why these particular features are informative? Do you find any of them surprising?
Select one of the classification tasks described in this chapter, such as name gender detection, document classification, part-of-speech tagging, or dialog act classification. #### 5. Using the same training and test data, and the same feature extractor, build three classifiers for the task: a decision tree, a naive Bayes classifier, and a Maximum Entropy classifier. Compare the performance of the three classifiers on your selected task. How do you think that your results might be different if you used a different feature extractor?
#### 6. The synonyms strong and powerful pattern differently (try combining them with chip and sales). What features are relevant in this distinction? Build a classifier that predicts when each word should be used.

In [100]:
SynoSent = []
SynoSent.append([i for i in brown.tagged_sents() if 'powerful' in [j[0].lower() for j in i]])
SynoSent.append([i for i in brown.tagged_sents() if 'strong' in [j[0].lower() for j in i]])
def ExSynoFea(sent):
    feature = {}
    untag = [j[0].lower() for j in sent]
    if 'powerful' in untag:
        index = untag.index('powerful')
    else:
        index = untag.index('strong')
    if index == 0 :
        feature['prev-tag'] = '<START>'
        feature['prev-word'] = '<START>'
        feature['next-tag'] = sent[1][1]
        feature['next-word'] = sent[1][0]
    elif index ==  len(sent):
        feature['prev-tag'] = sent[-1][1]
        feature['prev-word'] = sent[-1][0]
        feature['next-tag'] = '<END>'
        feature['next-word'] = '<END>'
    else:
        feature['prev-tag'] = sent[index-1][1]
        feature['prev-word'] = sent[index-1][0]
        feature['next-tag'] = sent[index+1][1]
        feature['next-word'] = sent[index+1][0]
    return feature
SynoFeaList = []
for i in SynoSent[0]:
    SynoFeaList.append((ExSynoFea(i),'P'))
for i in SynoSent[1]:
    SynoFeaList.append((ExSynoFea(i),'S'))
random.shuffle(SynoFeaList)
TrainSet,TestSet = SynoFeaList[:200],SynoFeaList[200:]
SynoClassifier = nltk.MaxentClassifier.train(TrainSet)
nltk.classify.accuracy(SynoClassifier,TestSet)

==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.740
             2          -0.43569        0.780
             3          -0.37626        0.845
             4          -0.33564        0.875
             5          -0.30509        0.930
             6          -0.28098        0.935
             7          -0.26137        0.940
             8          -0.24505        0.945
             9          -0.23122        0.945
            10          -0.21935        0.950
            11          -0.20902        0.950
            12          -0.19995        0.950
            13          -0.19192        0.950
            14          -0.18476        0.950
            15          -0.17832        0.950
            16          -0.17250        0.950
            17          -0.16722        0.955
            18          -0.16240        0.955
            19          -0.15798        0.955
   

0.8421052631578947

#### 7. The dialog act classifier assigns labels to individual posts, without considering the context in which the post is found. However, dialog acts are highly dependent on context, and some sequences of dialog act are much more likely than others. For example, a `ynQuestion` dialog act is much more likely to be answered by a `yanswer` than by a `greeting`. Make use of this fact to build a consecutive classifier for labeling dialog acts. Be sure to consider what features might be useful. See the code for the consecutive classifier for part-of-speech tags in 1.7 to get some ideas.

In [23]:
from nltk.corpus import nps_chat
import nltk
def ExDialogFea(posts,i):
    feature = {}
    for word in nltk.word_tokenize(posts[i].text):
        feature['contain {}'.format(word.lower())] = True
    if i == 0 :
        feature['prev tag'] = '<START>'
    else:
        feature['prev tag'] = posts[i-1].get('class')
    return feature
PostsFeaList = []
posts = nps_chat.xml_posts()
for i in range(len(posts)):
    feature = ExDialogFea(posts,i)
    PostsFeaList.append((feature,posts[i].get('class')))
TrainSet,TestSet = PostsFeaList[:9000],PostsFeaList[9000:]
PostClassifier = nltk.MaxentClassifier.train(TrainSet)
nltk.classify.accuracy(PostClassifier,TestSet)

==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.70805        0.052
             2          -1.20576        0.788
             3          -0.85297        0.828
             4          -0.68715        0.863
             5          -0.58516        0.886
             6          -0.51542        0.901
             7          -0.46421        0.913
             8          -0.42467        0.922
             9          -0.39298        0.929
            10          -0.36689        0.935
            11          -0.34494        0.939
            12          -0.32615        0.942
            13          -0.30983        0.945
            14          -0.29551        0.949
            15          -0.28280        0.950
            16          -0.27144        0.953
            17          -0.26120        0.955
            18          -0.25192        0.956
            19          -0.24346        0.958
   

0.7925973197192087

#### 8. Word features can be very useful for performing document classification, since the words that appear in a document give a strong indication about what its semantic content is. However, many words occur very infrequently, and some of the most informative words in a document may never have occurred in our training data. One solution is to make use of a lexicon, which describes how different words relate to one another. Using WordNet lexicon, augment the movie review document classifier presented in this chapter to use features that generalize the words that appear in a document, making it more likely that they will match words found in the training data.
#### 9. The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the corpus is encoded as a `PPAttachment` object:
```
from nltk.corpus import ppattach
ppattach.attachments('training')
PPAttachment(sent='0', verb='join', noun1='board',
              prep='as', noun2='director', attachment='V'),
PPAttachment(sent='1', verb='is', noun1='chairman',
              prep='of', noun2='N.V.', attachment='N'),
 ...]
inst = ppattach.attachments('training')[1]
(inst.noun1, inst.prep, inst.noun2)
('chairman', 'of', 'N.V.')
```
Select only the instances where `inst.attachment` is `N`:
```
nattach = [inst for inst in ppattach.attachments('training')
           if inst.attachment == 'N']
```
Using this sub-corpus, build a classifier that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers," the classifier should predict the preposition "of". See the corpus `HOWTO` at `http://nltk.org/howto` for more information on using the PP attachment corpus.

In [32]:
from nltk.corpus import ppattach
def ExPPFea(instance):
    feature = {
        'noun1':instance.noun1, #['verb']:instance.verb,
        'noun2':instance.noun2
    }
    return feature
TrainSet = [(ExPPFea(i),i.prep) for i in ppattach.attachments('training') ]
TestSet = [(ExPPFea(i),i.prep)  for i in ppattach.attachments('test') ]
PPClassifer = nltk.MaxentClassifier.train(TrainSet)
nltk.classify.accuracy(PPClassifer,TestSet)
PPClassifer.classify({'noun1':'team',
        'noun2':'reasearcher'})

==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -4.30407        0.003
             2          -1.29282        0.799
             3          -0.89499        0.838
             4          -0.72269        0.850
             5          -0.62480        0.859
             6          -0.56096        0.864
             7          -0.51566        0.869
             8          -0.48163        0.872
             9          -0.45499        0.874
            10          -0.43348        0.876
            11          -0.41568        0.878
            12          -0.40066        0.879
            13          -0.38779        0.880
            14          -0.37662        0.881
            15          -0.36680        0.882
            16          -0.35809        0.883
            17          -0.35030        0.884
            18          -0.34328        0.884
            19          -0.33692        0.885
   

'against'

#### 10. Suppose you wanted to automatically generate a prose description of a scene, and already had a word to uniquely describe each entity, such as the jar, and simply wanted to decide whether to use in or on in relating various items, e.g. the book is in the cupboard vs the book is on the shelf. Explore this issue by looking at corpus data; writing programs as needed.    		
##### a. in the car versus on the train
##### b. in town versus on campus
##### c. in the picture versus on the screen
##### d. in Macbeth versus on Letterman