# 6.2 Further Examples of Supervised Classification

#### Sentence Segmentation

In [1]:
from nltk.corpus import names
from nltk.classify import apply_features
import random
import nltk

In [2]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [3]:
print tokens



In [4]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [5]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
    for i in range(1, len(tokens)-1)
    if tokens[i] in '.?!']

In [6]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [7]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in words:
        if word in '.?!' and classifier.classify(words, i) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])

#### Identifying Dialogue Act Types

In [8]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [9]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features

In [10]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
    for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)

0.668


#### Recognizing Textual Entailment

In [12]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    print extractor
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [13]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print extractor.text_words

set(['Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'was', 'that', 'republics', 'former', 'Co', 'representing', 'Russia', 'Parviz', 'central', 'meeting', 'together', 'binds', 'terrorism.'])


In [14]:
print rtepair

<RTEPair: gid=3-34>


In [15]:
rte_features(rtepair)

<nltk.classify.rte_classify.RTEFeatureExtractor object at 0x7f146b928810>


{'ne_hyp_extra': 1, 'ne_overlap': 1, 'word_hyp_extra': 1, 'word_overlap': 0}

In [17]:
print extractor.overlap('word')

set([])


In [18]:
print extractor.overlap('ne')

set(['China'])


In [19]:
print extractor.hyp_extra('word')

set(['member'])
