In [1]:
# https://www.nltk.org/book/ch06.html

In [44]:
def gender_features(word):
    return {
        'suffix1': word[-1:],
        'suffix2': word[-2:]
    }

In [10]:
import nltk

In [3]:
from nltk.coarpus import names

In [4]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                [(name, 'female') for name in names.words('female.txt')])

In [5]:
import random

In [6]:
random.shuffle(labeled_names)

In [17]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

In [18]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [19]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [12]:
classifier.classify(gender_features('Neo'))

'male'

In [13]:
classifier.classify(gender_features('Trinity'))

'female'

In [20]:
nltk.classify.accuracy(classifier, test_set)

0.718

In [21]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     38.5 : 1.0
             last_letter = 'k'              male : female =     32.5 : 1.0
             last_letter = 'f'              male : female =     16.1 : 1.0
             last_letter = 'p'              male : female =     12.7 : 1.0
             last_letter = 'v'              male : female =     10.6 : 1.0


In [22]:
from nltk.classify import apply_features

In [23]:
train_set = apply_features(gender_features, labeled_names[500:])

In [24]:
test_set = apply_features(gender_features, labeled_names[:500])

In [26]:
def gender_features2(name):
    features = {}
    features['first_letter'] = name[0].lower()
    features['last_letter'] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(letter)] = name.lower().count(letter)
        features['has({})'.format(letter)] = (letter in name.lower())
    return features

In [27]:
gender_features2('John')

{'first_letter': 'j',
 'last_letter': 'n',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 0,
 'has(e)': False,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 1,
 'has(j)': True,
 'count(k)': 0,
 'has(k)': False,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 1,
 'has(n)': True,
 'count(o)': 1,
 'has(o)': True,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 0,
 'has(r)': False,
 'count(s)': 0,
 'has(s)': False,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [28]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

In [29]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [31]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [32]:
nltk.classify.accuracy(classifier, test_set)

0.728

In [33]:
train_names = labeled_names[1500:]

In [34]:
devtest_names = labeled_names[500:1500]

In [35]:
test_names = labeled_names[:500]

In [36]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]

In [37]:
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]

In [38]:
test_set = [(gender_features(n), gender) for (n, gender) in test_names]

In [39]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [40]:
nltk.classify.accuracy(classifier, devtest_set)

0.797

In [41]:
errors = []

In [42]:
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag, guess, name))

In [43]:
for tag, guess, name in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Adrien                        
correct=female   guess=male     name=Aeriel                        
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Alix                          
correct=female   guess=male     name=Ardelis                       
correct=female   guess=male     name=Beatrix                       
correct=female   guess=male     name=Bell                          
correct=female   guess=male     name=Berget                        
correct=female   guess=male     name=Blair                         
correct=female   guess=male     name=Bo                            
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Cleo                          
correct=female   guess=male     name=Daloris                       
correct=female   guess=male     name=Debor      

In [45]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]

In [46]:
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]

In [47]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [48]:
nltk.classify.accuracy(classifier, devtest_set)

0.788

In [27]:
from nltk.corpus import movie_reviews

In [53]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [54]:
# random.shuffle(documents)

In [55]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [56]:
word_features = [word for word, _ in all_words.most_common(2000)]

In [57]:
word_features[:20]

[',',
 'the',
 '.',
 'a',
 'and',
 'of',
 'to',
 "'",
 'is',
 'in',
 's',
 '"',
 'it',
 'that',
 '-',
 ')',
 '(',
 'as',
 'with',
 'for']

In [32]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [55]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(plot)': True,
 'contains(:)': True,
 'contains(two)': True,
 'contains(teen)': False,
 'contains(couples)': False,
 'contains(go)': False,
 'contains(to)': True,
 'contains(a)': True,
 'contains(church)': False,
 'contains(party)': False,
 'contains(,)': True,
 'contains(drink)': False,
 'contains(and)': True,
 'contains(then)': True,
 'contains(drive)': False,
 'contains(.)': True,
 'contains(they)': True,
 'contains(get)': True,
 'contains(into)': True,
 'contains(an)': True,
 'contains(accident)': False,
 'contains(one)': True,
 'contains(of)': True,
 'contains(the)': True,
 'contains(guys)': False,
 'contains(dies)': False,
 'contains(but)': True,
 'contains(his)': True,
 'contains(girlfriend)': True,
 'contains(continues)': False,
 'contains(see)': False,
 'contains(him)': True,
 'contains(in)': True,
 'contains(her)': False,
 'contains(life)': False,
 'contains(has)': True,
 'contains(nightmares)': False,
 'contains(what)': True,
 "contains(')": True,
 'contains(s)': T

In [58]:
featuresets = [(document_features(d), c) for (d, c) in documents]

In [52]:
len(documents)

2000

In [59]:
train_set, test_set = featuresets[100:], featuresets[:100]

In [63]:
train_set[0]

({'contains(,)': True,
  'contains(the)': True,
  'contains(.)': True,
  'contains(a)': True,
  'contains(and)': True,
  'contains(of)': True,
  'contains(to)': True,
  "contains(')": True,
  'contains(is)': True,
  'contains(in)': True,
  'contains(s)': True,
  'contains(")': True,
  'contains(it)': True,
  'contains(that)': True,
  'contains(-)': True,
  'contains())': True,
  'contains(()': True,
  'contains(as)': True,
  'contains(with)': True,
  'contains(for)': True,
  'contains(his)': True,
  'contains(this)': True,
  'contains(film)': True,
  'contains(i)': True,
  'contains(he)': True,
  'contains(but)': True,
  'contains(on)': True,
  'contains(are)': True,
  'contains(t)': True,
  'contains(by)': True,
  'contains(be)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(an)': True,
  'contains(who)': True,
  'contains(not)': True,
  'contains(you)': True,
  'contains(from)': True,
  'contains(at)': True,
  'contains(was)': True,
  'contains(have)': True,
 

In [60]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [61]:
nltk.classify.accuracy(classifier, test_set)

0.86

In [60]:
classifier.show_most_informative_features(5)

Most Informative Features
    contains(schumacher) = True              neg : pos    =     12.5 : 1.0
 contains(unimaginative) = True              neg : pos    =      8.4 : 1.0
        contains(sexist) = True              neg : pos    =      7.7 : 1.0
        contains(shoddy) = True              neg : pos    =      7.1 : 1.0
        contains(suvari) = True              neg : pos    =      7.1 : 1.0


In [61]:
from nltk.corpus import brown

In [62]:
suffix_fdist = nltk.FreqDist()

In [63]:
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [64]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]

In [65]:
common_suffixes

['e',
 ',',
 '.',
 's',
 'd',
 't',
 'he',
 'n',
 'a',
 'of',
 'the',
 'y',
 'r',
 'to',
 'in',
 'f',
 'o',
 'ed',
 'nd',
 'is',
 'on',
 'l',
 'g',
 'and',
 'ng',
 'er',
 'as',
 'ing',
 'h',
 'at',
 'es',
 'or',
 're',
 'it',
 '``',
 'an',
 "''",
 'm',
 ';',
 'i',
 'ly',
 'ion',
 'en',
 'al',
 '?',
 'nt',
 'be',
 'hat',
 'st',
 'his',
 'th',
 'll',
 'le',
 'ce',
 'by',
 'ts',
 'me',
 've',
 "'",
 'se',
 'ut',
 'was',
 'for',
 'ent',
 'ch',
 'k',
 'w',
 'ld',
 '`',
 'rs',
 'ted',
 'ere',
 'her',
 'ne',
 'ns',
 'ith',
 'ad',
 'ry',
 ')',
 '(',
 'te',
 '--',
 'ay',
 'ty',
 'ot',
 'p',
 'nce',
 "'s",
 'ter',
 'om',
 'ss',
 ':',
 'we',
 'are',
 'c',
 'ers',
 'uld',
 'had',
 'so',
 'ey']

In [66]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith({})'.format(suffix)] = word.lower().endswith(suffix)
    return features

In [67]:
tagged_words = brown.tagged_words(categories='news')

In [68]:
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]

In [69]:
size = int(len(featuresets) * 0.1)

In [70]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [71]:
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [72]:
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [73]:
classifier.classify(pos_features('cats'))

'NNS'

In [75]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



In [85]:
def pos_features(sentence, i, history):
    features = {
        'suffix(1)': sentence[i][-1:],
        'suffix(2)': sentence[i][-2:],
        'suffix(3)': sentence[i][-3:],
    }
    if i == 0:
        features['prev-word'] = '<START>'
        features['prev-tag'] = '<START>'
    else:
        features['prev-word'] = sentence[i-1]
        features['prev-tag'] = history[i-1]
    return features

In [77]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [78]:
tagged_sents = brown.tagged_sents(categories='news')

In [79]:
featuresets = []

In [80]:
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))

In [81]:
size = int(len(featuresets)*0.1)

In [82]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [83]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [84]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [86]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [87]:
tagged_sents = brown.tagged_sents(categories='news')

In [88]:
size = int(len(tagged_sents)*0.1)

In [89]:
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]

In [90]:
tagger = ConsecutivePosTagger(train_sents)

In [91]:
tagger.evaluate(test_sents)

0.7980528511821975

In [92]:
sents = nltk.corpus.treebank_raw.sents()

In [93]:
tokens = []

In [94]:
boundaries = set()

In [95]:
offset = 0

In [96]:
for sent in sents:
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [97]:
def punct_features(tokens, i):
    return {
        'next-word-capitalized': tokens[i+1][0].isupper(),
        'prev-word': tokens[i-1].lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(tokens[i-1]) == 1
    }

In [98]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
              for i in range(1, len(tokens)-1)
              if tokens[i] in '.?!']

In [99]:
size = int(len(featuresets)*0.1)

In [100]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [101]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [102]:
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [104]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in enumerate(words):
        if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])
    return sents

In [105]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [106]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

In [107]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
              for post in posts]

In [108]:
size = int(len(featuresets)*0.1)

In [109]:
train_set, test_set = featuresets[size:], featuresets[:size]

In [110]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [111]:
nltk.classify.accuracy(classifier, test_set)

0.668

In [112]:
def rte_features(rtepair):
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [113]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]

In [114]:
extractor = nltk.RTEFeatureExtractor(rtepair)

In [115]:
extractor.text_words

{'Asia',
 'China',
 'Co',
 'Davudi',
 'Iran',
 'Organisation',
 'Parviz',
 'Russia',
 'SCO',
 'Shanghai',
 'Soviet',
 'association',
 'at',
 'binds',
 'central',
 'fight',
 'fledgling',
 'former',
 'four',
 'meeting',
 'operation',
 'representing',
 'republics',
 'terrorism.',
 'that',
 'together',
 'was'}

In [116]:
extractor.hyp_words

{'China', 'SCO.', 'member'}

In [118]:
extractor.overlap('word')

set()

In [119]:
extractor.overlap('ne')

{'China'}

In [120]:
extractor.hyp_extra('word')

{'member'}

In [1]:
import random

In [2]:
import nltk

In [3]:
from nltk.corpus import brown

In [64]:
tagged_sents = list(brown.tagged_sents(categories='news'))

In [5]:
random.shuffle(tagged_sents)

In [6]:
size = int(len(tagged_sents) * 0.1)

In [65]:
train_set, test_set = tagged_sents[size:], tagged_sents[:size]

In [8]:
file_ids = brown.fileids(categories='news')

In [9]:
size = int(len(file_ids) * 0.1)

In [16]:
train_set = brown.tagged_sents(file_ids[size:])

In [17]:
test_set = brown.tagged_sents(file_ids[:size])

In [19]:
train_set = brown.tagged_sents(categories='news')

In [20]:
test_set = brown.tagged_sents(categories='fiction')

In [67]:
train_set

[[('The', 'AT'),
  ('jury', 'NN'),
  ('said', 'VBD'),
  ('it', 'PPS'),
  ('did', 'DOD'),
  ('find', 'VB'),
  ('that', 'CS'),
  ('many', 'AP'),
  ('of', 'IN'),
  ("Georgia's", 'NP$'),
  ('registration', 'NN'),
  ('and', 'CC'),
  ('election', 'NN'),
  ('laws', 'NNS'),
  ('``', '``'),
  ('are', 'BER'),
  ('outmoded', 'JJ'),
  ('or', 'CC'),
  ('inadequate', 'JJ'),
  ('and', 'CC'),
  ('often', 'RB'),
  ('ambiguous', 'JJ'),
  ("''", "''"),
  ('.', '.')],
 [('It', 'PPS'),
  ('recommended', 'VBD'),
  ('that', 'CS'),
  ('Fulton', 'NP'),
  ('legislators', 'NNS'),
  ('act', 'VB'),
  ('``', '``'),
  ('to', 'TO'),
  ('have', 'HV'),
  ('these', 'DTS'),
  ('laws', 'NNS'),
  ('studied', 'VBN'),
  ('and', 'CC'),
  ('revised', 'VBN'),
  ('to', 'IN'),
  ('the', 'AT'),
  ('end', 'NN'),
  ('of', 'IN'),
  ('modernizing', 'VBG'),
  ('and', 'CC'),
  ('improving', 'VBG'),
  ('them', 'PPO'),
  ("''", "''"),
  ('.', '.')],
 [('The', 'AT'),
  ('grand', 'JJ'),
  ('jury', 'NN'),
  ('commented', 'VBD'),
  ('on', 'IN

In [66]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

ValueError: too many values to unpack (expected 2)

In [26]:
train_set

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.'),
 ('The', 'AT'),
 ('jury', 'NN'),
 ('further', 'RBR'),
 ('said', 'VBD'),
 ('in', 'IN'),
 ('term-end', 'NN'),
 ('presentments', 'NNS'),
 ('that', 'CS'),
 ('the', 'AT'),
 ('City', 'NN-TL'),
 ('Executive', 'JJ-TL'),
 ('Committee', 'NN-TL'),
 (',', ','),
 ('which', 'WDT'),
 ('had', 'HVD'),
 ('over-all', 'JJ'),
 ('charge', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('election', 'NN'),
 (',', ','),
 ('``', '``'),
 ('deserves', 'VBZ'),
 ('the', 'AT'),
 ('praise', 'NN'),
 ('and', 'CC'),
 ('thanks', 'NNS'),
 ('of', 'IN'),
 ('the', '

In [68]:
def tag_list(tagged_sents):
    return [tag for sent in tagged_sents for (word, tag) in sent]

In [69]:
def apply_tagger(tagger, corpus):
    return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]

In [70]:
gold = tag_list(brown.tagged_sents(categories='editorial'))

In [71]:
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))

NameError: name 't2' is not defined

In [72]:
import math

In [74]:
def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(l) for l in freqdist]
    return -sum(p*math.log(p, 2) for p in probs)

In [75]:
entropy(['male', 'female', 'male', 'male'])

0.8112781244591328