## 1 Supervised classification:

##### Gender identification:

In [76]:
import nltk

def gender_features(name):
    return {'last_letter': name[-1]
            #, 'first_letter': word[0]
            #, 'name_length': len(word)
           } # builds a feature set: human-readable feature name and value pair

gender_features('Shrek')

{'last_letter': 'k'}

In [77]:
def gender_features2(name):
    features = {}
    features['first_letter'] = name[0].lower()
    features['last_letter'] = name[-1].lower()
    for x in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(x)] = name.lower().count(x)
        features['has({})'.format(x)] = (x in name.lower())
    return features

In [78]:
from nltk.corpus import names

labeled_names = ([(x, 'male') for x in names.words('male.txt')] +
                [(x, 'female') for x in names.words('female.txt')])

import random
random.shuffle(labeled_names)
labeled_names[:5]

[('Kasey', 'female'),
 ('Hollis', 'male'),
 ('Berk', 'male'),
 ('Hatti', 'female'),
 ('Aime', 'female')]

In [79]:
nltk.FreqDist([y for x, y in labeled_names])

FreqDist({'female': 5001, 'male': 2943})

In [80]:
featuresets = [(gender_features2(x), y) for x, y in labeled_names]

#train_set, test_set = featuresets[500:], featuresets[:500] # When feature set is relatively small

In [81]:
# When feature set is large

from nltk.classify import apply_features

train_set = apply_features(gender_features2, labeled_names[500:])
test_set = apply_features(gender_features2, labeled_names[:500])

In [82]:
len(featuresets), len(train_set)

(7944, 7444)

In [83]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

name = 'Trinity'
print('{} is a '.format(name) + classifier.classify(gender_features2(name)))

Trinity is a female


In [84]:
print(nltk.classify.accuracy(classifier, test_set))

0.77


Find most the most effective features:

In [85]:
classifier.show_most_informative_features(6) # likelihood ratios

Most Informative Features
             last_letter = 'a'            female : male   =     37.0 : 1.0
             last_letter = 'k'              male : female =     31.9 : 1.0
             last_letter = 'f'              male : female =     17.1 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0
             last_letter = 'd'              male : female =      9.8 : 1.0
                count(v) = 2              female : male   =      8.9 : 1.0


##### Choosing the right features:

In [86]:
def gender_features2(name):
    features = {}
    features['first_letter'] = name[0].lower()
    features['last_letter'] = name[-1].lower()
    for x in 'abcdefghijklmnopqrstuvwxyz':
        features['count({})'.format(x)] = name.lower().count(x)
        features['has({})'.format(x)] = (x in name.lower())
    return features

In [87]:
#gender_features2('John')

In [88]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
#len(train_names), len(devtest_names), len(test_names)

Apply features to the dataset:

In [89]:
train_set = apply_features(gender_features, train_names)
devtest_set = apply_features(gender_features, devtest_names)
test_set = apply_features(gender_features, test_names)

Train the classifier:

In [90]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.786


Review errors:

In [91]:
errors = []

for x, y in devtest_names:
    guess = classifier.classify(gender_features(x))
    if guess != y:
        errors.append((y, guess, x))
        
for (y, guess, x) in sorted(errors)[:10]:
    print('correct={:<8}' 'guess={:<8s}' 'name={:<30}'.format(y, guess, x))

correct=female  guess=male    name=Addis                         
correct=female  guess=male    name=Alis                          
correct=female  guess=male    name=Allis                         
correct=female  guess=male    name=Anais                         
correct=female  guess=male    name=Annabel                       
correct=female  guess=male    name=Annabell                      
correct=female  guess=male    name=Aryn                          
correct=female  guess=male    name=Beret                         
correct=female  guess=male    name=Bliss                         
correct=female  guess=male    name=Brigit                        


Update features:

In [92]:
def gender_features(name):
    return {'suffix1': name[-1],
            'suffix2': name[-2:]}

gender_features('Ailyn')

{'suffix1': 'n', 'suffix2': 'yn'}

Retrain the classifier:

In [93]:
train_set = apply_features(gender_features, train_names)
devtest_set = apply_features(gender_features, devtest_names)
test_set = apply_features(gender_features, test_names)

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.802


In [94]:
classifier.show_most_informative_features(6)

Most Informative Features
                 suffix2 = 'na'           female : male   =     85.0 : 1.0
                 suffix2 = 'la'           female : male   =     67.3 : 1.0
                 suffix2 = 'ra'           female : male   =     53.4 : 1.0
                 suffix2 = 'ia'           female : male   =     48.7 : 1.0
                 suffix1 = 'a'            female : male   =     36.1 : 1.0
                 suffix1 = 'k'              male : female =     27.8 : 1.0


Final accuracy test:

In [95]:
print(nltk.classify.accuracy(classifier, test_set))
name = 'Neo'
print('{} is a '.format(name) + classifier.classify(gender_features2(name)))

0.778
Neo is a female


#### Decision tree entropy and information gain:

In [96]:
import math

def entropy(labels):
    freqdist = nltk.FreqDist(labels)
    probs = [freqdist.freq(x) for x in freqdist]
    return -sum(x * math.log(x,2) for x in probs)

In [97]:
labels1 = ['female', 'male', 'male', 'male']
freqdist = nltk.FreqDist(labels1)
freqdist.most_common()

[('male', 3), ('female', 1)]

In [98]:
for x in freqdist:
    print(x)
    
freqdist.freq('male')

male
female


0.75

In [99]:
entropy(labels1)

0.8112781244591328

### Document classification:

##### Based on word frequency (Naive Bayes):

In [100]:
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(y)), x)
            for x in movie_reviews.categories()
            for y in movie_reviews.fileids(x)]

random.shuffle(documents)

In [101]:
all_words = nltk.FreqDist(x.lower() for x in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document) # checking an 'IF x in y' statement is faster in a set than in a list
    features = {}
    for x in word_features:
        features['contains({})'.format(x)] = (x in document_words)
    return features

#print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

In [236]:
#data_length = len(documents)
data_length = int(len(documents) * 0.9)

train_set = apply_features(document_features, documents[:data_length])
test_set = apply_features(document_features, documents[data_length:])

len(train_set), len(test_set)

(1800, 200)

In [237]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [238]:
nltk.classify.accuracy(classifier, test_set)

0.835

In [239]:
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.4 : 1.0
         contains(mulan) = True              pos : neg    =      8.3 : 1.0
        contains(wasted) = True              neg : pos    =      7.6 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.3 : 1.0
        contains(seagal) = True              neg : pos    =      6.2 : 1.0


In [242]:
errors = []

for x, y in documents[int(data_length * 0.9):]:
    guess = classifier.classify(document_features(x))
    if guess != y:
        errors.append((y, guess, x))
        
#for (y, guess, x) in sorted(errors):
#    print('correct=: ', y, 'guess=: ', guess, 'document= ', x)

##### Based on word structure (Decision Tree):

In [245]:
from nltk.corpus import brown

suffix_fdist = nltk.FreqDist()
for x in brown.words(): # compile suffix freq dist
    suffix_fdist[x[-1:]] += 1
    suffix_fdist[x[-2:]] += 1
    suffix_fdist[x[-3:]] += 1

In [246]:
common_suffixes = [x for x, y in suffix_fdist.most_common(100)]
print(common_suffixes[:10])

['e', ',', '.', 's', 'd', 't', 'n', 'he', 'of', 'a']


In [247]:
# Define feature extractor:

def pos_features(word):
    features = {}
    for x in common_suffixes:
        features['endswith({})'.format(x)] = word.lower().endswith(x)
    return features

In [244]:
# Create feature set for each data point

tagged_words = brown.tagged_words(categories='news')
#featuresets = [(pos_features(x), y) for x, y in tagged_words]

In [250]:
# Split into training and testing sets

size = int(len(tagged_words) * 0.1)

#train_set, test_set = featuresets[size:], featuresets[:size]
train_set = apply_features(pos_features, tagged_words[size:])
test_set = apply_features(pos_features, tagged_words[:size])

len(train_set), len(test_set)

(90499, 10055)

In [252]:
# Train classifier

classifier = nltk.DecisionTreeClassifier.train(train_set[:1000])

In [253]:
nltk.classify.accuracy(classifier, test_set[:100])

0.64

In [254]:
classifier.classify(pos_features('cats'))

'NNS'

In [255]:
help(nltk.DecisionTreeClassifier.train)

Help on function train in module nltk.classify.decisiontree:

train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10, binary=False, feature_values=None, verbose=False)
    :param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.



In [256]:
# Display tree structure

print(classifier.pseudocode(depth=4))

if endswith(he) == False: 
  if endswith(s) == False: 
    if endswith(.) == False: 
      if endswith(,) == False: return 'WDT'
      if endswith(,) == True: return ','
    if endswith(.) == True: return '.'
  if endswith(s) == True: 
    if endswith(as) == False: 
      if endswith(is) == False: return 'NNS'
      if endswith(is) == True: return 'BEZ'
    if endswith(as) == True: 
      if endswith(was) == False: return 'HVZ'
      if endswith(was) == True: return 'BEDZ'
if endswith(he) == True: 
  if endswith(the) == False: return 'PPS'
  if endswith(the) == True: return 'AT'



##### Based on word structure and previous word (Naive Bayes):

In [257]:
# Define context-dependant feature extractor

def pos_features_context(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features
               
pos_features_context(brown.sents()[0], 8)                     

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [258]:
# Build a labeled featureset for every word in every sentence

tagged_sents = brown.tagged_sents(categories='news')
#featuresets = []

#for x in tagged_sents:
#    untagged_sent = nltk.tag.untag(x)
#    for i, (word, tag) in enumerate(x):
#        featuresets.append( (pos_features_context(untagged_sent, i), tag) )        

In [259]:
def featureset_buider(tagged_sents):
    featuresets = []
    for x in tagged_sents:
        untagged_sent = nltk.tag.untag(x)
        for i, (word, tag) in enumerate(x):
            featuresets.append( (pos_features_context(untagged_sent, i), tag) )        

In [266]:
featureset_buider(tagged_sents[size:])

In [267]:
featuresets[0]

({'suffix(1)': 'e',
  'suffix(2)': 'he',
  'suffix(3)': 'The',
  'prev-word': '<START>'},
 'AT')

In [269]:
# Prepare train and test sets

size =  int(len(featuresets) * 0.1)

#train_set = apply_features(featureset_buider, tagged_sents[size:])
#test_set = apply_features(featureset_buider, tagged_sents[:size])
train_set, test_set = featuresets[size:], featuresets[:size] # rebuild using apply_features() function

len(train_set), len(test_set)

(90499, 10055)

In [280]:
# Train and test

classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

In [279]:
classifier.show_most_informative_features()

Most Informative Features
               suffix(1) = '.'                 . : NN     =   6950.8 : 1.0
               suffix(2) = 'he'               AT : NN     =   3296.2 : 1.0
               suffix(2) = 'ho'              WPS : NN     =   2982.4 : 1.0
               suffix(1) = 'r'               JJR : NNS    =   2252.6 : 1.0
               suffix(2) = 'to'               TO : JJ     =   2180.6 : 1.0
               suffix(1) = 'h'               ABX : NNS    =   2013.7 : 1.0
               suffix(2) = 'es'              NNS : IN     =   1676.3 : 1.0
               suffix(3) = 'hat'              CS : NN     =   1576.4 : 1.0
               suffix(1) = "'"                '' : JJ     =   1502.2 : 1.0
               suffix(2) = 'ng'              VBG : VBN    =   1241.0 : 1.0


##### Based on word structure, previous word and previous word tag (Naive Bayes, joint classifier):

In [294]:
# Feature extraction

def pos_features_prev_tag(sentence, i, history): 
    features = {"suffix(1)": sentence[i][-1:],
                 "suffix(2)": sentence[i][-2:],
                 "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

class ConsecutivePosTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features_prev_tag(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features_prev_tag(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [295]:
tagged_sents = brown.tagged_sents(categories='news')

size =  int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]  # rebuild using apply_features() function
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


In [453]:
tagger.classifier.show_most_informative_features(10)

Most Informative Features
               suffix(1) = '.'                 . : NN     =   6881.7 : 1.0
               suffix(2) = 'he'               AT : NN     =   3266.7 : 1.0
                prev-tag = 'TO'               VB : NN     =   3227.5 : 1.0
               suffix(2) = 'ho'              WPS : NN     =   2940.6 : 1.0
                prev-tag = 'MD'               BE : NP     =   2253.7 : 1.0
               suffix(1) = 'r'               JJR : NNS    =   2223.1 : 1.0
               suffix(2) = 'to'               TO : JJ     =   2165.0 : 1.0
               suffix(1) = 'h'               ABX : NNS    =   1954.4 : 1.0
               suffix(2) = 'es'              NNS : IN     =   1648.4 : 1.0
               suffix(3) = 'hat'              CS : NN     =   1528.4 : 1.0


## 2 Further examples of supervised classification:

#### Sentence segmentation (to be continued):

In [308]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for x in sents:
    tokens.extend(x) # can add multiple individual elements to the list
    offset += len(x)
    boundaries.add(offset-1)

#### Indentifying dialogue act types:

In [328]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000] # to get a data structure representing the XML annotation for each post

Define feature extractor:

In [429]:
def dialogue_act_features(post):
    features = {}
    for x in nltk.word_tokenize(post):
        features['contains({})'.format(x.lower())] = True
    return features

In [430]:
dialogue_act_features(posts[0].text)

{'contains(now)': True,
 'contains(im)': True,
 'contains(left)': True,
 'contains(with)': True,
 'contains(this)': True,
 'contains(gay)': True,
 'contains(name)': True}

In [389]:
posts[0].text

'now im left with this gay name'

In [405]:
nltk.word_tokenize(posts[0].text)

['now', 'im', 'left', 'with', 'this', 'gay', 'name']

Create featuresets:

In [431]:
featuresets = [(dialogue_act_features(x.text), x.get('class')) for x in posts]

In [435]:
featuresets[0]

({'contains(now)': True,
  'contains(im)': True,
  'contains(left)': True,
  'contains(with)': True,
  'contains(this)': True,
  'contains(gay)': True,
  'contains(name)': True},
 'Statement')

In [455]:
# Prepare train and test sets

random.shuffle(featuresets)
size =  int(len(featuresets) * 0.1)

#train_set = apply_features(dialogue_act_features, posts[size:])
#test_set = apply_features(dialogue_act_features, posts[:size])
train_set, test_set = featuresets[size:], featuresets[:size] # rebuild using apply_features() function

len(train_set), len(test_set)

(9000, 1000)

In [456]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.669


In [457]:
classifier.show_most_informative_features(10)

Most Informative Features
            contains(hi) = True            Greet : System =    395.0 : 1.0
             contains(>) = True            Other : System =    321.1 : 1.0
           contains(brb) = True              Bye : Statem =    305.9 : 1.0
            contains(no) = True           nAnswe : System =    304.2 : 1.0
          contains(part) = True           System : Statem =    303.6 : 1.0
             contains(0) = True            Other : Statem =    284.8 : 1.0
          contains(nope) = True           nAnswe : Statem =    281.2 : 1.0
           contains(yes) = True           yAnswe : Emotio =    246.0 : 1.0
             contains(<) = True            Other : Greet  =    231.9 : 1.0
           contains(are) = True           whQues : System =    192.8 : 1.0


#### Recognising textual entailment (RTE):

In [459]:
def rte_features(rtepair): # some high frequency function words are filtered out as 'stopwords'
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne')) # named entity
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features

In [461]:
rte_features(rtepair)

{'word_overlap': 0, 'word_hyp_extra': 1, 'ne_overlap': 1, 'ne_hyp_extra': 1}

In [462]:
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
print(extractor.text_words)

{'Russia', 'Davudi', 'association', 'at', 'four', 'fight', 'terrorism.', 'fledgling', 'Iran', 'Shanghai', 'Soviet', 'SCO', 'operation', 'representing', 'former', 'that', 'Parviz', 'Asia', 'central', 'together', 'republics', 'Organisation', 'was', 'binds', 'meeting', 'China', 'Co'}


In [466]:
print(extractor.hyp_words)

{'member', 'SCO.', 'China'}


In [472]:
print(extractor.overlap('word'), extractor.overlap('ne'), extractor.hyp_extra('word')) # didn't pick up SCO due to period (SCO.)

set() {'China'} {'member'}


help(nltk.RTEFeatureExtractor)