# Loglinear Models in NLTK

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import random
import pprint
from nltk.classify.maxent import MaxentClassifier
from nltk.classify.util import names_demo, names_demo_features

# Show the feature dictionary that names_demo_features builds for one sample name.
# print(...) with a single argument produces the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("Features used to classify a name as male or female:")
pp = pprint.PrettyPrinter(indent=4)
test_features = names_demo_features("anoop")
pp.pprint(test_features)

Features used to classify a name as male or female:
{   'alwayson': True,
    'count(a)': 1,
    'count(b)': 0,
    'count(c)': 0,
    'count(d)': 0,
    'count(e)': 0,
    'count(f)': 0,
    'count(g)': 0,
    'count(h)': 0,
    'count(i)': 0,
    'count(j)': 0,
    'count(k)': 0,
    'count(l)': 0,
    'count(m)': 0,
    'count(n)': 1,
    'count(o)': 2,
    'count(p)': 1,
    'count(q)': 0,
    'count(r)': 0,
    'count(s)': 0,
    'count(t)': 0,
    'count(u)': 0,
    'count(v)': 0,
    'count(w)': 0,
    'count(x)': 0,
    'count(y)': 0,
    'count(z)': 0,
    'endswith': 'p',
    'has(a)': True,
    'has(b)': False,
    'has(c)': False,
    'has(d)': False,
    'has(e)': False,
    'has(f)': False,
    'has(g)': False,
    'has(h)': False,
    'has(i)': False,
    'has(j)': False,
    'has(k)': False,
    'has(l)': False,
    'has(m)': False,
    'has(n)': True,
    'has(o)': True,
    'has(p)': True,
    'has(q)': False,
    'has(r)': False,
    'has(s)': False,
    'has(t)': Fa

In [2]:
print("Train loglinear classifier and run on some example input names:")


def mytrain(train_toks, max_iter=20):
    """Train a maximum-entropy (loglinear) classifier.

    train_toks: training data, passed unchanged to MaxentClassifier.train.
    max_iter:   upper bound on training iterations; the default of 20 is the
                value this function previously hard-coded.

    Returns the object produced by MaxentClassifier.train.
    """
    return MaxentClassifier.train(train_toks, max_iter=max_iter)


# mytrain is handed to names_demo as the trainer; the result is kept in
# `classifier` because later cells query it.
classifier = names_demo(mytrain)

Train loglinear classifier and run on some example input names:
Training classifier...
  ==> Training (20 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.374
             2          -0.61426        0.626
             3          -0.59970        0.626
             4          -0.58597        0.627
             5          -0.57305        0.633
             6          -0.56092        0.652
             7          -0.54953        0.673
             8          -0.53885        0.688
             9          -0.52882        0.702
            10          -0.51940        0.717
            11          -0.51055        0.729
            12          -0.50224        0.740
            13          -0.49442        0.747
            14          -0.48706        0.756
            15          -0.48012        0.765
            16          -0.47357        0.770
            17          -0.46739        0.773
       

In [9]:
# NaiveBayesClassifier is imported in this cell because the only other import of
# it sits in a cell further down the notebook; without this line a top-to-bottom
# run fails here with a NameError.
from nltk.classify import NaiveBayesClassifier

print("Train NaiveBayes classifier and run on some example input names:")
naivebayes = names_demo(NaiveBayesClassifier.train)

Train NaiveBayes classifier and run on some example input names:
Training classifier...
Testing classifier...
Accuracy: 0.7580
Avg. log likelihood: -0.7355

Unseen Names      P(Male)  P(Female)
----------------------------------------
  Octavius        *0.9819   0.0181
  Thomasina        0.0182  *0.9818
  Barnett         *0.6642   0.3358
  Angelina         0.0002  *0.9998
  Saunders        *0.9017   0.0983


In [3]:
# Classify a single name with the classifier trained above and show why.
name = 'luke'
# One pre-formatted argument keeps the output identical on Python 2 and Python 3
# (a parenthesised two-item print would display a tuple on Python 2).
print("Run trained classifier on input name: {0}".format(name))
test_features = names_demo_features(name)
output = classifier.prob_classify(test_features)
print("P(male|{0})={1}".format(name, output.prob('male')))
print("P(female|{0})={1}".format(name, output.prob('female')))
# Per-feature breakdown for this input, printed by the classifier itself.
classifier.explain(test_features)

Run trained classifier on input name: luke
P(male|luke)=0.413080109092
P(female|luke)=0.586919890908
  Feature                                           female    male
  ----------------------------------------------------------------
  has(u)==True (1)                                  -0.099
  count(u)==1 (1)                                   -0.096
  endswith==u'e' (1)                                 0.076
  has(a)==False (1)                                 -0.050
  count(a)==0 (1)                                   -0.050
  startswith==u'l' (1)                               0.049
  has(i)==False (1)                                 -0.034
  count(i)==0 (1)                                   -0.034
  has(l)==True (1)                                   0.031
  count(l)==1 (1)                                    0.024
  has(o)==False (1)                                  0.020
  count(o)==0 (1)                                    0.020
  has(e)==True (1)                                   0.01

In [4]:
# List the ten highest-magnitude feature weights (10 is the method's default count).
classifier.show_most_informative_features(n=10)

  -1.403 endswith==u'a' and label is 'male'
  -1.185 endswith==u'k' and label is 'female'
  -0.945 endswith==u'v' and label is 'female'
  -0.882 endswith==u'p' and label is 'female'
  -0.766 count(v)==2 and label is 'male'
  -0.645 endswith==u'm' and label is 'female'
  -0.571 endswith==u'o' and label is 'female'
  -0.562 endswith==u'd' and label is 'female'
  -0.542 endswith==u'r' and label is 'female'
  -0.492 endswith==u'f' and label is 'female'


## Loglinear Model for Prepositional Phrase Attachment

In [5]:
from nltk.corpus import ppattach
# Draw one example at random from the 'training' split of the ppattach corpus.
# `item` is reused further down as the running example for feature extraction.
item = random.choice(ppattach.attachments('training'))
print(item)

PPAttachment(sent=u'9522', verb=u'pushing', noun1=u'prices', prep=u'of', noun2=u'potatoes', attachment=u'N')


In [6]:
from nltk.classify import accuracy

def j(*args):
    """Glue the given string parts together with '::' between neighbours."""
    return '::'.join(args)

def print_feats(feats):
    """Print a feature dictionary as a left-aligned, two-column table.

    feats: mapping from feature type to feature value.

    Writes a header row, a dashed rule, and then one row per entry, in the
    mapping's own iteration order. Returns None.
    """
    width1, width2 = 25, 30
    row = "{:<{col1}} {:<{col2}}"
    print(row.format('Feature Type', 'Feature Value', col1=width1, col2=width2))
    print("-" * (width1 + width2))
    # items() is available on both Python 2 and Python 3 mappings;
    # iteritems() exists only on Python 2.
    for k, v in feats.items():
        print(row.format(k, v, col1=width1, col2=width2))

# verb='join', noun1='board', prep='as', noun2='director'
def ppattach_feature(item):
    """Build the feature dictionary for one PP-attachment example.

    item: object exposing the string attributes verb, noun1, prep and noun2.

    Returns a dict with 16 entries: the four words on their own, plus pair,
    triple and 4-way conjunctions of them joined by j(). 'prep+noun1' and
    'noun1+prep' are both emitted (same two words, different order), as in
    the original feature set.
    """
    verb, noun1, prep, noun2 = item.verb, item.noun1, item.prep, item.noun2
    return {
        # single words
        'prep': prep,
        'verb': verb,
        'noun1': noun1,
        'noun2': noun2,
        # pairs ('prep+noun2' used to be listed twice in this literal; the
        # second entry silently overwrote the first with the same value)
        'prep+noun1': j(prep, noun1),
        'prep+noun2': j(prep, noun2),
        'noun1+noun2': j(noun1, noun2),
        'verb+noun1': j(verb, noun1),
        'verb+noun2': j(verb, noun2),
        'verb+prep': j(verb, prep),
        'noun1+prep': j(noun1, prep),
        # triples
        'verb+noun1+noun2': j(verb, noun1, noun2),
        'verb+prep+noun2': j(verb, prep, noun2),
        'noun1+prep+noun2': j(noun1, prep, noun2),
        'verb+noun1+prep': j(verb, noun1, prep),
        # full 4-tuple
        'verb+noun1+prep+noun2': j(verb, noun1, prep, noun2),
        }

# Display the features extracted from the sample `item` as a two-column table.
print_feats(ppattach_feature(item))

Feature Type              Feature Value                 
-------------------------------------------------------
noun1+noun2               prices::potatoes              
noun1+prep                prices::of                    
verb+noun1+prep           pushing::prices::of           
verb+noun1+prep+noun2     pushing::prices::of::potatoes 
verb+prep+noun2           pushing::of::potatoes         
verb+prep                 pushing::of                   
verb                      pushing                       
noun1                     prices                        
verb+noun1                pushing::prices               
verb+noun2                pushing::potatoes             
noun2                     potatoes                      
prep+noun2                of::potatoes                  
verb+noun1+noun2          pushing::prices::potatoes     
noun1+prep+noun2          prices::of::potatoes          
prep+noun1                of::prices                    
prep                      of    

In [7]:
def _featurize_split(fileid):
    """Return (feature dict, attachment label) pairs for one ppattach split.

    Building the list inside a function keeps the loop variable local. On
    Python 2 a module-level list comprehension leaks its variable, which here
    would overwrite the notebook's sample `item` with the last corpus entry.
    """
    return [(ppattach_feature(example), example.attachment)
            for example in ppattach.attachments(fileid)]


train_set = _featurize_split('training')
dev_set = _featurize_split('devset')
test_set = _featurize_split('test')

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("starting to train classifier ...")
maxent_classifier = MaxentClassifier.train(train_set, algorithm='IIS', max_iter=10, count_cutoff=5)
print("finished training classifier")
devacc = accuracy(maxent_classifier, dev_set)
testacc = accuracy(maxent_classifier, test_set)
print("all:dev:%lf" % (devacc))
print("all:test:%lf" % (testacc))

starting to train classifier ...
  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.478
             2          -0.29111        0.970
             3          -0.20642        0.986
             4          -0.16328        0.992
             5          -0.13608        0.995
             6          -0.11710        0.996
             7          -0.10300        0.997
             8          -0.09208        0.997
             9          -0.08335        0.997
         Final          -0.07621        0.997
finished training classifier
all:dev:0.842040
all:test:0.840168


### Comparison with Naive Bayes

In [8]:
from nltk.classify import NaiveBayesClassifier

# Same data and evaluation as the MaxEnt run, with a Naive Bayes model instead.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("starting to train classifier ...")
nb_classifier = NaiveBayesClassifier.train(train_set)
print("finished training classifier")
devacc = accuracy(nb_classifier, dev_set)
testacc = accuracy(nb_classifier, test_set)
print("all:dev:%lf" % (devacc))
print("all:test:%lf" % (testacc))


starting to train classifier ...
finished training classifier
all:dev:0.837584
all:test:0.837908


### Learning Curves

Train on increasingly large prefixes of the training data and track accuracy on the development set.

In [None]:
# Training-set prefixes grow in steps of this many examples.
chunk_size = 2000
# Learning-curve points filled in by the loop below: training sizes (x) and
# the matching dev-set accuracies (y).
x_axis, y_axis = [], []

def training_run(data, iters):
    """Train a MaxEnt classifier on `data` and return its dev-set accuracy.

    data:  training examples, passed to MaxentClassifier.train.
    iters: value passed as max_iter to the IIS trainer.

    Prints progress messages and the accuracy alongside len(data).
    Returns the value of accuracy(classifier, dev_set); dev_set is read from
    the enclosing module.
    """
    print("starting to train classifier ...")
    # Train on the argument. The earlier version ignored `data` and read the
    # global `train_slice`, so a call with the full training set silently
    # retrained on whatever slice the loop had produced last.
    # (To compare against Naive Bayes, use NaiveBayesClassifier.train(data).)
    classifier = MaxentClassifier.train(data, algorithm='IIS', max_iter=iters, count_cutoff=5)
    print("finished training classifier")
    acc = accuracy(classifier, dev_set)
    # Report the size of the data actually used rather than a global loop counter.
    print("accuracy on dev set for training size %d = %f" % (len(data), acc))
    return acc
    
# Walk every prefix length of train_set and train only at positive multiples
# of chunk_size, recording one (size, dev accuracy) point per run.
# NOTE(review): `slice` shadows the builtin of the same name; rename it once
# nothing else in the notebook reads it as a global.
for slice in range(len(train_set)):
    if slice == 0 or slice % chunk_size != 0:
        continue
    train_slice = train_set[:slice]
    acc = training_run(train_slice, 5)
    x_axis.append(slice)
    y_axis.append(acc)

# Last point: invoked with the full train_set and 10 iterations.
acc = training_run(train_set, 10)
x_axis.append(len(train_set))
y_axis.append(acc)

# Plot dev accuracy against training size.
plt.xlim(1, len(train_set))
plt.plot(x_axis, y_axis, label='Learning Curve')

plt.xlabel("Training Size")
plt.ylabel("Accuracy")
plt.legend(loc='lower right')
plt.show()

starting to train classifier ...
  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.486
             2          -0.28175        0.988
             3          -0.20531        0.997
             4          -0.16289        0.998
         Final          -0.13537        0.999
finished training classifier
accuracy on dev set for training size 2000 = 0.782867
starting to train classifier ...
  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.493
             2          -0.27395        0.982
             3          -0.19784        0.995
             4          -0.15679        0.998
         Final          -0.13035        0.998
finished training classifier
accuracy on dev set for training size 4000 = 0.796979
starting to train classifier ...
  ==> Training (5 iterations)

### Increasing the number of iterations

In [None]:
# Retrain on the full training set with a much larger iteration budget (100)
# and report dev/test accuracy.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("starting to train classifier ...")
maxent_classifier = MaxentClassifier.train(train_set, algorithm='IIS', max_iter=100, count_cutoff=5)
print("finished training classifier")
devacc = accuracy(maxent_classifier, dev_set)
testacc = accuracy(maxent_classifier, test_set)
print("all:dev:%lf" % (devacc))
print("all:test:%lf" % (testacc))