# Natural Language Processing

## Exercise Sheet 6

In [1]:
#imports for all exercises
import nltk
import random
from nltk.corpus import names
import random
from nltk.corpus import brown
from nltk.corpus import movie_reviews
from pickle import load

### Exercise 1

Write a name gender classifier using the Names Corpus, the `apply_features` function, shuffling, and a test set of 500 instances. Use the following features:

a) first letter;  
b) last letter;  
c) last two letters;  
d) length;  
e) for each letter one feature, which is true if the name contains the letter.

Use the `NaiveBayesClassifier`, calculate the accuracy, and display the 10 most informative features.


In [2]:
def apply_features(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    features["last_two_letters"] = name[-2].lower()
    features["length"] = len(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [3]:
 labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)
featuresets = [(apply_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [5]:
classifier.show_most_informative_features(10)

Most Informative Features
             last_letter = 'a'            female : male   =     38.3 : 1.0
             last_letter = 'k'              male : female =     31.2 : 1.0
             last_letter = 'f'              male : female =     27.7 : 1.0
             last_letter = 'p'              male : female =     11.9 : 1.0
             last_letter = 'v'              male : female =     11.2 : 1.0
             last_letter = 'd'              male : female =     10.5 : 1.0
             last_letter = 'm'              male : female =      9.0 : 1.0
             last_letter = 'o'              male : female =      8.7 : 1.0
        last_two_letters = 'o'              male : female =      7.6 : 1.0
             last_letter = 'r'              male : female =      7.4 : 1.0


In [6]:
classifier.classify(apply_features("Behrad"))

'male'

### Exercise 2

The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. Using this dataset, build a `NaiveBayesClassifier` that predicts the correct sense tag for a given instance for the word "hard":

In [7]:
def apply_features(inst):
    features = {}
    p = inst.position
    features["previous"] = inst.context[p-1]
    features["next"] = inst.context[p+1]
    return features

In [8]:
from nltk.corpus import senseval
instances = senseval.instances('hard.pos')
labeled_instances = [(inst, inst.senses) for inst in instances]
size = int(len(labeled_instances) * 0.1)
accuracy_t = 0
for i in range(10):
    random.shuffle(labeled_instances)
    featuresets = [(apply_features(w), sense) for (w, sense) in labeled_instances]
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    accuracy_t += accuracy
    print(accuracy)

0.9168591224018475
0.9122401847575058
0.9168591224018475
0.9168591224018475
0.9053117782909931
0.8868360277136259
0.9168591224018475
0.9214780600461894
0.9053117782909931
0.8891454965357968


In [9]:
accuracy_t/10

0.9087759815242494

Use the preceding and following word as features. They can be calculated by retrieving the position of the word "hard" as `p=inst.position` and then accessing `inst.context[p-1]` and `inst.context[p+1]`.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.

### Exercise 3

The synonyms "strong" and "powerful" pattern differently. Use the tagged Brown corpus with the universal tagset to first list the nouns which follow "strong" vs. "powerful". Write for this a function `next_noun(word, tagged_text)` which returns the list of nouns that follow `word` in the `tagged_text`. Build then a `NaiveBayesClassifier` that predicts when each word should be used by using the function `apply_features` and the following noun as single feature.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.


In [10]:
def next_noun(word, tagged_text):
    return sorted(set([tagged_text[i+1][0].lower() for i, (w, t) 
                       in enumerate(tagged_text) if w.lower()==word 
                       and tagged_text[i+1][1] == "NOUN"]))

def apply_features(word):
    features = {}
    features['next_noun'] = word
    return features

In [11]:
tagged_text = brown.tagged_words(tagset='universal')
strong = next_noun('strong', tagged_text)
powerful = next_noun('powerful', tagged_text)
labeled_instances = ([(w, 'strong') for w in strong] +
[(w, 'powerful') for w in powerful])
size = int(len(labeled_instances) * 0.1)
accuracy_t = 0
for i in range(10):
    random.shuffle(labeled_instances)
    featuresets = [(apply_features(w), w[1]) for w in labeled_instances]
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    accuracy_t += accuracy
    print(accuracy)
print('total: ', accuracy_t/10)

0.6363636363636364
0.6363636363636364
0.6363636363636364
0.6363636363636364
0.7272727272727273
0.5454545454545454
0.7272727272727273
0.6363636363636364
0.5454545454545454
0.7272727272727273
total:  0.6454545454545454


### Exercise 4

Based on the Movie Reviews document classifier discussed in this chapter, build a new `NaiveBayesClassifier`. Tag first the Movie Reviews Corpus using the combined tagger from the previous chapter stored in `t2.pkl`. Filter the tagged words to contain only words for the tags `['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']` as well as only alphabetic tokens with at least three characters. Convert the words to lowercase. Use the most common 5000 words as `word_features` in the function `document_features`. 

Run 10 iterations by reshuffling the instances and printing the accuracy and 5 most informative features for each iteration. Finally, print the average accuracy.
    

In [12]:
tags = ['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']
input = open('t2.pkl', 'rb')
t2 = load(input)
input.close()
def apply_tagger(tagger, corpus):
    return tagger.tag(corpus)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
words = movie_reviews.words()
words_fd = nltk.FreqDist(w.lower() for w in movie_reviews.words())
tagged = apply_tagger(t2, words)
tagged_f = [(w.lower(),t) for (w,t) in tagged if t in tags and len(w)>=3 and w.isalpha()]
words_fd = nltk.FreqDist(w.lower() for (w,t) in tagged_f)
word_features = list(words_fd)[:5000]
def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features
accuracy_t = 0
for i in range (10):
    random.shuffle(documents)
    featuresets = [(document_features(d), c) for (d,c) in documents]
    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(classifier, test_set)
    accuracy_t += accuracy
    classifier.show_most_informative_features(5)
    print('accuracy {}'.format(i), accuracy)
    classifier.show_most_informative_features(5)


Most Informative Features
   contains(magnificent) = True              pos : neg    =     11.4 : 1.0
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
     contains(insulting) = True              neg : pos    =     11.0 : 1.0
     contains(ludicrous) = True              neg : pos    =     10.7 : 1.0
        contains(hudson) = True              neg : pos    =     10.3 : 1.0
accuracy 0 0.84
Most Informative Features
   contains(magnificent) = True              pos : neg    =     11.4 : 1.0
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
     contains(insulting) = True              neg : pos    =     11.0 : 1.0
     contains(ludicrous) = True              neg : pos    =     10.7 : 1.0
        contains(hudson) = True              neg : pos    =     10.3 : 1.0
Most Informative Features
      contains(depicted) = True              pos : neg    =     12.9 : 1.0
   contains(outstanding) = True              pos : neg    =     11.3 : 1.0
      

In [15]:
print('total: ', accuracy_t/10)

total:  0.8140000000000001


### Exercise 5

The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the training corpus is encoded as a `PPAttachment` object:

    from nltk.corpus import ppattach
    ppattach.attachments('training')
    
        [PPAttachment(sent='0', verb='join', noun1='board',
            prep='as', noun2='director', attachment='V'),
        PPAttachment(sent='1', verb='is', noun1='chairman',
            prep='of', noun2='N.V.', attachment='N'),
        ...]

    inst = ppattach.attachments('training')[1]
    (inst.noun1, inst.prep, inst.noun2)
    
        ('chairman', 'of', 'N.V.')

In the same way, `ppattach.attachments('test')` accesses the test instances. Select only the instances where `inst.attachment` is `'N'`:

In [3]:
from nltk.corpus import ppattach
nattach = [inst for inst in ppattach.attachments('training')
               if inst.attachment == 'N']
nattach_test = [inst for inst in ppattach.attachments('test')
               if inst.attachment == 'N']

Using this sub-corpus, build a `NaiveBayesClassifier` that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers", the classifier should predict the preposition "of". 

Write for this purpose a function `prepare_featuresets(subcorpus)`, where `subcorpus` is either the string "training" or "test" to return the training set or the test set. 

Print the achieved accuracy as well as the result of `classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })`.

In [4]:
def prepare_featuresets(subcorpus):
    features = {}
    tr_set, te_set = [], []
    if subcorpus == 'training':
        for inst in nattach:
            features['noun1'] = inst.noun1
            features['noun2'] = inst.noun2
            label = inst.prep
            tr_set.append((features.copy(), label))
        return tr_set
    elif subcorpus == 'test':
        for inst in nattach_test:
            features['noun1'] = inst.noun1
            features['noun2'] = inst.noun2
            label = inst.prep
            te_set.append((features.copy(), label))
        return te_set
        

In [5]:
train_set = prepare_featuresets('training')
test_set = prepare_featuresets('test')

In [6]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
accuracy = nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(5)
accuracy

Most Informative Features
                   noun1 = 'stake'            in : of     =     75.0 : 1.0
                   noun2 = 'million'          of : with   =     57.5 : 1.0
                   noun1 = '%'                of : for    =     53.5 : 1.0
                   noun1 = 'questions'     about : of     =     46.0 : 1.0
                   noun1 = 'interest'         in : of     =     34.3 : 1.0


0.5690032858707558

In [8]:
classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })

'of'