# Natural Language Processing

## Exercise Sheet 6

In [65]:
#imports for all exercises
import nltk
import random
from nltk.classify import apply_features
from nltk.corpus import names,brown,ppattach,movie_reviews
import functools
import inflect
from string import ascii_lowercase
import statistics
from pickle import load

### Exercise 1

Write a name gender classifier using the Names Corpus, the `apply_features` function, shuffling, and a test set of 500 instances. Use the following features:

a) first letter;  
b) last letter;  
c) last two letters;  
d) length;  
e) for each letter one feature, which is true if the name contains the letter.

Use the `NaiveBayesClassifier`, calculate the accuracy, and display the 10 most informative features.


In [68]:
#Functions 

def gender_features(word, position):
    """Return a one-entry feature dict holding a prefix or suffix of ``word``.

    A negative ``position`` selects the last ``abs(position)`` characters and
    uses the key ``'last_<n>_letters'``; any other value selects the first
    ``position`` characters and uses the key ``'first_<n>_letters'``. ``<n>``
    is ``abs(position)`` spelled out in words by ``inflect``.
    """
    count_in_words = inflect.engine().number_to_words(abs(position))
    if position >= 0:
        return {'first_' + count_in_words + '_letters': word[:position]}
    return {'last_' + count_in_words + '_letters': word[position:]}

def length_features(word, position):
    """Return the number of characters in ``word`` as the feature ``'length'``.

    ``position`` is not used; it is accepted so the signature matches the
    other feature extractors.
    """
    name_length = len(word)
    return dict(length=name_length)

def contains_letter(word, position):
    """Return ``{'contains <letter>': bool}`` for a single letter.

    ``position`` holds the letter to look for (the parameter name is shared
    with the other feature extractors). ``word`` is lower-cased before the
    check; ``position`` is used exactly as given.
    """
    is_present = position in word.lower()
    return {'contains ' + position: is_present}

def specific_letter(position, func):
    """Train and evaluate a Naive Bayes gender classifier on one feature.

    The male and female entries of the Names Corpus are labeled, shuffled and
    split into a test set (the first 500 names) and a training set (the
    rest). Features are produced by calling ``func(name, position=position)``.

    Prints the 10 most informative features, followed by the accuracy on the
    test set.

    Args:
        position: Value forwarded to ``func`` as its ``position`` keyword
            (a character count for ``gender_features``, a letter for
            ``contains_letter``).
        func: Feature extractor taking ``(word, position)`` and returning a
            feature dict.
    """
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
        [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)

    # Bind the position once and reuse the same extractor for both splits.
    extract_features = functools.partial(func, position=position)
    train_set = apply_features(extract_features, labeled_names[500:])
    test_set = apply_features(extract_features, labeled_names[:500])
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # show_most_informative_features() prints its table itself and returns
    # None, so wrapping it in print() would emit a stray "None" line.
    classifier.show_most_informative_features(10)
    print(nltk.classify.accuracy(classifier, test_set))


In [44]:
#a) first letter: a non-negative position makes gender_features use the first `position` characters
specific_letter(1,gender_features)


[({'first_one_letters': 'F'}, 'female'), ({'first_one_letters': 'F'}, 'female'), ...]
Most Informative Features
       first_one_letters = 'W'              male : female =      4.7 : 1.0
       first_one_letters = 'Q'              male : female =      2.7 : 1.0
       first_one_letters = 'U'              male : female =      2.6 : 1.0
       first_one_letters = 'K'            female : male   =      2.3 : 1.0
       first_one_letters = 'X'              male : female =      2.3 : 1.0
       first_one_letters = 'H'              male : female =      2.3 : 1.0
       first_one_letters = 'Z'              male : female =      1.7 : 1.0
       first_one_letters = 'L'            female : male   =      1.7 : 1.0
       first_one_letters = 'C'            female : male   =      1.7 : 1.0
       first_one_letters = 'T'              male : female =      1.6 : 1.0
None
0.662


In [45]:
#b) last letter: a negative position makes gender_features use the last abs(position) characters
specific_letter(-1,gender_features)


[({'last_one_letters': 'r'}, 'male'), ({'last_one_letters': 'e'}, 'male'), ...]
Most Informative Features
        last_one_letters = 'a'            female : male   =     35.2 : 1.0
        last_one_letters = 'k'              male : female =     30.9 : 1.0
        last_one_letters = 'v'              male : female =     18.7 : 1.0
        last_one_letters = 'f'              male : female =     16.0 : 1.0
        last_one_letters = 'p'              male : female =     11.9 : 1.0
        last_one_letters = 'd'              male : female =     10.1 : 1.0
        last_one_letters = 'm'              male : female =      8.3 : 1.0
        last_one_letters = 'w'              male : female =      8.0 : 1.0
        last_one_letters = 'o'              male : female =      8.0 : 1.0
        last_one_letters = 'r'              male : female =      6.8 : 1.0
None
0.742


In [47]:
#c) last two letters: position -2 selects the final two characters of each name
specific_letter(-2,gender_features)

[({'last_two_letters': 'ey'}, 'female'), ({'last_two_letters': 'rl'}, 'female'), ...]
Most Informative Features
        last_two_letters = 'na'           female : male   =    101.3 : 1.0
        last_two_letters = 'la'           female : male   =     76.4 : 1.0
        last_two_letters = 'ia'           female : male   =     41.1 : 1.0
        last_two_letters = 'sa'           female : male   =     34.9 : 1.0
        last_two_letters = 'rd'             male : female =     32.4 : 1.0
        last_two_letters = 'ta'           female : male   =     32.2 : 1.0
        last_two_letters = 'us'             male : female =     26.7 : 1.0
        last_two_letters = 'ra'           female : male   =     26.4 : 1.0
        last_two_letters = 'do'             male : female =     24.8 : 1.0
        last_two_letters = 'rt'             male : female =     22.9 : 1.0
None
0.796


In [60]:
#d) length: length_features ignores its position argument, so the 0 is only a placeholder
specific_letter(0,length_features)

Most Informative Features
                  length = 2                male : female =      2.1 : 1.0
                  length = 3                male : female =      2.0 : 1.0
                  length = 15               male : female =      1.7 : 1.0
                  length = 10             female : male   =      1.5 : 1.0
                  length = 9              female : male   =      1.3 : 1.0
                  length = 4                male : female =      1.2 : 1.0
                  length = 11             female : male   =      1.1 : 1.0
                  length = 7              female : male   =      1.1 : 1.0
                  length = 12             female : male   =      1.1 : 1.0
                  length = 5              female : male   =      1.1 : 1.0
None
0.654


In [69]:
#e) trains and evaluates one separate classifier per lowercase letter, each using
# only the single feature "contains <letter>" (every run reshuffles the names)
# NOTE(review): if the exercise intends one classifier that sees all 26 letter
# features together, this loop does not do that - confirm the intended reading.
for char in ascii_lowercase:
    specific_letter(char,contains_letter)

Most Informative Features
              contains a = False            male : female =      1.6 : 1.0
              contains a = True           female : male   =      1.4 : 1.0
None
0.61
Most Informative Features
              contains b = True             male : female =      1.2 : 1.0
              contains b = False          female : male   =      1.0 : 1.0
None
0.63
Most Informative Features
              contains c = True             male : female =      1.0 : 1.0
              contains c = False          female : male   =      1.0 : 1.0
None
0.574
Most Informative Features
              contains d = True             male : female =      1.2 : 1.0
              contains d = False          female : male   =      1.1 : 1.0
None
0.636
Most Informative Features
              contains e = False            male : female =      1.2 : 1.0
              contains e = True           female : male   =      1.1 : 1.0
None
0.6
Most Informative Features
              contains f = True            

### Exercise 2

The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. Using this dataset, build a `NaiveBayesClassifier` that predicts the correct sense tag for a given instance for the word "hard":

In [6]:
# Load the Senseval 2 instances and display them to inspect their structure
# (word, position, tagged context, senses).
# NOTE(review): the bare `instances` expression renders the list's repr as the
# cell output; showing only a few entries (e.g. a slice) would keep it short.
from nltk.corpus import senseval
instances = senseval.instances()
instances

[SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ('may', 'MD'), ('lose', 'VB'), ('all', 'DT'), ('popular', 'JJ'), ('support', 'NN'), (',', ','), ('but', 'CC'), ('someone', 'NN'), ('has', 'VBZ'), ('to', 'TO'), ('kill', 'VB'), ('him', 'PRP'), ('to', 'TO'), ('defeat', 'VB'), ('him', 'PRP'), ('and', 'CC'), ('that', 'DT'), ("'s", 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('do', 'VB'), ('.', '.'), ("''", "''")], senses=('HARD1',)), SensevalInstance(word='hard-a', position=10, context=[('clever', 'NNP'), ('white', 'NNP'), ('house', 'NNP'), ('``', '``'), ('spin', 'VB'), ('doctors', 'NNS'), ("''", "''"), ('are', 'VBP'), ('having', 'VBG'), ('a', 'DT'), ('hard', 'JJ'), ('time', 'NN'), ('helping', 'VBG'), ('president', 'NNP'), ('bush', 'NNP'), ('explain', 'VB'), ('away', 'RB'), ('the', 'DT'), ('economic', 'JJ'), ('bashing', 'NN'), ('that', 'IN'), ('low-and', 'JJ'), ('middle-income', 'JJ'), ('workers', 'NNS'), ('are', 'VBP'), ('taking', 'VBG'), ('these', 'DT'), ('days

In [None]:
# Scaffold for the "hard" word-sense classifier: label every instance with its
# senses, shuffle, hold out 10% for testing and use the rest for training.
# NOTE(review): `features` is not defined anywhere in the visible notebook; it
# must be defined (instance -> feature dict) before this cell can run.
from nltk.corpus import senseval
instances = senseval.instances('hard.pos')
# Each label is the instance's `senses` value (a tuple such as ('HARD1',)).
labeled_instances = [(inst, inst.senses) for inst in instances] 
size = int(len(labeled_instances) * 0.1)  # test-set size: 10% of all instances
random.shuffle(labeled_instances)
train_set = apply_features(features, labeled_instances[size:])
test_set = apply_features(features, labeled_instances[:size])

Use the preceding and following word as features. They can be calculated by retrieving the position of the word "hard" as `p=inst.position` and then accessing `inst.context[p-1]` and `inst.context[p+1]`.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.

### Exercise 3

The synonyms "strong" and "powerful" pattern differently. Use the tagged Brown corpus with the universal tagset to first list the nouns which follow "strong" vs. "powerful". To do this, write a function `next_noun(word, tagged_text)` which returns the list of nouns that follow `word` in the `tagged_text`. Then build a `NaiveBayesClassifier` that predicts when each word should be used, using the function `apply_features` and the following noun as the single feature.

Run 10 iterations by reshuffling the instances and printing the individual accuracies. Finally, print the average accuracy.


In [84]:
def next_noun(word, tagged_text):
    """Collect the nouns that directly follow ``word`` in ``tagged_text``.

    Args:
        word: The word to look for. The comparison is case-insensitive on
            both sides, so ``'Strong'`` and ``'strong'`` find the same tokens.
        tagged_text: Iterable of ``(token, tag)`` pairs using the universal
            tagset. It is consumed in a single pass, so it need not support
            ``len()`` or indexing.

    Returns:
        A list of ``(noun, word)`` tuples, one for every occurrence of
        ``word`` whose next token is tagged ``'NOUN'``. ``word`` is returned
        exactly as passed in so it can serve as the class label.
    """
    target = word.lower()
    found = []
    previous_is_target = False
    for token, tag in tagged_text:
        # The current token counts only if the token before it was `word`.
        if previous_is_target and tag == 'NOUN':
            found.append((token, word))
        previous_is_target = token.lower() == target
    return found

# Collect the (noun, adjective) pairs for both adjectives from the Brown
# corpus tagged with the universal tagset.
noun_list_strong, noun_list_powerful = (
    next_noun(adjective, brown.tagged_words(tagset='universal'))
    for adjective in ('strong', 'powerful')
)



In [94]:
def map_function(word):
    """Wrap ``word`` in a feature dict under the single key ``'word'``."""
    features = dict(word=word)
    return features

In [99]:
# Run 10 rounds: reshuffle the labeled (noun, adjective) pairs, hold out the
# first 10% as the test set and train the Naive Bayes classifier on the
# remaining 90%. Print each round's accuracy and, at the end, the mean.
list_combined = noun_list_strong + noun_list_powerful
test_size = int(len(list_combined) * 0.1)
overall_acc_list = []
for i in range(10):
    random.shuffle(list_combined)
    # Disjoint split that uses all of the data: everything after the test
    # slice is training data (previously only the last 10% was trained on).
    train_set = apply_features(map_function, list_combined[test_size:])
    test_set = apply_features(map_function, list_combined[:test_size])
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    overall_acc_list.append(acc)
    print("Accuracy: " + str(acc))

print("Overall mean: " + str(statistics.mean(overall_acc_list)))

Accuracy: 0.7857142857142857
Accuracy: 0.8571428571428571
Accuracy: 0.6428571428571429
Accuracy: 0.7142857142857143
Accuracy: 0.7857142857142857
Accuracy: 0.7857142857142857
Accuracy: 0.5714285714285714
Accuracy: 0.6428571428571429
Accuracy: 0.7857142857142857
Accuracy: 0.7857142857142857
Overall mean: 0.7357142857142858


### Exercise 4

Based on the Movie Reviews document classifier discussed in this chapter, build a new `NaiveBayesClassifier`. First tag the Movie Reviews Corpus using the combined tagger from the previous chapter stored in `t2.pkl`. Filter the tagged words to keep only words with the tags `['JJ', 'JJR', 'JJS', 'RB', 'NN', 'NNS', 'VB', 'VBN', 'VBG', 'VBZ', 'VBD', 'QL']`, and of those only alphabetic tokens with at least three characters. Convert the words to lowercase. Use the 5000 most common words as `word_features` in the function `document_features`.

Run 10 iterations by reshuffling the instances and printing the accuracy and 5 most informative features for each iteration. Finally, print the average accuracy.
    

### Exercise 5

The PP Attachment Corpus is a corpus describing prepositional phrase attachment decisions. Each instance in the training corpus is encoded as a `PPAttachment` object:

    from nltk.corpus import ppattach
    ppattach.attachments('training')
    
        [PPAttachment(sent='0', verb='join', noun1='board',
            prep='as', noun2='director', attachment='V'),
        PPAttachment(sent='1', verb='is', noun1='chairman',
            prep='of', noun2='N.V.', attachment='N'),
        ...]

    inst = ppattach.attachments('training')[1]
    (inst.noun1, inst.prep, inst.noun2)
    
        ('chairman', 'of', 'N.V.')

In the same way, `ppattach.attachments('test')` accesses the test instances. Select only the instances where `inst.attachment` is `'N'`:

In [1]:

# Noun-attached ('N') instances of the PP Attachment training data.
nattach = [
    attachment
    for attachment in ppattach.attachments('training')
    if attachment.attachment == 'N'
]

Using this sub-corpus, build a `NaiveBayesClassifier` that attempts to predict which preposition is used to connect a given pair of nouns. For example, given the pair of nouns "team" and "researchers", the classifier should predict the preposition "of". 

Write for this purpose a function `prepare_featuresets(subcorpus)`, where `subcorpus` is either the string "training" or "test" to return the training set or the test set. 

Print the achieved accuracy as well as the result of `classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })`.

In [58]:
def extract(attachment):
    """Turn one PP attachment into a ``(features, label)`` pair.

    The features are the two nouns (keys ``'noun1'`` and ``'noun2'``); the
    label is the preposition connecting them.
    """
    noun_pair = {'noun1': attachment.noun1, 'noun2': attachment.noun2}
    preposition = attachment.prep
    return noun_pair, preposition

def prepare_featuresets(subcorpus):
    """Build the feature sets for one part of the PP Attachment Corpus.

    ``subcorpus`` is passed to ``ppattach.attachments`` (the notebook uses
    ``'training'`` and ``'test'``). Only instances whose ``attachment`` is
    ``'N'`` are kept, and each is converted with ``extract`` through
    ``apply_features``.
    """
    noun_attached = [
        candidate
        for candidate in ppattach.attachments(subcorpus)
        if candidate.attachment == 'N'
    ]
    return apply_features(extract, noun_attached)

# Train on the noun-attached training instances; the test instances are only
# used for evaluation.
train_set = prepare_featuresets('training')
test_set = prepare_featuresets('test')
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Print the accuracy on the test set, then show the predicted preposition for
# one noun pair (displayed because it is the cell's last expression).
print(nltk.classify.accuracy(classifier, test_set))
classifier.classify({ 'noun1': 'team', 'noun2': 'researchers' })


0.5690032858707558


'of'