# Natural Language Processing

## Exercise Sheet 5

In [1]:
#imports for all exercises
import nltk
from nltk.corpus import brown

### Exercise 1

Produce a sorted list of tags used in the Brown corpus, removing duplicates. Do the same for the universal part-of-speech tagset.

In [4]:
# the Brown corpus tagged words
brown_tagged_words = nltk.corpus.brown.tagged_words()

# the universal part-of-speech tagset
universal_tagset_tagged_words = nltk.corpus.brown.tagged_words(tagset='universal')

# sorted list of the tags and removing duplicates
brown_tags = sorted(set(tag for word, tag in brown_tagged_words))
universal_tags = sorted(set(tag for word, tag in universal_tagset_tagged_words))

print("Tags in the Brown Corpus:")
print(brown_tags)

print("\nTags in the Universal Tagset:")
print(universal_tags)

Tags in the Brown Corpus:
["'", "''", '(', '(-HL', ')', ')-HL', '*', '*-HL', '*-NC', '*-TL', ',', ',-HL', ',-NC', ',-TL', '--', '---HL', '.', '.-HL', '.-NC', '.-TL', ':', ':-HL', ':-TL', 'ABL', 'ABN', 'ABN-HL', 'ABN-NC', 'ABN-TL', 'ABX', 'AP', 'AP$', 'AP+AP-NC', 'AP-HL', 'AP-NC', 'AP-TL', 'AT', 'AT-HL', 'AT-NC', 'AT-TL', 'AT-TL-HL', 'BE', 'BE-HL', 'BE-TL', 'BED', 'BED*', 'BED-NC', 'BEDZ', 'BEDZ*', 'BEDZ-HL', 'BEDZ-NC', 'BEG', 'BEM', 'BEM*', 'BEM-NC', 'BEN', 'BEN-TL', 'BER', 'BER*', 'BER*-NC', 'BER-HL', 'BER-NC', 'BER-TL', 'BEZ', 'BEZ*', 'BEZ-HL', 'BEZ-NC', 'BEZ-TL', 'CC', 'CC-HL', 'CC-NC', 'CC-TL', 'CC-TL-HL', 'CD', 'CD$', 'CD-HL', 'CD-NC', 'CD-TL', 'CD-TL-HL', 'CS', 'CS-HL', 'CS-NC', 'CS-TL', 'DO', 'DO*', 'DO*-HL', 'DO+PPSS', 'DO-HL', 'DO-NC', 'DO-TL', 'DOD', 'DOD*', 'DOD*-TL', 'DOD-NC', 'DOZ', 'DOZ*', 'DOZ*-TL', 'DOZ-HL', 'DOZ-TL', 'DT', 'DT$', 'DT+BEZ', 'DT+BEZ-NC', 'DT+MD', 'DT-HL', 'DT-NC', 'DT-TL', 'DTI', 'DTI-HL', 'DTI-TL', 'DTS', 'DTS+BEZ', 'DTS-HL', 'DTX', 'EX', 'EX+BEZ', 'EX+

### Exercise 2

Write a program to process the Brown Corpus using the universal part-of-speech tagset to find out which nouns are more common in their plural form than in their singular form. Only consider regular plurals formed with the "-s" suffix. Print an alphabetically sorted list of the nouns together with the frequencies for the singular and plural forms, one per line. 


In [11]:
tagged_words = nltk.corpus.brown.tagged_words(tagset='universal')

# dictionaries to store frequencies of singular and plural nouns
singular_nouns = {}
plural_nouns = {}

for word, tag in tagged_words:
    if tag == 'NOUN':
        # regular plurals formed with the "-s" suffix
        if word.endswith('s'):
            plural_nouns[word] = plural_nouns.get(word, 0) + 1
        else:
            singular_nouns[word] = singular_nouns.get(word, 0) + 1

# nouns that are more common in their plural form
more_common_plural_nouns = {}
for noun in plural_nouns:
    plural_count = plural_nouns[noun]
    singular_count = singular_nouns.get(noun, 0)
    if plural_count > singular_count:
        more_common_plural_nouns[noun] = (singular_count, plural_count)

sorted_results = sorted(more_common_plural_nouns.items())

for noun, (singular_count, plural_count) in sorted_results:
    print(f"{noun}: Singular={singular_count}, Plural={plural_count}")


'20's: Singular=0, Plural=1
'20s: Singular=0, Plural=1
'30s: Singular=0, Plural=2
'40's: Singular=0, Plural=1
'50's: Singular=0, Plural=1
'60s: Singular=0, Plural=1
'80's: Singular=0, Plural=1
'90s: Singular=0, Plural=1
'thirties: Singular=0, Plural=1
-16-degrees: Singular=0, Plural=1
-20-degrees: Singular=0, Plural=1
-78-degrees: Singular=0, Plural=3
10-degrees: Singular=0, Plural=1
105-degrees: Singular=0, Plural=1
110-degrees: Singular=0, Plural=1
120-degrees: Singular=0, Plural=2
1700's: Singular=0, Plural=1
1770's: Singular=0, Plural=1
180-degrees: Singular=0, Plural=2
1800's: Singular=0, Plural=1
1840's: Singular=0, Plural=2
1850's: Singular=0, Plural=2
1860's: Singular=0, Plural=1
1870's: Singular=0, Plural=1
1880s: Singular=0, Plural=2
1890's: Singular=0, Plural=3
1890s: Singular=0, Plural=1
1900's: Singular=0, Plural=2
1920's: Singular=0, Plural=8
1920s: Singular=0, Plural=4
1930's: Singular=0, Plural=7
1930s: Singular=0, Plural=2
1940's: Singular=0, Plural=1
1940s: Singular=0

### Exercise 3

Find out which word has the greatest number of distinct tags in the Brown corpus using the original tagset. Without using the `most_common` function, print a list of the tags together with the frequencies for the word, sorted by frequency from highest to lowest, one per line.



In [13]:
# tagged words from the Brown Corpus
tagged_words = nltk.corpus.brown.tagged_words(tagset='brown')

# a dictionary to store tag frequencies for each word
tag_frequencies = {}

for word, tag in tagged_words:
    if word not in tag_frequencies:
        tag_frequencies[word] = {}
    tag_frequencies[word][tag] = tag_frequencies[word].get(tag, 0) + 1

# the word with the greatest number of distinct tags
word_with_max_tags = max(tag_frequencies, key=lambda word: len(tag_frequencies[word]))

sorted_tags = sorted(tag_frequencies[word_with_max_tags].items(), key=lambda item: item[1], reverse=True)
for tag, frequency in sorted_tags:
    print(f"{tag}: {frequency}")

CS: 6419
DT: 1975
WPS: 1638
WPO: 135
QL: 54
DT-NC: 6
WPS-NC: 3
CS-NC: 2
WPS-HL: 2
CS-HL: 1
NIL: 1
WPO-NC: 1


### Exercise 4

Tabulate the frequencies of the universal tags that precede nouns in the Brown Corpus. 

In [14]:
from nltk import FreqDist

tagged_words = brown.tagged_words(tagset='universal')

# a list to store the universal tags that precede nouns
tags_before_nouns = []

# iterate through the tagged words to find tags that precede nouns
for i in range(1, len(tagged_words)):
    current_word, current_tag = tagged_words[i]
    previous_word, previous_tag = tagged_words[i - 1]
    
    if current_tag == 'NOUN':
        tags_before_nouns.append(previous_tag)

# calculate the frequency distribution of tags before nouns
tag_freq = FreqDist(tags_before_nouns)
print(tag_freq.tabulate())


  DET   ADJ  NOUN   ADP     .  VERB  CONJ   NUM   ADV   PRT  PRON     X 
85845 54653 41309 37418 20084 17851  9294  5668  1851  1068   440    77 
None


### Exercise 5

Write a function `ambiguous(tagged_text)` that returns the number of ambiguous word types as well as the number of all word types in a tagged text. A word type is ambiguous if it is tagged with at least two different tags. Use the function to print both values as well as the percentage of ambiguous word types for the Brown Corpus both for the original and the universal tagset.

In [15]:
def ambiguous(tagged_text, tagset='original'):
    # a dictionary to store word types and their associated tags
    word_type_tags = {}
    
    # collect word types and their tags
    for word, tag in tagged_text:
        if word not in word_type_tags:
            word_type_tags[word] = set()
        word_type_tags[word].add(tag)
    
    # count the number of ambiguous word types and all word types
    ambiguous_count = 0
    all_count = 0
    for word, tags in word_type_tags.items():
        all_count += 1
        if len(tags) >= 2:
            ambiguous_count += 1
    
    return ambiguous_count, all_count

# tagged words from the Brown Corpus using the original tagset
brown_tagged_words_original = nltk.corpus.brown.tagged_words(tagset='brown')

# tagged words from the Brown Corpus using the universal tagset
brown_tagged_words_universal = nltk.corpus.brown.tagged_words(tagset='universal')

# cthe number of ambiguous word types and all word types for both tagsets
ambiguous_original, all_original = ambiguous(brown_tagged_words_original, 'original')
ambiguous_universal, all_universal = ambiguous(brown_tagged_words_universal, 'universal')

# the percentage of ambiguous word types
percentage_ambiguous_original = (ambiguous_original / all_original) * 100
percentage_ambiguous_universal = (ambiguous_universal / all_universal) * 100

print("Original Tagset:")
print(f"Ambiguous word types: {ambiguous_original}")
print(f"All word types: {all_original}")
print(f"Percentage of ambiguous word types: {percentage_ambiguous_original:.2f}%")

print("\nUniversal Tagset:")
print(f"Ambiguous word types: {ambiguous_universal}")
print(f"All word types: {all_universal}")
print(f"Percentage of ambiguous word types: {percentage_ambiguous_universal:.2f}%")


Original Tagset:
Ambiguous word types: 8729
All word types: 56057
Percentage of ambiguous word types: 15.57%

Universal Tagset:
Ambiguous word types: 3596
All word types: 56057
Percentage of ambiguous word types: 6.41%


### Exercise 6

Write code to search the Brown Corpus to answer the following questions:

a) produce an alphabetically sorted list of the distinct words tagged as `MD`  
b) identify words that can be plural nouns or third person singular verbs  
c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form `IN+AT+NN`, separated by semicolons


In [25]:
tagged_words = nltk.corpus.brown.tagged_words(tagset='brown')

# a) produce an alphabetically sorted list of the distinct words tagged as MD
distinct_md_words = sorted(set(word for word, tag in tagged_words if tag == 'MD'))
print("Distinct words tagged as MD:")
print(distinct_md_words)

# b) identify words that can be plural nouns or third person singular verbs
plural_noun_verb_candidates = set()
for word, tag in tagged_words:
    if tag in ['NNS', 'VBZ'] and word.isalpha():
        plural_noun_verb_candidates.add(word)
print("\nWords that can be plural nouns or third person singular verbs:")
print(sorted(plural_noun_verb_candidates))

# c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form IN+AT+NN
prepositional_phrases = set()
for i in range(2, len(tagged_words) - 2):
    if tagged_words[i][1] == 'IN' and tagged_words[i + 1][1] == 'AT' and tagged_words[i + 2][1] == 'NN':
        phrase = f"{tagged_words[i][0]} {tagged_words[i + 1][0]} {tagged_words[i + 2][0]}"
        prepositional_phrases.add(phrase)

sorted_prepositional_phrases = sorted(prepositional_phrases)
print("\nDistinct three-word prepositional phrases of the form IN+AT+NN:")
for phrase in sorted_prepositional_phrases:
    print(phrase)

Distinct words tagged as MD:
['Can', 'Could', 'May', 'Might', 'Must', 'Ought', 'Shall', 'Should', 'Will', 'Would', "c'n", 'can', 'colde', 'could', 'dare', 'kin', 'maht', 'mai', 'may', 'maye', 'mayst', 'might', 'must', 'need', 'ought', 'shall', 'should', 'shuld', 'shulde', 'wil', 'will', 'wilt', 'wod', 'wold', 'wolde', 'would']

Words that can be plural nouns or third person singular verbs:

Distinct three-word prepositional phrases of the form IN+AT+NN:
'bout the saddle
'ceptin' the light
About the murder
Above the tongue
Across the bay
Across the bridge
Across the front
Across the road
Across the street
Across the table
Across the way
After a conversation
After a day
After a dinner
After a flood
After a minute
After a moment
After a pause
After a reception
After a roundup
After a sort
After a supper
After a time
After a while
After a year
After an earthquake
After every money
After every session
After the collapse
After the demise
After the diagnosing
After the game
After the meal
Aft

### Exercise 7

Write a function `prec_adv(word, text)` that returns an alphabetically sorted list of distinct adverbs that precede `word` in `text`. Use this function to find out which adverbs precede the words "love", "like", and "prefer" in the Brown corpus. 

In [24]:
tagged_words = nltk.corpus.brown.tagged_words()

def prec_adv(word, text):
    adverbs = set()
    for i in range(1, len(text)):
        current_word, current_tag = text[i]
        previous_word, previous_tag = text[i - 1]
        if current_word == word and current_tag in ['VB', 'VBZ', 'VBP'] and previous_tag == 'RB':
            adverbs.add(previous_word)
    return sorted(list(adverbs))

words_to_search = ["love", "like", "prefer"]

for word in words_to_search:
    adverbs = prec_adv(word, tagged_words)
    print(f"Adverbs that precede '{word}':")
    print(adverbs)
    print()

Adverbs that precede 'love':
['always', 'dearly', 'just']

Adverbs that precede 'like':
['always', 'even', 'rather', 'simply', 'sure']

Adverbs that precede 'prefer':
['generally', 'much', 'spontaneously']

