In [1]:
import nltk
nltk.download('brown')
nltk.download('universal_tagset')
nltk.download('brown',quiet=True)


[nltk_data] Downloading package brown to C:\Users\Monaliza
[nltk_data]     Lumbao\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to C:\Users\Monaliza
[nltk_data]     Lumbao\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [2]:
DATA = nltk.corpus.brown.sents(categories='fiction')
DATA_TAGGED = nltk.corpus.brown.tagged_sents(categories='fiction',tagset='universal')

DATA_TAGGED[:500]

[[('Thirty-three', 'NUM')], [('Scotty', 'NOUN'), ('did', 'VERB'), ('not', 'ADV'), ('go', 'VERB'), ('back', 'ADV'), ('to', 'ADP'), ('school', 'NOUN'), ('.', '.')], ...]

In [3]:
train_split = int(len(DATA_TAGGED) * 0.75)
DATA_TRAIN = DATA_TAGGED[:train_split]
DATA_TEST = DATA_TAGGED[train_split:]

In [4]:
len(DATA), len(DATA_TAGGED), len(DATA_TRAIN), len(DATA_TEST)

(4249, 4249, 3186, 1063)

## 1. Explore the performance of N-Gram taggers on the corpus.

### a. Unigram Tagger

In [5]:
unigram_tagger = nltk.tag.UnigramTagger(DATA_TRAIN)
unigram_tagger.evaluate(DATA_TEST)

0.8438119069961422

In [6]:
unigram_tagger.tag_sents(DATA)[:5]

[[('Thirty-three', 'NUM')],
 [('Scotty', 'NOUN'),
  ('did', 'VERB'),
  ('not', 'ADV'),
  ('go', 'VERB'),
  ('back', 'ADV'),
  ('to', 'PRT'),
  ('school', 'NOUN'),
  ('.', '.')],
 [('His', 'DET'),
  ('parents', 'NOUN'),
  ('talked', 'VERB'),
  ('seriously', 'ADV'),
  ('and', 'CONJ'),
  ('lengthily', 'ADV'),
  ('to', 'PRT'),
  ('their', 'DET'),
  ('own', 'ADJ'),
  ('doctor', 'NOUN'),
  ('and', 'CONJ'),
  ('to', 'PRT'),
  ('a', 'DET'),
  ('specialist', 'NOUN'),
  ('at', 'ADP'),
  ('the', 'DET'),
  ('University', 'NOUN'),
  ('Hospital', 'NOUN'),
  ('--', '.'),
  ('Mr.', 'NOUN'),
  ('McKinley', 'NOUN'),
  ('was', 'VERB'),
  ('entitled', 'VERB'),
  ('to', 'PRT'),
  ('a', 'DET'),
  ('discount', 'NOUN'),
  ('for', 'ADP'),
  ('members', 'NOUN'),
  ('of', 'ADP'),
  ('his', 'DET'),
  ('family', 'NOUN'),
  ('--', '.'),
  ('and', 'CONJ'),
  ('it', 'PRON'),
  ('was', 'VERB'),
  ('decided', 'VERB'),
  ('it', 'PRON'),
  ('would', 'VERB'),
  ('be', 'VERB'),
  ('best', 'ADJ'),
  ('for', 'ADP'),
  ('him'

In [7]:
unigran_tagger_train_accuracy = unigram_tagger.evaluate(DATA_TRAIN)
print("Training data accuracy: ", unigran_tagger_train_accuracy)

Training data accuracy:  0.9633715977771468


In [8]:
unigram_tagger_test_accuracy = unigram_tagger.evaluate(DATA_TEST)
print("Testing data accuracy: ", unigram_tagger_test_accuracy)

Testing data accuracy:  0.8438119069961422


### b. Unigram Tagger with a verb backoff

In [9]:
default_tagger = nltk.tag.DefaultTagger('VERB')
unigram_tagger_backoff = nltk.tag.UnigramTagger(DATA_TRAIN, backoff=default_tagger)

In [10]:
unigram_tagger_backoff_train_accuracy = unigram_tagger_backoff.evaluate(DATA_TRAIN)
print("Training data accuracy: ", unigram_tagger_backoff_train_accuracy)

Training data accuracy:  0.9633715977771468


In [11]:
unigram_tagger_backoff_test_accuracy = unigram_tagger_backoff.evaluate(DATA_TEST)
print("Testing data accuracy: ", unigram_tagger_backoff_test_accuracy)

Testing data accuracy:  0.8678448545511417


### c. Trigram Tagger with Unigram Tagger and adjective backoff

In [12]:
adjective_tagger = nltk.tag.DefaultTagger('ADJ')
adjective_tagger_backoff = nltk.tag.UnigramTagger(DATA_TRAIN, backoff=adjective_tagger)

In [13]:
trigram_tagger_backoff = nltk.tag.TrigramTagger(DATA_TRAIN, backoff=adjective_tagger_backoff)

In [14]:
trigram_tag_train = trigram_tagger_backoff.evaluate(DATA_TRAIN)
print("Training data accuracy: ", trigram_tag_train)

Training data accuracy:  0.9799618707662353


In [15]:
trigram_tag_test = trigram_tagger_backoff.evaluate(DATA_TEST)
print("Testing data accuracy: ", trigram_tag_test)

Testing data accuracy:  0.8609112709832134


### d. Trigram Tagger with a Bigram Tagger backoff

In [16]:
bigram_tagger_backoff = nltk.tag.BigramTagger(DATA_TRAIN, backoff=unigram_tagger_backoff)

In [17]:
tribigram_tagger_backoff = nltk.tag.TrigramTagger(DATA_TRAIN, backoff=bigram_tagger_backoff)

In [18]:
tribigram_tag_train = tribigram_tagger_backoff.evaluate(DATA_TRAIN)
print("Training data accuracy: ", tribigram_tag_train)

Training data accuracy:  0.9796982111710543


In [19]:
tribigram_tag_test = tribigram_tagger_backoff.evaluate(DATA_TEST)
print("Testing data accuracy: ", tribigram_tag_test)

Testing data accuracy:  0.8716505056824106


## 2. Train an Average Perceptron Tagger with different iterations. Compare the results of using different iterations.

### a. 1 iteration

In [20]:
perceptron_trained_1 = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained_1.train(DATA_TRAIN, nr_iter=1)

In [21]:
pt_iter_1_train = perceptron_trained_1.evaluate(DATA_TRAIN)
print("Training data accuracy: ", pt_iter_1_train)

Training data accuracy:  0.9591327627469274


In [22]:
pt_iter_1_test = perceptron_trained_1.evaluate(DATA_TEST)
print("Testing data accuracy: ", pt_iter_1_test)

Testing data accuracy:  0.9283182149932229


### b. 5 iteration

In [23]:
perceptron_trained_5 = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained_5.train(DATA_TRAIN, nr_iter=5)

In [24]:
pt_iter_5_train = perceptron_trained_5.evaluate(DATA_TRAIN)
print("Training data accuracy: ", pt_iter_5_train)

Training data accuracy:  0.9947470896036994


In [25]:
pt_iter_5_test = perceptron_trained_5.evaluate(DATA_TEST)
print("Testing data accuracy: ", pt_iter_5_test)

Testing data accuracy:  0.9500052132207277


### c. 10 iteration

In [26]:
perceptron_trained_10 = nltk.perceptron.PerceptronTagger(load=False)
perceptron_trained_10.train(DATA_TRAIN, nr_iter=10)

In [27]:
pt_iter_10_train = perceptron_trained_10.evaluate(DATA_TRAIN)
print("Training data accuracy: ", pt_iter_10_train)

Training data accuracy:  0.9989250801119539


In [28]:
pt_iter_10_test = perceptron_trained_10.evaluate(DATA_TEST)
print("Testing data accuracy: ", pt_iter_10_test)

Testing data accuracy:  0.9517255760608904


## 3. Train a 3 Conditional Random Field using a different custom feature function. The feature function must contain the features below. Model A should use features a-c. Model B should use features a-e and Model C should use all the features.

### Model A (Features a-c)

In [29]:
def modelA_features(sentence, index):
    return {
        # a. Previous, Current, and Next Word
        # Current word
        'word': sentence[index],
        # Previous word
        'prev_word': '' if index == 0 else sentence[index - 1],
        # Next word
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        
        # b. 1-3 Character Prefix
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        
        # c. 1-3 Character Suffix
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:]
    }

In [30]:
crf_custom_A = nltk.crf.CRFTagger(feature_func=modelA_features)
crf_custom_A.train(DATA_TRAIN, 'crf_custom_A.tag')

In [31]:
crf_modelA_train = crf_custom_A.evaluate(DATA_TRAIN)
print("Training data accuracy: ", crf_modelA_train)

Training data accuracy:  0.9786232912830082


In [32]:
crf_modelA_test = crf_custom_A.evaluate(DATA_TEST)
print("Testing data accuracy: ", crf_modelA_test)

Testing data accuracy:  0.9516213116463351


### Model B (Features a-e)

In [33]:
def modelB_features(sentence, index):
    return {
        # a. Previous, Current, and Next Word
        # Current word
        'word': sentence[index],
        # Previous word
        'prev_word': '' if index == 0 else sentence[index - 1],
        # Next word
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        
        # b. 1-3 Character Prefix
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        
        # c. 1-3 Character Suffix
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        
        # d. Capitalize
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        
        # e. Word contains a number
        'is_numeric': sentence[index].isdigit()
    }

In [34]:
crf_custom_B = nltk.crf.CRFTagger(feature_func=modelB_features)
crf_custom_B.train(DATA_TRAIN, 'crf_custom_B.tag')

In [35]:
crf_modelB_train = crf_custom_B.evaluate(DATA_TRAIN)
print("Training data accuracy: ", crf_modelB_train)

Training data accuracy:  0.9785827282683649


In [36]:
crf_modelB_test = crf_custom_B.evaluate(DATA_TEST)
print("Testing data accuracy: ", crf_modelB_test)

Testing data accuracy:  0.9516734438536127


### Model C (All features)

In [37]:
def modelC_features(sentence, index):
    return {
        # a. Previous, Current, and Next Word
        # Current word
        'word': sentence[index],
        # Previous word
        'prev_word': '' if index == 0 else sentence[index - 1],
        # Next word
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        
        # b. 1-3 Character Prefix
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        
        # c. 1-3 Character Suffix
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        
        # d. Capitalize
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        
        # e. Word contains a number
        'is_numeric': sentence[index].isdigit(),
        
        # f. Word is first in the sentence
        'is_first': index == 0,
        
        # g. Word is last in the sentence
        'is_last': index == len(sentence) - 1
    }

In [38]:
crf_custom_C = nltk.crf.CRFTagger(feature_func=modelC_features)
crf_custom_C.train(DATA_TRAIN, 'crf_custom_C.tag')

In [39]:
crf_modelC_train = crf_custom_C.evaluate(DATA_TRAIN)
print("Training data accuracy: ", crf_modelC_train)

Training data accuracy:  0.9785421652537216


In [40]:
crf_modelC_test = crf_custom_C.evaluate(DATA_TEST)
print("Testing data accuracy: ", crf_modelC_test)

Testing data accuracy:  0.9516734438536127
