<a href="https://colab.research.google.com/github/alex-smith-uwec/CS491/blob/main/POS_HMM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The purpose of this notebook is to experiment with part-of-speech tagging using a Hidden Markov Model and the Viterbi algorithm. We train the model on the Brown Corpus.

In [1]:
# Libraries used throughout the notebook
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import brown

# Fetch the corpus itself and the tag mapping required by tagset='universal'
nltk.download('brown')
nltk.download('universal_tagset')

# Brown sentences as sequences of (word, universal POS tag) pairs
brown_tagged_sents = brown.tagged_sents(tagset='universal')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


In [2]:
# Display the tagged-sentence object loaded above
brown_tagged_sents

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

# Explore Brown corpus

In [2]:
# List the category names the Brown corpus exposes
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
# Access words in the specified category
category_words = brown.words(categories='fiction')

print(category_words[:10])  # Print first 10 words

# Sample text from a specific category: the first 30 tokens joined with spaces
print(" ".join(category_words[:30]))

['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.', 'His']
Thirty-three Scotty did not go back to school . His parents talked seriously and lengthily to their own doctor and to a specialist at the University Hospital -- Mr. McKinley


In [4]:
# Number of tokens in the 'fiction' category selected above
len(category_words)

68488

In [6]:
# Count occurrences of a specific word over brown.words() (no category filter).
# The two lookups differ only in capitalisation and are counted separately.
from nltk import FreqDist
fdist = FreqDist(brown.words())
print(fdist['the'])
print(fdist['The'])

62713
7258


In [7]:
# Fetching the first 2 sentences from the Brown corpus, tagged with part-of-speech
tagged_sentences = brown_tagged_sents[0:2]

# Print one sentence per line as a list of (word, tag) pairs
for sentence in tagged_sentences:
    print(sentence)

[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
[('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City',

# Moving on

 Say words = $[w_1\ldots w_N]$ and tags = $[t_1\ldots t_N]$.

 Then  $P(\text{tags}\vert\text{words})$ is proportional to  $$\prod_{i=1}^{N} P(t_i\vert t_{i-1}) \cdot P(w_i \vert t_i),$$ where $t_0$ denotes the start-of-sentence marker.

 To find the best tag sequence for a given sequence of words, we look for the tag sequence that maximizes $P(\text{tags} \vert \text{words})$.

In [None]:
# Inspect a single tagged sentence (index 42)
brown_tagged_sents[42]

In [4]:
# Collect the distinct POS tags that occur anywhere in brown_tagged_sents
parts_of_speech = {
    tag
    for tagged_sentence in brown_tagged_sents
    for _, tag in tagged_sentence
}
print(parts_of_speech)


{'NUM', 'VERB', 'CONJ', 'ADJ', 'NOUN', 'PRON', 'X', 'ADV', 'ADP', '.', 'DET', 'PRT'}


See [universal POS tags readme](https://github.com/slavpetrov/universal-pos-tags/blob/master/README.md)

In [5]:
# Reload the universal-tagged sentences, then wrap every sentence in explicit
# ('START', 'START') / ('END', 'END') boundary pairs so that transitions into
# the first tag and out of the last tag can be counted like any other.
brown_tagged_sents = brown.tagged_sents(tagset='universal')
brown_tagged_sents = [
    [('START', 'START'), *sentence, ('END', 'END')]
    for sentence in brown_tagged_sents
]


In [6]:
# Check one sentence to confirm the START/END boundary pairs were added
brown_tagged_sents[11]

[('START', 'START'),
 ('Implementation', 'NOUN'),
 ('of', 'ADP'),
 ("Georgia's", 'NOUN'),
 ('automobile', 'NOUN'),
 ('title', 'NOUN'),
 ('law', 'NOUN'),
 ('was', 'VERB'),
 ('also', 'ADV'),
 ('recommended', 'VERB'),
 ('by', 'ADP'),
 ('the', 'DET'),
 ('outgoing', 'ADJ'),
 ('jury', 'NOUN'),
 ('.', '.'),
 ('END', 'END')]

In [7]:
# Flatten brown_tagged_sents (a list of sentences) into one list of
# (word, tag) pairs named brown_word_tags, keeping corpus order.

brown_word_tags = [word_tag for sentence in brown_tagged_sents for word_tag in sentence]


In [10]:
# Spot-check a single flattened (word, tag) pair
brown_word_tags[14]

('election', 'NOUN')

In [11]:
# Total number of (word, tag) pairs, START/END boundary pairs included
len(brown_word_tags)

1275872

In [12]:
# Look at a short run of consecutive (word, tag) pairs
brown_word_tags[30:40]

[('further', 'ADV'),
 ('said', 'VERB'),
 ('in', 'ADP'),
 ('term-end', 'NOUN'),
 ('presentments', 'NOUN'),
 ('that', 'ADP'),
 ('the', 'DET'),
 ('City', 'NOUN'),
 ('Executive', 'ADJ'),
 ('Committee', 'NOUN')]

In [13]:
# Swap each pair to (tag, word) so the tag comes first; the pairs are passed to
# nltk.ConditionalFreqDist later, where the first element acts as the condition.
tag_word_pairs = [(tag, word) for word, tag in brown_word_tags]

In [14]:
# Same slice as before, now in (tag, word) order
tag_word_pairs[30:40]

[('ADV', 'further'),
 ('VERB', 'said'),
 ('ADP', 'in'),
 ('NOUN', 'term-end'),
 ('NOUN', 'presentments'),
 ('ADP', 'that'),
 ('DET', 'the'),
 ('NOUN', 'City'),
 ('ADJ', 'Executive'),
 ('NOUN', 'Committee')]

In [15]:
# Getting the conditional frequency distribution of words for each tag
# (condition = tag, sample = word)
cfd_tag_words=nltk.ConditionalFreqDist(tag_word_pairs)


In [17]:
# Number of times the word "run" occurs with the tag 'VERB', i.e. count(run, VERB)
print(f'count of events POS being VERB given the word "run": {cfd_tag_words["VERB"]["run"]}')

count of events POS being VERB given the word "run": 154


 Now, we will estimate $P(w_i|t_i)$ from corpus data:

  $P(w_i | t_i)=\displaystyle{\frac{\text{count}(w_i,t_i)}{\text{count}(t_i)}}$

In [18]:
# Create a ConditionalFreqDist with (tag, word) pairs
# NOTE(review): despite the variable name, the conditions here are tags and the
# samples are words (word counts per tag); it is built from the same pairs as
# cfd_tag_words.
cfd_tags_given_word = nltk.ConditionalFreqDist(tag_word_pairs)


In [20]:
# Create a ConditionalProbDist for emission probabilities P(word | tag),
# estimated per tag with nltk.MLEProbDist from the counts above
cpd_emission = nltk.ConditionalProbDist(cfd_tags_given_word, nltk.MLEProbDist)


In [23]:
# Probability of observing the word "further" given the tag "ADV",
# i.e. the emission probability P(further | ADV)
cpd_emission['ADV'].prob('further')


0.0015825316950870392

 Estimating $P(t_i\vert t_{i-1})$  from corpus data:

 $P(t_i | t_{i-1}) = \displaystyle{\frac{\text{count}(t_{i-1}, t_i)}{\text{count}(t_{i-1})}}$

In [24]:
# Extract the sequence of tags from the flattened list of (word, tag) pairs
tags = [tag for word, tag in brown_word_tags]

# Create pairs of consecutive tags. The flattened stream runs straight from one
# sentence's END marker into the next sentence's START marker, so pairs whose
# first tag is END are dropped: they are an artifact of flattening, not a
# within-sentence transition, and would otherwise add a bogus END -> START
# entry to the transition counts.
tag_pairs = [
    (prev_tag, next_tag)
    for prev_tag, next_tag in zip(tags, tags[1:])
    if prev_tag != 'END'
]


In [25]:
# Create a CFD from the pairs of consecutive tags
# (condition = previous tag, sample = following tag)
cfd_tag_transitions = nltk.ConditionalFreqDist(tag_pairs)


In [26]:
# Convert the CFD into a CPD for transition probabilities P(tag | previous tag),
# estimated per previous tag with nltk.MLEProbDist
cpd_tag_transitions = nltk.ConditionalProbDist(cfd_tag_transitions, nltk.MLEProbDist)


In [27]:
# Probability of transitioning from ADJ to NOUN
prob_adj_to_noun = cpd_tag_transitions['ADJ'].prob('NOUN')
prob_adj_to_noun  # bare last expression so the cell actually displays the value


In [28]:
# Worked example: probability of the tag sequence PRON VERB NOUN for the word
# sequence "We love food", multiplying every factor by hand.
# Requires cpd_emission and cpd_tag_transitions from the cells above.

# Emission probabilities P(word | tag); lookups use the exact spelling given
prob_we_pron = cpd_emission['PRON'].prob('We')
prob_love_verb = cpd_emission['VERB'].prob('love')
prob_food_noun = cpd_emission['NOUN'].prob('food')

# Transition probabilities P(tag | previous tag), including START and END
prob_start_to_pron = cpd_tag_transitions['START'].prob('PRON')
prob_pron_to_verb = cpd_tag_transitions['PRON'].prob('VERB')
prob_verb_to_noun = cpd_tag_transitions['VERB'].prob('NOUN')
prob_noun_to_end = cpd_tag_transitions['NOUN'].prob('END')

# Combine all probabilities, in sentence order: START -> PRON -> VERB -> NOUN -> END
total_probability = (
    prob_start_to_pron *
    prob_we_pron *
    prob_pron_to_verb *
    prob_love_verb *
    prob_verb_to_noun *
    prob_food_noun *
    prob_noun_to_end
)

# Print the total probability
print(f"Total Probability of 'PRON VERB NOUN' for 'We love food': {total_probability}")



Total Probability of 'PRON VERB NOUN' for 'We love food': 7.079380056115793e-14


In [31]:
def calculate_sequence_probability(words, pos_tags, cpd_emission, cpd_tag_transitions):
    """Joint HMM probability of a word sequence with one candidate tag sequence.

    Walks the sentence left to right and multiplies, for every position,
    P(tag_i | tag_{i-1}) and P(word_i | tag_i), using 'START' as the tag before
    the first word, then multiplies by P('END' | last tag).

    Args:
        words: Sequence of words.
        pos_tags: Sequence of tags, one per word, in the same order.
        cpd_emission: Mapping from tag to an object whose ``prob(word)`` gives
            the emission probability.
        cpd_tag_transitions: Mapping from tag to an object whose
            ``prob(next_tag)`` gives the transition probability; it is indexed
            with 'START' and queried for 'END'.

    Returns:
        The probability as a float. For empty input this is P('END' | 'START').
        If ``words`` and ``pos_tags`` differ in length, an error message string
        is returned instead (existing behaviour, kept for compatibility).
    """
    # Check if lengths of words and POS tags match
    if len(words) != len(pos_tags):
        return "Error: The number of words and POS tags must match."

    total_probability = 1.0

    # Track the previous tag so the first step (START -> first tag) and every
    # later step share one code path; this also makes empty input well defined.
    previous_tag = 'START'
    for word, pos_tag in zip(words, pos_tags):
        total_probability *= cpd_tag_transitions[previous_tag].prob(pos_tag)
        total_probability *= cpd_emission[pos_tag].prob(word)
        previous_tag = pos_tag

    # Transition from the last tag (or START, for empty input) to END
    total_probability *= cpd_tag_transitions[previous_tag].prob('END')

    return total_probability



In [39]:
# Show the set of POS tags collected from the corpus
print(f'allowed POS tags: {parts_of_speech}')

allowed POS tags: {'NUM', 'VERB', 'CONJ', 'ADJ', 'NOUN', 'PRON', 'X', 'ADV', 'ADP', '.', 'DET', 'PRT'}


In [44]:
# Example usage (assuming cpd_emission and cpd_tag_transitions are defined):
# score one hand-chosen tag sequence for a three-word sentence.
words = ['My', 'cat', 'sleeps']
pos_tags = ['DET', 'NOUN', 'VERB']
total_probability = calculate_sequence_probability(words, pos_tags, cpd_emission, cpd_tag_transitions)
print(f"Total Probability of {' '.join(pos_tags)} for '{' '.join(words)}': {total_probability}")


Total Probability of DET NOUN VERB for 'My cat sleeps': 5.819432070356075e-18


# Viterbi Algorithm

In [29]:
def viterbi(observed_words, cpd_tag_transitions, cpd_emission, states):
    """Find the most probable tag sequence for ``observed_words``.

    Dynamic programming over positions: for every position and tag, keep the
    probability of the best tag path ending in that tag, extend it one word at
    a time, and finish with the transition into 'END'.

    Args:
        observed_words: Sequence of words to tag.
        cpd_tag_transitions: Mapping from tag to an object whose
            ``prob(next_tag)`` gives the transition probability; it is indexed
            with 'START' and queried for 'END'.
        cpd_emission: Mapping from tag to an object whose ``prob(word)`` gives
            the emission probability.
        states: Iterable of candidate tags. 'START' and 'END' may be included;
            they are never assigned to a word.

    Returns:
        A tuple ``(probability, tags)`` with the probability of the best path
        (including the final transition to 'END') and the list of tags, one per
        word. For empty input the result is ``(P('END' | 'START'), [])``.
        When several paths score equally, the tie is broken by comparing tag
        names, because candidates are compared as ``(probability, tag)`` tuples.

    Raises:
        ValueError: If ``states`` holds no tag other than 'START'/'END' and
            ``observed_words`` is not empty.
    """
    # Boundary markers are bookkeeping states, not tags a word can carry.
    tag_states = [state for state in states if state not in ('START', 'END')]

    # Nothing to tag: the only path goes directly from START to END.
    if len(observed_words) == 0:
        return (cpd_tag_transitions['START'].prob('END'), [])

    # V[t][tag] = probability of the best path covering words 0..t that ends in tag
    V = [{}]
    # path[tag] = that best path for the current position
    path = {}

    # Initialize base case (t == 0)
    for state in tag_states:
        V[0][state] = cpd_tag_transitions['START'].prob(state) * cpd_emission[state].prob(observed_words[0])
        path[state] = [state]

    # Run Viterbi for t > 0
    for t in range(1, len(observed_words)):
        V.append({})
        newpath = {}

        for cur_state in tag_states:
            # The emission term does not depend on the previous tag; look it up once.
            emission_prob = cpd_emission[cur_state].prob(observed_words[t])

            # Select the predecessor giving the highest-probability path
            (prob, best_prev) = max(
                (V[t-1][prev_state] * cpd_tag_transitions[prev_state].prob(cur_state) * emission_prob, prev_state)
                for prev_state in tag_states
            )

            V[t][cur_state] = prob
            newpath[cur_state] = path[best_prev] + [cur_state]

        # Don't need to remember the old paths
        path = newpath

    # Final step: account for the transition from the last tag into 'END'
    last = len(observed_words) - 1
    prob, state = max(
        (V[last][state] * cpd_tag_transitions[state].prob('END'), state)
        for state in tag_states
    )
    return (prob, path[state])




In [30]:
# Example usage
observed_words = ["The", "red", "fox"]
# Candidate tags: every universal tag found in the corpus, including '.' for
# punctuation (previously missing, so punctuation could never be tagged),
# plus the START/END boundary markers.
states = ['NOUN', 'VERB', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.', 'START', 'END']
(prob, sequence) = viterbi(observed_words, cpd_tag_transitions, cpd_emission, states)
print(f"Probability of the best tag sequence: {prob}")
print(f"Best tag sequence: {sequence}")

Probability of the best tag sequence: 2.655173396390551e-13
Best tag sequence: ['DET', 'ADJ', 'NOUN']
