# Download necessary resources

In [294]:
import numpy as np
import pandas as pd
import nltk

nltk.download('brown')
from nltk.corpus import brown


nltk.download('universal_tagset')

brown_tagged_sents = brown.tagged_sents(tagset='universal')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [295]:
brown_tagged_sents

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

#Test

In [296]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [297]:
# Your example sentence
#sentence = "I go home by bus"
sentence = "The big cat sat on the mat in the kitchen"

In [298]:
# Tokenize the sentence and convert to lowercase
tokens = [word.lower() for word in word_tokenize(sentence)]

In [299]:
# Get the set of unique words in the Brown corpus
brown_words = set(brown.words())

In [300]:
# Check if all tokens are in the Brown corpus
all_in_corpus = all(token in brown_words for token in tokens)
print(all_in_corpus)

True


#Viterbi algorithm

In [301]:
!pip install viterbi



In [302]:
import viterbi
import math

In [305]:
def viterbi(observed_words, cpd_tag_transitions, cpd_emission, states):
    # Initialize the dynamic programming table to store log probabilities
    V = [{}]
    path = {}

    # Initialize base case (t == 0)
    for state in states:
        # Compute log probabilities instead of multiplying probabilities
        emission_prob = cpd_emission[state].prob(observed_words[0])
        if emission_prob == 0.0:
            V[0][state] = float('-inf')  # Assign a very large negative value
        else:
            V[0][state] = math.log(cpd_tag_transitions['START'].prob(state)) + math.log(emission_prob)
        path[state] = [state]

    # Run Viterbi for t > 0
    for t in range(1, len(observed_words)):
        V.append({})
        newpath = {}

        for cur_state in states:
            # Check if state is 'START' or 'END'
            if cur_state in ['START', 'END']:
                continue

            # Select the state transition path with the maximum log probability
            max_prob = float('-inf')
            max_state = None
            for prev_state in states:
                if prev_state not in ['START', 'END']:
                    transition_prob = cpd_tag_transitions[prev_state].prob(cur_state)
                    emission_prob = cpd_emission[cur_state].prob(observed_words[t])
                    if emission_prob == 0.0:
                        emission_log_prob = float('-inf')  # Assign a very large negative value
                    else:
                        emission_log_prob = math.log(emission_prob)
                    prob = V[t-1][prev_state] + math.log(transition_prob) + emission_log_prob
                    if prob > max_prob:
                        max_prob = prob
                        max_state = prev_state

            V[t][cur_state] = max_prob
            newpath[cur_state] = path[max_state] + [cur_state]

        # Don't need to remember the old paths
        path = newpath

    # Add a final step for transition to 'END' state
    max_prob = float('-inf')
    max_state = None
    for state in states:
        if state not in ['START', 'END']:
            prob = V[len(observed_words) - 1][state] + math.log(cpd_tag_transitions[state].prob('END'))
            if prob > max_prob:
                max_prob = prob
                max_state = state

    return (max_prob, path[max_state])

In [309]:
# Example sage 1
observed_words = ["I", "go", "home", "by","bus"]
states = ['NOUN', 'VERB', 'PRON', 'ADP']  # Add your list of states/tags
(prob, sequence) = viterbi(observed_words, cpd_tag_transitions, cpd_emission, states)
print(f"Log probability of the best tag sequence: {prob}")
print(f"Best tag sequence: {sequence}")

ValueError: math domain error

In [None]:
# Example sage 2
observed_words = ["The", "big", "cat" ,"sat" ,"on", "the" ,"mat" ,"in", "the", "kitchen"]
states = ['NOUN', 'VERB', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', 'START', 'END']  # Add your list of states/tags
(prob, sequence) = viterbi(observed_words, cpd_tag_transitions, cpd_emission, states)
print(f"Log probability of the best tag sequence: {prob}")
print(f"Best tag sequence: {sequence}")