In [2]:
import numpy as np
import nltk
from nltk.corpus import brown

# Make sure the Brown corpus data is available locally before it is read below.
nltk.download('brown')

# Mapping from part-of-speech tag abbreviations to human-readable descriptions.
# The tagger's states are Brown corpus tags, so descriptions follow the Brown
# tagset where the two conventions differ (e.g. 'TO' and the '-TL' suffix).
tag_mapping = {
    'AT': 'article',
    'JJ': 'adjective',
    'NN': 'noun',
    'VBD': 'past tense verb',
    'IN': 'preposition',
    'CS': 'subordinating conjunction',
    'BEZ': 'copular verb',
    'BEDZ': 'past tense form of "to be"',
    'VBG': 'present participle form of a verb',
    'VB': 'base form of a verb',
    'CC': 'Coordinating conjunction',
    'CD': 'Cardinal number',
    'DT': 'Determiner',
    'EX': 'Existential "there"',
    'FW': 'Foreign word',
    'JJR': 'Comparative adjective',
    'JJS': 'Superlative adjective',
    'LS': 'List item marker',
    'MD': 'Modal',
    'NNS': 'Plural noun',
    'NNP': 'Proper noun (singular)',
    'NNPS': 'Proper noun (plural)',
    'PDT': 'Predeterminer',
    'POS': 'Possessive ending',
    'PRP': 'Personal pronoun',
    'PRP$': 'Possessive pronoun',
    'RB': 'Adverb',
    'RBR': 'Comparative adverb',
    'RBS': 'Superlative adverb',
    'RP': 'Particle',
    'SYM': 'Symbol',
    # In the Brown tagset 'TO' is the infinitive marker; prepositional "to" is 'IN'.
    'TO': 'Infinitive marker "to"',
    'UH': 'Interjection',
    'VBN': 'Past participle verb',
    'VBP': 'Non-3rd person singular present verb',
    'VBZ': '3rd person singular present verb',
    'WDT': 'Wh-determiner',
    # The '-TL' suffix marks a word that occurs inside a title.
    'NNS-TL': 'Plural noun in a title',
    'WP': 'Wh-pronoun',
    'WP$': 'Possessive wh-pronoun',
    'WRB': 'Wh-adverb'
}

class HMM:
    """First-order hidden Markov model for part-of-speech tagging.

    Attributes:
        states: Sequence of tags (hidden states); position defines the row
            index used in the probability arrays.
        observations: Sequence of words; position defines the column index
            of ``emission_probs``.
        tag_names: Optional dict mapping a tag to its description. When
            None, ``viterbi`` falls back to the module-level ``tag_mapping``.
        transition_probs: ``(N, N)`` array, row = previous tag, column =
            next tag. None until ``train`` has run.
        emission_probs: ``(N, V)`` array, row = tag, column = word. None
            until ``train`` has run.
        start_probs: length-``N`` array of sentence-initial tag
            probabilities. None until ``train`` has run.
    """

    def __init__(self, states, observations, tag_names=None):
        """Store the state/observation vocabularies (items must be hashable)."""
        self.states = states
        self.observations = observations
        self.tag_names = tag_names
        self.transition_probs = None
        self.emission_probs = None
        self.start_probs = None
        # Dict lookups replace list.index(), which is linear in the vocabulary
        # size and was being paid once per corpus token.
        self._state_index = self._index_map(states)
        self._word_index = self._index_map(observations)

    @staticmethod
    def _index_map(items):
        """Map each item to the position of its first occurrence in ``items``."""
        index = {}
        for position, item in enumerate(items):
            # setdefault keeps the first position, matching list.index().
            index.setdefault(item, position)
        return index

    @staticmethod
    def _normalize(counts):
        """Scale ``counts`` in place so the last axis sums to 1.

        Slices whose total is zero are left as zeros instead of becoming NaN.
        """
        totals = counts.sum(axis=-1, keepdims=True)
        np.divide(counts, totals, out=counts, where=totals > 0)
        return counts

    def train(self, tagged_sents=None):
        """Estimate start, transition and emission probabilities by counting.

        Args:
            tagged_sents: Iterable of sentences, each an iterable of
                ``(word, tag)`` pairs. Defaults to ``brown.tagged_sents()``.

        Tokens whose tag is not in ``states`` or whose word is not in
        ``observations`` contribute no counts. A transition out of a tag that
        is not in ``states`` is skipped rather than raising.
        """
        if tagged_sents is None:
            tagged_sents = brown.tagged_sents()

        # Refresh the lookups in case the vocab lists changed since __init__.
        self._state_index = self._index_map(self.states)
        self._word_index = self._index_map(self.observations)

        n_states = len(self.states)
        n_words = len(self.observations)
        transition = np.zeros((n_states, n_states))
        emission = np.zeros((n_states, n_words))
        start = np.zeros(n_states)

        # Count occurrences
        for tagged_sent in tagged_sents:
            prev_idx = None
            for position, (word, state) in enumerate(tagged_sent):
                state_idx = self._state_index.get(state)
                word_idx = self._word_index.get(word)
                if state_idx is not None and word_idx is not None:
                    emission[state_idx, word_idx] += 1
                    if position == 0:
                        start[state_idx] += 1
                    elif prev_idx is not None:
                        transition[prev_idx, state_idx] += 1
                # The previous tag is tracked even when its word was unknown;
                # it is None only when the tag itself is not a known state.
                prev_idx = state_idx

        # Normalize counts into probabilities
        self.emission_probs = self._normalize(emission)
        self.transition_probs = self._normalize(transition)
        self.start_probs = self._normalize(start)

    def viterbi(self, sentence):
        """Return the most likely tag description for each word of ``sentence``.

        Decoding runs in log space so long sentences do not underflow to
        zero. A word that is not in ``observations`` is given the same
        emission score for every state, so the transition model alone decides
        its tag.

        Args:
            sentence: Sequence of word tokens.

        Returns:
            A list with one entry per token: the description of the decoded
            tag, or the raw tag when no description is available. An empty
            sentence yields an empty list.

        Raises:
            ValueError: If the model has not been trained.
        """
        if self.transition_probs is None or self.emission_probs is None or self.start_probs is None:
            raise ValueError("HMM must be trained before using Viterbi")

        sentence = list(sentence)
        if not sentence:
            return []

        n_steps = len(sentence)
        n_states = len(self.states)

        # log(0) = -inf is the intended score for impossible events.
        with np.errstate(divide="ignore"):
            log_start = np.log(self.start_probs)
            log_trans = np.log(self.transition_probs)
            # Only the emission columns for this sentence are converted; the
            # full emission matrix can be very large.
            log_emit = np.zeros((n_steps, n_states))
            for t, word in enumerate(sentence):
                word_idx = self._word_index.get(word)
                if word_idx is not None:
                    log_emit[t] = np.log(self.emission_probs[:, word_idx])

        log_probs = np.empty((n_steps, n_states))
        backpointers = np.zeros((n_steps, n_states), dtype=int)

        # Initialization
        log_probs[0] = log_start + log_emit[0]

        # Recursion
        for t in range(1, n_steps):
            # scores[i, j]: best log-probability of being in state i at t-1
            # and moving to state j at t.
            scores = log_probs[t - 1][:, np.newaxis] + log_trans
            backpointers[t] = scores.argmax(axis=0)
            log_probs[t] = scores.max(axis=0) + log_emit[t]

        # Termination and traceback (collected backwards, then reversed)
        best_pointer = int(np.argmax(log_probs[-1]))
        best_path = [best_pointer]
        for t in range(n_steps - 1, 0, -1):
            best_pointer = int(backpointers[t, best_pointer])
            best_path.append(best_pointer)
        best_path.reverse()

        names = tag_mapping if self.tag_names is None else self.tag_names
        decoded = [self.states[i] for i in best_path]
        # Fall back to the raw tag when it has no description in the mapping.
        return [names.get(tag, tag) for tag in decoded]

# Collect every distinct tag in the Brown corpus
brown_tags = set(tag for word, tag in brown.tagged_words())

# Create the HMM model. The tags and words are sorted so that each one gets
# the same index on every run; iteration order of a set of strings can differ
# between interpreter sessions, which would change how Viterbi breaks ties.
hmm = HMM(sorted(brown_tags), sorted(set(brown.words())))

# Train the HMM model
hmm.train()

# # Get user input for the sentence
# sentence = input("Enter a sentence: ").split()

# # Use Viterbi to tag the sentence
# tags = hmm.viterbi(sentence)

# # Print the full names of the tags
# print(tags)


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [3]:
# Get user input for the sentence and split it into whitespace-separated tokens
sentence = input("Enter a sentence: ").split()

# Use Viterbi to tag the sentence. An empty line has nothing to decode, so the
# call is skipped rather than letting the decoder fail on a missing first word.
tags = hmm.viterbi(sentence) if sentence else []

# Print the full names of the tags
print(tags)


Enter a sentence: this model is used for parts of speech tagging
['Determiner', 'noun', 'copular verb', 'Past participle verb', 'preposition', 'Plural noun', 'preposition', 'noun', 'present participle form of a verb']
