Name: Aditya Sharma                 

Roll Number: 2022BCD0035

# ENGLISH CORPUS

In [16]:
import nltk
from nltk.corpus import treebank
from collections import defaultdict, Counter
import math

nltk.download("treebank")
nltk.download("universal_tagset")

tagged_sentences = treebank.tagged_sents(tagset="universal")

print(len(tagged_sentences))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


3914


In [30]:
split_idx = int(0.7 * len(tagged_sentences))
train_data = tagged_sentences[:split_idx]
test_data = tagged_sentences[split_idx:]

In [31]:
initial_counts = Counter()
transition_counts = defaultdict(Counter)
emission_counts = defaultdict(Counter)
tag_counts = Counter()

for sent in train_data:
    if sent:
        initial_counts[sent[0][1]] += 1
    for i in range(len(sent)):
        word, tag = sent[i]
        tag_counts[tag] += 1
        emission_counts[tag][word.lower()] += 1
        if i < len(sent) - 1:
            next_tag = sent[i + 1][1]
            transition_counts[tag][next_tag] += 1

all_tags = list(tag_counts.keys())
vocab = set(word.lower() for sent in train_data for word, _ in sent)

initial_probs = {tag: (initial_counts[tag] + 1) / (len(train_data) + len(all_tags))
                 for tag in all_tags}

transition_probs = {}
for tag in all_tags:
    total = sum(transition_counts[tag].values()) + len(all_tags)
    transition_probs[tag] = {t: (transition_counts[tag][t] + 1) / total for t in all_tags}

emission_probs = {}
for tag in all_tags:
    total = sum(emission_counts[tag].values()) + len(vocab)
    emission_probs[tag] = {w: (emission_counts[tag][w] + 1) / total for w in vocab}

In [32]:
def viterbi(sentence, initial_probs, transition_probs, emission_probs, all_tags, vocab):
    V = [{}]
    path = {}

    for tag in all_tags:
        word = sentence[0].lower()
        emission = emission_probs[tag].get(word, 1 / (sum(emission_counts[tag].values()) + len(vocab)))
        V[0][tag] = math.log(initial_probs[tag]) + math.log(emission)
        path[tag] = [tag]

    for t in range(1, len(sentence)):
        V.append({})
        new_path = {}
        word = sentence[t].lower()

        for curr_tag in all_tags:
            emission = emission_probs[curr_tag].get(word, 1 / (sum(emission_counts[curr_tag].values()) + len(vocab)))
            (prob, prev_tag) = max(
                (V[t-1][pt] + math.log(transition_probs[pt][curr_tag]) + math.log(emission), pt)
                for pt in all_tags
            )
            V[t][curr_tag] = prob
            new_path[curr_tag] = path[prev_tag] + [curr_tag]

        path = new_path

    (prob, last_tag) = max((V[len(sentence) - 1][tag], tag) for tag in all_tags)
    return path[last_tag]

In [33]:
def evaluate(test_data, tagger_func):
    total, correct = 0, 0
    for sent in test_data:
        words = [w for w, _ in sent]
        gold_tags = [t for _, t in sent]
        pred_tags = tagger_func(words)
        total += len(sent)
        correct += sum(g == p for g, p in zip(gold_tags, pred_tags))
    return correct / total

hmm_accuracy = evaluate(test_data, lambda s: viterbi(s, initial_probs, transition_probs, emission_probs, all_tags, vocab))
print(f"HMM Tagger Accuracy: {hmm_accuracy:.4f}")

HMM Tagger Accuracy: 0.8751


In [34]:
from nltk.tag import hmm, pos_tag

trainer = hmm.HiddenMarkovModelTrainer()
nltk_hmm = trainer.train_supervised(train_data)
nltk_hmm_acc = nltk_hmm.evaluate(test_data)
print(f"NLTK HMM Tagger Accuracy: {nltk_hmm_acc:.4f}")

default_tagger = nltk.DefaultTagger("NOUN")
default_acc = default_tagger.evaluate(test_data)
print(f"NLTK Default Tagger Accuracy: {default_acc:.4f}")

unigram_tagger = nltk.UnigramTagger(train_data)
unigram_acc = unigram_tagger.evaluate(test_data)
print(f"NLTK Unigram Tagger Accuracy: {unigram_acc:.4f}")

bigram_tagger = nltk.BigramTagger(train_data, backoff=unigram_tagger)
bigram_acc = bigram_tagger.evaluate(test_data)
print(f"NLTK Bigram Tagger Accuracy: {bigram_acc:.4f}")

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  nltk_hmm_acc = nltk_hmm.evaluate(test_data)
  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])


NLTK HMM Tagger Accuracy: 0.5002


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  default_acc = default_tagger.evaluate(test_data)


NLTK Default Tagger Accuracy: 0.2915


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  unigram_acc = unigram_tagger.evaluate(test_data)


NLTK Unigram Tagger Accuracy: 0.8680


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  bigram_acc = bigram_tagger.evaluate(test_data)


NLTK Bigram Tagger Accuracy: 0.8707


In [35]:
sample_sentence = ["the", "dog", "barks", "at", "the", "cat"]
predicted_tags = viterbi(sample_sentence, initial_probs, transition_probs, emission_probs, all_tags, vocab)

print("\nSample Sentence Test:")
print(list(zip(sample_sentence, predicted_tags)))



Sample Sentence Test:
[('the', 'DET'), ('dog', 'ADJ'), ('barks', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('cat', 'NOUN')]


# HINDI CORPUS

In [24]:
!pip install conllu

from conllu import parse_incr

hindi_sents = []
# https://github.com/UniversalDependencies/UD_Hindi-HDTB/blob/master/hi_hdtb-ud-test.conllu
with open("/content/hi_hdtb-ud-test.conllu", "r", encoding="utf-8") as f:
    for tokenlist in parse_incr(f):
        sent = []
        for token in tokenlist:
            if isinstance(token["id"], int):
                sent.append((token["form"], token["upos"]))
        hindi_sents.append(sent)

print(hindi_sents[0])

[('इसके', 'PRON'), ('अतिरिक्त', 'ADP'), ('गुग्गुल', 'PROPN'), ('कुंड', 'PROPN'), (',', 'PUNCT'), ('भीम', 'PROPN'), ('गुफा', 'PROPN'), ('तथा', 'CCONJ'), ('भीमशिला', 'PROPN'), ('भी', 'PART'), ('दर्शनीय', 'ADJ'), ('स्थल', 'NOUN'), ('हैं', 'AUX'), ('।', 'PUNCT')]


In [25]:
print(len(hindi_sents))

1684


In [27]:
split_idx = int(0.7 * len(hindi_sents))

train_data = hindi_sents[:split_idx]
test_data = hindi_sents[split_idx:]

In [28]:
from collections import defaultdict, Counter

def estimate_hmm_params(train_data):
    initial_counts = Counter()
    transition_counts = defaultdict(Counter)
    emission_counts = defaultdict(Counter)
    tag_counts = Counter()

    for sent in train_data:
        if sent:
            initial_counts[sent[0][1]] += 1
        for i in range(len(sent)):
            word, tag = sent[i]
            tag_counts[tag] += 1
            emission_counts[tag][word] += 1
            if i < len(sent) - 1:
                next_tag = sent[i + 1][1]
                transition_counts[tag][next_tag] += 1

    all_tags = list(tag_counts.keys())
    vocab = set(word for sent in train_data for word, _ in sent)

    initial_probs = {tag: (initial_counts[tag] + 1) / (len(train_data) + len(all_tags))
                     for tag in all_tags}

    transition_probs = {}
    for tag in all_tags:
        total = sum(transition_counts[tag].values()) + len(all_tags)
        transition_probs[tag] = {t: (transition_counts[tag][t] + 1) / total for t in all_tags}

    emission_probs = {}
    for tag in all_tags:
        total = sum(emission_counts[tag].values()) + len(vocab)
        emission_probs[tag] = {w: (emission_counts[tag][w] + 1) / total for w in vocab}

    return initial_probs, transition_probs, emission_probs, all_tags, vocab


In [29]:
initial_probs, transition_probs, emission_probs, all_tags, vocab = estimate_hmm_params(train_data)

hmm_accuracy = evaluate(
    test_data,
    lambda s: viterbi(s, initial_probs, transition_probs, emission_probs, all_tags, vocab)
)
print(f"HMM Hindi Tagger Accuracy: {hmm_accuracy:.4f}")

sample_sentence = ["राम", "स्कूल", "जाता", "है"]
predicted_tags = viterbi(sample_sentence, initial_probs, transition_probs, emission_probs, all_tags, vocab)

print("\nSample Sentence Prediction:")
print(list(zip(sample_sentence, predicted_tags)))


HMM Hindi Tagger Accuracy: 0.7866

Sample Sentence Prediction:
[('राम', 'PROPN'), ('स्कूल', 'VERB'), ('जाता', 'AUX'), ('है', 'AUX')]
