In [3]:
import sys
import numpy as np

print("Python executable:", sys.executable)
print("Python version:", sys.version)
print("NumPy version:", np.__version__)
print("All set! You're in a clean environment. No NLTK/spaCy/sklearn allowed — we're good!")

Python executable: C:\Users\haric\hmm_pos_env\Scripts\python.exe
Python version: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
NumPy version: 2.3.5
All set! You're in a clean environment. No NLTK/spaCy/sklearn allowed — we're good!


In [4]:
import os
from urllib.request import urlretrieve

dataset_file = "en_ewt-ud-train.conllu"

if not os.path.exists(dataset_file):
    print("Downloading Universal Dependencies English-EWT training data (~35 MB)...")
    url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu"
    urlretrieve(url, dataset_file)
    print(f"Download complete! Saved as {dataset_file}")
    print(f"File size: {os.path.getsize(dataset_file) / (1024*1024):.1f} MB")
else:
    print(f"Dataset already exists: {dataset_file} ({os.path.getsize(dataset_file) / (1024*1024):.1f} MB)")

print("You are ready to run the full HMM POS Tagger code!")

Dataset already exists: en_ewt-ud-train.conllu (14.3 MB)
You are ready to run the full HMM POS Tagger code!


In [7]:
import os
from urllib.request import urlretrieve
import random
from collections import defaultdict
import math

# Step 1: Download and parse the dataset
# Function to parse CoNLL-U format into list of sentences, each a list of (word, tag) pairs.
def parse_conllu(filename):
    data = []
    sent = []
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    if sent:
                        data.append(sent)
                        sent = []
                elif line.startswith('#'):
                    continue
                else:
                    parts = line.split('\t')
                    if len(parts) >= 4 and '-' not in parts[0]:  # Skip multi-word tokens and empty fields
                        word = parts[1] if len(parts) > 1 else ''
                        tag = parts[3] if len(parts) > 3 else ''
                        if word and tag:  # Only add if both are non-empty
                            sent.append((word, tag))
        if sent:
            data.append(sent)
        return data
    except FileNotFoundError:
        print(f"Error: File {filename} not found. Please download it manually.")
        return []
    except Exception as e:
        print(f"Parsing error: {e}")
        return []

# Download the dataset if not exists
dataset_file = 'en_ewt-ud-train.conllu'
if not os.path.exists(dataset_file):
    try:
        url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu'
        urlretrieve(url, dataset_file)
        print(f"Downloaded dataset to {dataset_file}")
    except Exception as e:
        print(f"Download failed ({e}). Please download manually:")
        print("1. Go to: https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu")
        print("2. Right-click → Save As → en_ewt-ud-train.conllu in your notebook folder.")
        print("3. Re-run this cell.")

# Load and shuffle the data
data = parse_conllu(dataset_file)
if not data:
    print("No data loaded. Exiting.")
else:
    random.seed(42)  # For reproducibility
    random.shuffle(data)
    # Split into 80% train and 20% test
    train_size = int(0.8 * len(data))
    train = data[:train_size]
    test = data[train_size:]
    print(f"Total sentences: {len(data)}, Train: {len(train)}, Test: {len(test)}")

    # Step 2: Training Phase
    # Compute counts for transitions, emissions, and tags.
    tag_counts = defaultdict(int)
    trans_counts = defaultdict(lambda: defaultdict(int))
    emis_counts = defaultdict(lambda: defaultdict(int))
    start = '<START>'

    # Count starts, transitions, emissions, and tag occurrences
    tag_counts[start] = len(train)
    for sent in train:
        if not sent:
            continue
        prev = start
        for word, tag in sent:
            trans_counts[prev][tag] += 1
            emis_counts[tag][word] += 1
            tag_counts[tag] += 1
            prev = tag

    # Get unique tags (excluding <START>) and words
    unique_tags = list(set(tag for sent in train for _, tag in sent))
    unique_words = set(word for sent in train for word, _ in sent)
    V_trans = len(unique_tags)  # Vocabulary size for transitions (number of tags)
    V_emis = len(unique_words)  # Vocabulary size for emissions

    print(f"Unique tags: {len(unique_tags)}, Unique words: {len(unique_words)}")

    # Step 3: Display top 10 transition and emission probabilities
    # Collect and sort transition probabilities
    trans_probs = []
    for prev in [start] + unique_tags:
        for tag in unique_tags:
            count = trans_counts[prev][tag]
            prob = (count + 1) / (tag_counts[prev] + V_trans)
            trans_probs.append((prev, tag, prob))
    trans_probs.sort(key=lambda x: x[2], reverse=True)

    print("\nTop 10 Transition Probabilities:")
    for i in range(min(10, len(trans_probs))):
        prev, tag, prob = trans_probs[i]
        print(f"P({tag} | {prev}) = {prob:.4f}")

    # Collect and sort emission probabilities (limit to top for large vocab to avoid long computation)
    emis_probs = []
    for tag in unique_tags[:5]:  # Limit to top 5 tags for speed; adjust if needed
        tag_count = tag_counts[tag]
        for word in list(unique_words)[:100]:  # Sample 100 words to keep it fast
            count = emis_counts[tag][word]
            prob = (count + 1) / (tag_count + V_emis)
            emis_probs.append((tag, word, prob))
    emis_probs.sort(key=lambda x: x[2], reverse=True)

    print("\nTop 10 Emission Probabilities:")
    for i in range(min(10, len(emis_probs))):
        tag, word, prob = emis_probs[i]
        print(f"P({word} | {tag}) = {prob:.4f}")

    # Step 4: Decoding Phase - Viterbi Algorithm
    # Function to predict tags for a sentence using Viterbi with log probs.
    def viterbi(words):
        if not words:
            return []
        states = unique_tags
        n = len(words)
        v = [{} for _ in range(n)]  # Viterbi probs
        bp = [{} for _ in range(n)]  # Backpointers

        # Initialization
        for state in states:
            trans_prob = (trans_counts[start][state] + 1) / (tag_counts[start] + V_trans)
            emis_prob = (emis_counts[state].get(words[0], 0) + 1) / (tag_counts[state] + V_emis)
            v[0][state] = math.log(trans_prob) + math.log(emis_prob)
            bp[0][state] = start

        # Recursion
        for t in range(1, n):
            for state in states:
                max_prob = float('-inf')
                max_prev = None
                for prev in states:
                    trans_prob = (trans_counts[prev][state] + 1) / (tag_counts[prev] + V_trans)
                    emis_prob = (emis_counts[state].get(words[t], 0) + 1) / (tag_counts[state] + V_emis)
                    prob = v[t-1][prev] + math.log(trans_prob) + math.log(emis_prob)
                    if prob > max_prob:
                        max_prob = prob
                        max_prev = prev
                v[t][state] = max_prob
                bp[t][state] = max_prev

        # Backtrack
        path = [None] * n
        path[n-1] = max(states, key=lambda s: v[n-1][s])
        for t in range(n-1, 0, -1):
            path[t-1] = bp[t][path[t]]
        return path

    # Step 5: Evaluation
    correct = 0
    total = 0
    errors = []  # To collect sentences with errors for analysis
    for sent in test[:50]:  # Limit to first 50 for speed; remove limit for full eval
        words = [w for w, t in sent]
        gold_tags = [t for w, t in sent]
        pred_tags = viterbi(words)
        if len(pred_tags) != len(gold_tags):
            continue  # Skip if mismatch
        for p, g in zip(pred_tags, gold_tags):
            if p == g:
                correct += 1
            total += 1
        if pred_tags != gold_tags:
            errors.append((words, gold_tags, pred_tags))

    accuracy = correct / total if total > 0 else 0
    print(f"\nAccuracy on test data: {accuracy:.4f} ({correct}/{total})")

    # Step 6: Example output for 3 sample sentences
    print("\nExample Outputs for 3 Sample Sentences:")
    for i in range(min(3, len(test))):
        sent = test[i]
        words = [w for w, t in sent]
        gold_tags = [t for w, t in sent]
        pred_tags = viterbi(words)
        print(f"\nSample {i+1}:")
        print("Sentence:", ' '.join(words))
        print("Gold Tags:", ' '.join(gold_tags))
        print("Predicted Tags:", ' '.join(pred_tags))

    # Step 7: Error Analysis (at least one case)
    if errors:
        words, gold_tags, pred_tags = errors[0]  # Take the first error case
        print("\nError Analysis Example:")
        print("Sentence:", ' '.join(words))
        print("Gold Tags:", ' '.join(gold_tags))
        print("Predicted Tags:", ' '.join(pred_tags))
        print("Analysis:")
        for j in range(len(words)):
            if pred_tags[j] != gold_tags[j]:
                print(f" - Word '{words[j]}' mis-tagged as {pred_tags[j]} (gold: {gold_tags[j]})")
                if words[j] not in unique_words:
                    print("   Reason: Unseen word in training data.")
                else:
                    print("   Possible Reason: Word ambiguity or contextual influence from transitions.")
    else:
        print("\nNo errors found in test set for analysis.")

Total sentences: 12544, Train: 10035, Test: 2509
Unique tags: 17, Unique words: 17515

Top 10 Transition Probabilities:
P(VERB | PART) = 0.6935
P(NOUN | DET) = 0.5925
P(NOUN | ADJ) = 0.5216
P(X | X) = 0.5185
P(NUM | SYM) = 0.5122
P(PRON | SCONJ) = 0.4782
P(PUNCT | INTJ) = 0.3724
P(DET | ADP) = 0.3614
P(NOUN | NUM) = 0.3492
P(VERB | AUX) = 0.3372

Top 10 Emission Probabilities:
P(June | PROPN) = 0.0010
P(handle | VERB) = 0.0003
P(confirm | VERB) = 0.0003
P(answer | VERB) = 0.0003
P(Martin | PROPN) = 0.0003
P(arrived | VERB) = 0.0003
P(settled | VERB) = 0.0003
P(Bangs | PROPN) = 0.0002
P(White | PROPN) = 0.0001
P(X940 | PROPN) = 0.0001

Accuracy on test data: 0.8628 (736/853)

Example Outputs for 3 Sample Sentences:

Sample 1:
Sentence: Lorie Leigh @ ECT
Gold Tags: PROPN X X X
Predicted Tags: PROPN X X X

Sample 2:
Sentence: They need to update the locker rooms ASAP .
Gold Tags: PRON VERB PART VERB DET NOUN NOUN ADV PUNCT
Predicted Tags: PRON VERB PART VERB DET ADJ NOUN ADV PUNCT

Sample