pos tagging


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import nltk
from nltk.corpus import brown
from collections import defaultdict
import random

# Download necessary NLTK data (if you haven't already)
nltk.download('brown')
nltk.download('universal_tagset')

# 1. Load and prepare the dataset
# We get sentences tagged with universal POS tags
tagged_sents = brown.tagged_sents(tagset='universal')

# For simplicity, let's use a smaller subset of the data
# Using the full dataset would take much longer to train
data = tagged_sents[:5000]  # Use first 5000 sentences for this example


# 2. Create vocabulary and tag mappings
word_counts = defaultdict(int)
tag_counts = defaultdict(int)

for sent in data:
    for word, tag in sent:
        word_counts[word.lower()] += 1
        tag_counts[tag] += 1

# Create word_to_ix and tag_to_ix dictionaries
# We'll add special tokens for handling sequences
# <PAD>: Padding, <UNK>: Unknown words, <SOS>: Start of Sequence, <EOS>: End of Sequence
word_to_ix = {"<PAD>": 0, "<UNK>": 1, "<SOS>": 2, "<EOS>": 3}
for word, count in word_counts.items():
    # A simple way to filter out rare words
    if count > 2: 
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

tag_to_ix = {"<PAD>": 0, "<UNK>": 1, "<SOS>": 2, "<EOS>": 3}
for tag in tag_counts:
    if tag not in tag_to_ix:
        tag_to_ix[tag] = len(tag_to_ix)

# Create inverse mapping for later use
ix_to_tag = {v: k for k, v in tag_to_ix.items()}

# 3. Convert sentences to integer sequences
def sentence_to_indices(sentence, word_to_ix):
    return [word_to_ix.get(word.lower(), word_to_ix["<UNK>"]) for word in sentence]

def tags_to_indices(tags, tag_to_ix):
    return [tag_to_ix[tag] for tag in tags]

# Process the entire dataset
processed_data = []
for sent in data:
    words = [word for word, tag in sent]
    tags = [tag for word, tag in sent]
    
    # Add start and end tokens
    word_indices = [word_to_ix["<SOS>"]] + sentence_to_indices(words, word_to_ix) + [word_to_ix["<EOS>"]]
    tag_indices = [tag_to_ix["<SOS>"]] + tags_to_indices(tags, tag_to_ix) + [tag_to_ix["<EOS>"]]
    
    processed_data.append((torch.tensor(word_indices), torch.tensor(tag_indices)))

print(f"Vocabulary size: {len(word_to_ix)}")
print(f"Tag set size: {len(tag_to_ix)}")
print("\nSample processed sentence:")
print(f"Original: {data[0]}")
print(f"Tensor: {processed_data[0]}")

[nltk_data] Downloading package brown to /Users/yashbhake/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/yashbhake/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Vocabulary size: 4461
Tag set size: 16

Sample processed sentence:
Original: [('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]
Tensor: (tensor([ 2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28,  3]), tensor([2, 4, 5, 5, 6, 5, 7, 5, 4, 5, 8, 5, 6, 5, 5, 7, 9, 4, 5, 9, 8, 4, 5, 7,
        5, 9, 3]))


In [26]:
class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.hidden_size = hidden_size
        self.W_i = nn.Linear(input_size, hidden_size)
        self.W_f = nn.Linear(input_size, hidden_size)
        self.W_o = nn.Linear(input_size, hidden_size)
        self.W_g = nn.Linear(input_size, hidden_size)
        self.U_i = nn.Linear(hidden_size, hidden_size)
        self.U_f = nn.Linear(hidden_size, hidden_size)
        self.U_o = nn.Linear(hidden_size, hidden_size)
        self.U_g = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, states):
        h_prev, c_prev = states
        f_t = torch.sigmoid(self.W_f(x) + self.U_f(h_prev))
        i_t = torch.sigmoid(self.W_i(x) + self.U_i(h_prev))
        o_t = torch.sigmoid(self.W_o(x) + self.U_o(h_prev))
        g_t = torch.tanh(self.W_g(x) + self.U_g(h_prev))
        c_t = f_t * c_prev + i_t * g_t
        h_t = o_t * torch.tanh(c_t)
        return h_t, c_t

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm_cell = LSTMCell(embedding_dim, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, x):
        embedded = self.embedding(x)
        h = torch.zeros(1, self.hidden_size)
        c = torch.zeros(1, self.hidden_size)
        for i in range(embedded.shape[0]):
            h, c = self.lstm_cell(embedded[i].unsqueeze(0), (h, c))
        return h, c

# THIS IS THE CORRECTED CLASS
class Decoder(nn.Module):
    def __init__(self, tag_size, embedding_dim, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(tag_size, embedding_dim)
        self.lstm_cell = LSTMCell(embedding_dim, hidden_size)
        self.fc = nn.Linear(hidden_size, tag_size)

    def forward(self, x, states):
        # The buggy ".unsqueeze(0)" has been REMOVED here.
        embedded = self.embedding(x)
        h, c = self.lstm_cell(embedded, states)
        output = self.fc(h)
        return output, (h, c)

# THIS IS THE CORRECTED CLASS
class Seq2SeqPOSTagger(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2SeqPOSTagger, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target):
        batch_size = 1
        target_len = len(target)
        target_vocab_size = len(tag_to_ix)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size)
        h, c = self.encoder(source)
        # We ensure the input is consistently shaped with .unsqueeze(0)
        x = target[0].unsqueeze(0)
        for t in range(1, target_len):
            output, (h, c) = self.decoder(x, (h, c))
            outputs[t] = output
            x = target[t].unsqueeze(0)
        return outputs.squeeze(1)

In [None]:
# Hyperparameters
EMBEDDING_DIM = 64
HIDDEN_DIM = 64
EPOCHS = 10
LEARNING_RATE = 0.005

# Model Initialization
encoder = Encoder(len(word_to_ix), EMBEDDING_DIM, HIDDEN_DIM)
decoder = Decoder(len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM)
model = Seq2SeqPOSTagger(encoder, decoder)

# Loss and Optimizer
# We ignore the <PAD> token in our loss calculation
criterion = nn.CrossEntropyLoss(ignore_index=tag_to_ix["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training Loop
for epoch in range(EPOCHS):
    total_loss = 0
    for i, (sentence_in, tags_out) in enumerate(processed_data):
        
        model.train()
        optimizer.zero_grad()
        
        # Get model predictions
        output = model(sentence_in, tags_out)
        
        # Reshape for CrossEntropyLoss:
        # output should be (N, C) where C is number of classes
        # tags_out should be (N)
        # We skip the <SOS> token in our comparison
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        tags_out = tags_out[1:].view(-1)
        
        loss = criterion(output, tags_out)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

        if i % 500 == 0 and i > 0:
            print(f"Epoch {epoch+1}/{EPOCHS}, Step {i}/{len(processed_data)}, Loss: {total_loss / (i+1):.4f}")

    print(f"--- End of Epoch {epoch+1}, Average Loss: {total_loss / len(processed_data):.4f} ---")

Epoch 1/10, Step 500/5000, Loss: 1.7163
Epoch 1/10, Step 1000/5000, Loss: 1.6239
Epoch 1/10, Step 1500/5000, Loss: 1.5945
Epoch 1/10, Step 2000/5000, Loss: 1.5635
Epoch 1/10, Step 2500/5000, Loss: 1.5370
Epoch 1/10, Step 3000/5000, Loss: 1.5241
Epoch 1/10, Step 3500/5000, Loss: 1.5189
Epoch 1/10, Step 4000/5000, Loss: 1.5146
Epoch 1/10, Step 4500/5000, Loss: 1.5159
--- End of Epoch 1, Average Loss: 1.5115 ---
Epoch 2/10, Step 500/5000, Loss: 1.4354
Epoch 2/10, Step 1000/5000, Loss: 1.4120
Epoch 2/10, Step 1500/5000, Loss: 1.4193
Epoch 2/10, Step 2000/5000, Loss: 1.4159
Epoch 2/10, Step 2500/5000, Loss: 1.4075
Epoch 2/10, Step 3000/5000, Loss: 1.4108
Epoch 2/10, Step 3500/5000, Loss: 1.4160
Epoch 2/10, Step 4000/5000, Loss: 1.4217
Epoch 2/10, Step 4500/5000, Loss: 1.4332
--- End of Epoch 2, Average Loss: 1.4350 ---
Epoch 3/10, Step 500/5000, Loss: 1.4078
Epoch 3/10, Step 1000/5000, Loss: 1.3842
Epoch 3/10, Step 1500/5000, Loss: 1.3895
Epoch 3/10, Step 2000/5000, Loss: 1.3829
Epoch 3/10,

In [27]:
# Make sure you have these libraries. If not, run: pip install scikit-learn tqdm
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm # Provides a nice progress bar

# Split the data you loaded and processed in the first script
# We'll use 80% for training and 20% for testing
split_point = int(len(processed_data) * 0.8)
# We assume the model was trained on the first part
train_data = processed_data[:split_point] 
# And we'll test on the second, unseen part
test_data = processed_data[split_point:]

print(f"Total samples: {len(processed_data)}")
print(f"Test samples: {len(test_data)}")

# We need an inverse mapping from index to word for the evaluation function
ix_to_word = {v: k for k, v in word_to_ix.items()}

Total samples: 5000
Test samples: 1000


In [None]:
def evaluate_model(model, test_data, word_to_ix, ix_to_word, ix_to_tag):
    """
    Evaluates the model on the test data and prints performance metrics.
    """
    model.eval()  # Set the model to evaluation mode (disables dropout, etc.)
    
    all_predicted_tags = []
    all_true_tags = []

    # Loop through the test data with a progress bar
    for sentence_indices, tag_indices in tqdm(test_data, desc="Evaluating"):
        
        # 1. Get the true tags, ignoring <SOS> and <EOS> tokens
        true_tags = [ix_to_tag[ix.item()] for ix in tag_indices[1:-1]]
        
        # 2. Convert the input sentence indices back to a string
        words = [ix_to_word.get(ix.item(), "<UNK>") for ix in sentence_indices[1:-1]]
        sentence_str = " ".join(words)
        
        # 3. Get model's prediction for the sentence
        with torch.no_grad():
            predicted_tags = tag_sentence(sentence_str, model, word_to_ix, ix_to_tag)
        
        # 4. Align predicted and true tags in case of length mismatch
        # This can happen if the model predicts <EOS> too early or late
        min_len = min(len(predicted_tags), len(true_tags))
        
        all_predicted_tags.extend(predicted_tags[:min_len])
        all_true_tags.extend(true_tags[:min_len])

    # 5. Calculate and print the results
    accuracy = accuracy_score(all_true_tags, all_predicted_tags)
    print(f"\n✅ Overall Model Accuracy: {accuracy * 100:.2f}%\n")
    
    # Generate a detailed report
    # We get the list of all possible tags to include them in the report
    labels = sorted(list(set(all_true_tags)))
    report = classification_report(all_true_tags, all_predicted_tags, labels=labels)
    print("Classification Report:")
    print(report)

corrected_model.load_state_dict(model.state_dict())
evaluate_model(corrected_model, test_data, word_to_ix, ix_to_word, ix_to_tag)


Evaluating: 100%|██████████| 1000/1000 [00:04<00:00, 223.16it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



✅ Overall Model Accuracy: 28.78%

Classification Report:
              precision    recall  f1-score   support

           .       0.59      0.48      0.53      2617
         ADJ       0.31      0.02      0.03      1660
         ADP       0.17      0.29      0.21      2691
         ADV       0.24      0.02      0.04       925
        CONJ       0.29      0.08      0.12       630
         DET       0.19      0.37      0.25      2490
        NOUN       0.34      0.41      0.37      5709
         NUM       0.00      0.00      0.00       348
        PRON       0.76      0.17      0.28       620
         PRT       0.50      0.00      0.00       530
        VERB       0.26      0.20      0.23      3271
           X       0.00      0.00      0.00        31

    accuracy                           0.29     21522
   macro avg       0.30      0.17      0.17     21522
weighted avg       0.32      0.29      0.27     21522



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
# Re-create the components (assuming EMBEDDING_DIM and HIDDEN_DIM are defined)
new_encoder = Encoder(len(word_to_ix), EMBEDDING_DIM, HIDDEN_DIM)
new_decoder = Decoder(len(tag_to_ix), EMBEDDING_DIM, HIDDEN_DIM)
corrected_model = Seq2SeqPOSTagger(new_encoder, new_decoder)

# --- Step 3: Copy the trained weights from the old model to the new one ---
# This transfers all the knowledge your model learned during training.

# 'model' is the original trained model object from your training script
corrected_model.load_state_dict(model.state_dict())

print("✅ Trained weights successfully transferred to the corrected model structure.")

# --- Step 4: Run evaluation using the CORRECTED model ---
# Now, call your evaluation function, but pass in 'corrected_model'

print("\nStarting evaluation with the corrected model...")
evaluate_model(corrected_model, test_data, word_to_ix, ix_to_word, ix_to_tag)

✅ Trained weights successfully transferred to the corrected model structure.

Starting evaluation with the corrected model...


Evaluating: 100%|██████████| 1000/1000 [00:04<00:00, 222.67it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



✅ Overall Model Accuracy: 28.78%

Classification Report:
              precision    recall  f1-score   support

           .       0.59      0.48      0.53      2617
         ADJ       0.31      0.02      0.03      1660
         ADP       0.17      0.29      0.21      2691
         ADV       0.24      0.02      0.04       925
        CONJ       0.29      0.08      0.12       630
         DET       0.19      0.37      0.25      2490
        NOUN       0.34      0.41      0.37      5709
         NUM       0.00      0.00      0.00       348
        PRON       0.76      0.17      0.28       620
         PRT       0.50      0.00      0.00       530
        VERB       0.26      0.20      0.23      3271
           X       0.00      0.00      0.00        31

    accuracy                           0.29     21522
   macro avg       0.30      0.17      0.17     21522
weighted avg       0.32      0.29      0.27     21522



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [22]:
import torch
import json

# --- IMPORTANT ---
# This script assumes you have the following variables in your environment
# after running the training code:
# - model: The trained PyTorch model object.
# - word_to_ix: The word-to-index dictionary.
# - tag_to_ix: The tag-to-index dictionary.

print("Saving model state dictionary...")
# We save the model's state_dict, which is just the learned weights.
torch.save(model.state_dict(), 'pos_tagger_model.pth')
print("✅ Model state saved to pos_tagger_model.pth")

print("\nSaving word_to_ix dictionary...")
with open('word_to_ix.json', 'w') as f:
    json.dump(word_to_ix, f)
print("✅ Word vocabulary saved to word_to_ix.json")


print("\nSaving tag_to_ix dictionary...")
with open('tag_to_ix.json', 'w') as f:
    json.dump(tag_to_ix, f)
print("✅ Tag vocabulary saved to tag_to_ix.json")

print("\nAll artifacts have been saved successfully!")


Saving model state dictionary...
✅ Model state saved to pos_tagger_model.pth

Saving word_to_ix dictionary...
✅ Word vocabulary saved to word_to_ix.json

Saving tag_to_ix dictionary...
✅ Tag vocabulary saved to tag_to_ix.json

All artifacts have been saved successfully!
