In [1]:
# === Upload and unpack the dataset archive (Colab only) ===
import os
import zipfile

from google.colab import files

# Opens Colab's upload widget; a zip file such as 'data.zip' is expected.
uploaded = files.upload()

# The first item yielded by iterating `uploaded` is taken as the uploaded filename.
zip_path = next(iter(uploaded))

# Extract into the current working directory; this creates 'data/pos'
# only if the archive was zipped with that folder layout.
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall()

# Show what was extracted so the expected structure can be confirmed by eye.
os.listdir("data/pos")

Saving data.zip to data.zip


['train', 'test', 'dev']

In [2]:
# POS Tagger - Jupyter Notebook Version for Colab

# ============================
# 📦 Imports and Setup
# ============================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device


device(type='cuda')

In [3]:

# ============================
# 📚 PosReader Class
# ============================
class PosReader:
    """Reads a word/tag file and turns it into 5-word context windows.

    The training file holds one token per line, split on whitespace into a
    word and a tag; sentences are separated by blank lines. Each sentence is
    padded with two ``(pad_token, "PAD")`` entries on both sides so that every
    real token has two neighbours to its left and right.

    Attributes:
        pad_token: Word used for the padding positions.
        unk_token: Word that stands in for anything outside the vocabulary.
        train_file_path: Path the vocabularies were built from.
        sentences: Padded training sentences (lists of token tuples).
        vocab: Sorted list of training words plus the pad/unk tokens.
        word_to_index: Maps each entry of ``vocab`` to its position.
        tag_vocab: Sorted list of tags seen in training (includes "PAD").
        tag_to_index: Maps each entry of ``tag_vocab`` to its position.
    """

    def __init__(self, train_file_path, pad_token="<PAD>", unk_token="<UNK>"):
        """Load the training file and build the word and tag vocabularies.

        Args:
            train_file_path: Path to the training file.
            pad_token: Word inserted at the padding positions.
            unk_token: Word used for out-of-vocabulary lookups.
        """
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.train_file_path = train_file_path

        self.sentences = self._read_and_pad_sentences(train_file_path)
        self.vocab = self._build_vocab(self.sentences)
        self.word_to_index = {word: i for i, word in enumerate(self.vocab)}
        self.tag_vocab = self._build_tag_vocab(self.sentences)
        self.tag_to_index = {tag: i for i, tag in enumerate(self.tag_vocab)}

    def _read_and_pad_sentences(self, file_path):
        """Parse ``file_path`` into a list of padded sentences.

        A sentence ends at any line that is empty after stripping whitespace,
        so a separator line that contains only spaces or tabs still splits
        sentences (splitting on the literal "\\n\\n" would merge them and let
        context windows run across the sentence boundary).

        Args:
            file_path: Path of the file to read (UTF-8).

        Returns:
            A list of sentences. Each sentence is a list of token tuples
            (the whitespace-split fields of one line) with two
            ``(pad_token, "PAD")`` tuples before and after the real tokens.
            Runs of blank lines never produce token-less sentences.
        """
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()

        padding = [(self.pad_token, "PAD")] * 2
        sentences = []
        tokens = []
        # The trailing "" acts as a sentinel that flushes the last sentence
        # when the file does not end with a blank line.
        for line in lines + [""]:
            if line.strip():
                tokens.append(tuple(line.split()))
            elif tokens:
                sentences.append(padding + tokens + padding)
                tokens = []

        return sentences

    def _build_vocab(self, sentences):
        """Return the sorted word vocabulary, always including pad and unk."""
        unique_words = {word for sentence in sentences for word, _ in sentence}
        unique_words.update([self.pad_token, self.unk_token])
        return sorted(unique_words)

    def _build_tag_vocab(self, sentences):
        """Return the sorted set of tags, including the "PAD" padding tag."""
        return sorted({tag for sentence in sentences for _, tag in sentence})

    def generate_windows(self, source="train"):
        """Yield one 5-word window per real token.

        Args:
            source: "train" to reuse the sentences loaded at construction
                time, or a path to another file in the same format
                (e.g. the dev set).

        Yields:
            ``(word_indices, tag_index)`` where ``word_indices`` is a list of
            five vocabulary indices (two left neighbours, the center word, two
            right neighbours) and ``tag_index`` is the index of the center
            word's tag. Words outside the vocabulary map to the unk index.

        Raises:
            KeyError: If a center token carries a tag that was not seen in
                the training file.

        Note:
            Every training word is in the vocabulary, so windows generated
            from "train" never contain the unk index.
        """
        if source == "train":
            sentences = self.sentences
        else:
            sentences = self._read_and_pad_sentences(source)

        unk_idx = self.word_to_index[self.unk_token]

        for sentence in sentences:
            # Look every word up once per sentence instead of once per window.
            indices = [self.word_to_index.get(token[0], unk_idx) for token in sentence]

            # Positions 0-1 and the last two are padding, so centers run 2..len-3.
            for i in range(2, len(sentence) - 2):
                # A real token that is literally the pad word is not a valid center.
                if sentence[i][0] == self.pad_token:
                    continue

                tag_index = self.tag_to_index[sentence[i][1]]
                yield indices[i - 2:i + 3], tag_index


In [4]:

# ============================
# 🧩 Data Loader Utility
# ============================
def load_dataset_from_reader(reader, source):
    """Collect every window produced by ``reader`` into a pair of tensors.

    Args:
        reader: Object exposing ``generate_windows(source)``, which yields
            ``(word_indices, tag_index)`` pairs with equal-length index lists.
        source: Passed straight through to ``reader.generate_windows``.

    Returns:
        ``(X, y)`` where ``X`` is a ``torch.long`` tensor with one row per
        window and ``y`` is a ``torch.long`` tensor of the matching tag indices.

    Raises:
        ValueError: If the reader yields no windows for ``source``.
    """
    windows = []
    labels = []
    for window, tag_index in reader.generate_windows(source):
        windows.append(window)
        labels.append(tag_index)

    # Fail with a clear message instead of an opaque tensor-construction error.
    if not windows:
        raise ValueError(f"No windows were generated from source {source!r}")

    # Build the matrix in one call rather than allocating a tensor per window.
    X = torch.tensor(windows, dtype=torch.long)
    y = torch.tensor(labels, dtype=torch.long)
    return X, y


In [5]:
# ============================
# 🧠 MLP POS Tagger
# ============================
class MLPPosTagger(nn.Module):
    """Feed-forward tagger that scores the tags for one window of word indices.

    Each word index in the window is embedded, the embeddings are concatenated
    into one vector, and a single-hidden-layer MLP (Linear -> Tanh -> Dropout ->
    Linear) maps that vector to one logit per tag.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output classes (tags).
        pad_idx: Index of the padding word; passed to ``nn.Embedding`` as
            ``padding_idx``.
        dropout_prob: Dropout probability applied after the hidden activation.
        window_size: Number of word indices per input row. Defaults to 5,
            which matches the previously hard-coded window width.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx,
                 dropout_prob=0.2, window_size=5):
        super().__init__()
        self.window_size = window_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.mlp = nn.Sequential(
            # The MLP input is the concatenation of all window embeddings.
            nn.Linear(embedding_dim * window_size, hidden_dim),
            nn.Tanh(),
            nn.Dropout(dropout_prob),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        """Return tag logits for a batch of windows.

        Args:
            x: Integer tensor of word indices, one window per row.

        Returns:
            Tensor of logits with shape ``(batch_size, output_dim)``.
        """
        embedded = self.embedding(x)         # (batch_size, window_size, embedding_dim)
        flat = embedded.view(x.size(0), -1)  # (batch_size, window_size * embedding_dim)
        return self.mlp(flat)                # logits: (batch_size, output_dim)

In [6]:
# ============================
# 🚀 Training Script
# ============================
# Seed PyTorch's global random number generator before anything random happens.
torch.manual_seed(42)

# The reader builds its word and tag vocabularies from the training file.
reader = PosReader("data/pos/train")

# Turn both splits into (windows, labels) tensor pairs; the dev split is read
# from its own file but indexed with the training vocabularies.
X_train, y_train = load_dataset_from_reader(reader, "train")
X_val, y_val = load_dataset_from_reader(reader, "data/pos/dev")

# Batch the tensors; only the training loader shuffles.
batch_size = 16384
train_loader = DataLoader(
    TensorDataset(X_train, y_train),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    TensorDataset(X_val, y_val),
    batch_size=batch_size,
)

# Model dimensions that are derived from the reader.
vocab_size = len(reader.vocab)
pad_idx = reader.word_to_index[reader.pad_token]
output_dim = len(reader.tag_vocab)

# Build the tagger and move its parameters to the selected device.
model = MLPPosTagger(
    vocab_size=vocab_size,
    embedding_dim=50,
    hidden_dim=200,
    output_dim=output_dim,
    pad_idx=pad_idx,
)
model = model.to(device)

In [7]:
# Training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0055, weight_decay=3e-4)
epochs = 20


def _run_epoch(loader, training):
    """Run one full pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``training`` is true the model is put in train mode and, for every
    batch, gradients are computed, clipped to a total norm of 1.0 and applied
    by the optimizer. Otherwise the model is put in eval mode and the pass
    runs with gradient tracking disabled. The loss is averaged per sample
    (each batch loss is weighted by its batch size).
    """
    model.train(training)
    loss_sum, n_correct, n_seen = 0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            scores = model(inputs)
            batch_loss = criterion(scores, targets)
            if training:
                batch_loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            n_in_batch = inputs.size(0)
            loss_sum += batch_loss.item() * n_in_batch
            n_correct += (scores.argmax(dim=1) == targets).sum().item()
            n_seen += n_in_batch
    return loss_sum / n_seen, n_correct / n_seen


for epoch in range(epochs):
    # One optimisation pass over the training data, then one evaluation pass
    # over the validation data with the updated weights.
    train_loss, train_acc = _run_epoch(train_loader, training=True)
    val_loss, val_acc = _run_epoch(val_loader, training=False)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}")

print("Training complete.")

Epoch 1/20 | Train Loss: 1.5803 Acc: 0.5730 | Val Loss: 0.9993 Acc: 0.7020
Epoch 2/20 | Train Loss: 0.9180 Acc: 0.7183 | Val Loss: 0.7772 Acc: 0.7612
Epoch 3/20 | Train Loss: 0.7547 Acc: 0.7659 | Val Loss: 0.6506 Acc: 0.8005
Epoch 4/20 | Train Loss: 0.6343 Acc: 0.8060 | Val Loss: 0.5511 Acc: 0.8332
Epoch 5/20 | Train Loss: 0.5295 Acc: 0.8417 | Val Loss: 0.4639 Acc: 0.8630
Epoch 6/20 | Train Loss: 0.4386 Acc: 0.8719 | Val Loss: 0.3943 Acc: 0.8836
Epoch 7/20 | Train Loss: 0.3645 Acc: 0.8946 | Val Loss: 0.3309 Acc: 0.9021
Epoch 8/20 | Train Loss: 0.3047 Acc: 0.9132 | Val Loss: 0.2833 Acc: 0.9165
Epoch 9/20 | Train Loss: 0.2589 Acc: 0.9271 | Val Loss: 0.2461 Acc: 0.9275
Epoch 10/20 | Train Loss: 0.2244 Acc: 0.9375 | Val Loss: 0.2192 Acc: 0.9363
Epoch 11/20 | Train Loss: 0.1986 Acc: 0.9451 | Val Loss: 0.2015 Acc: 0.9423
Epoch 12/20 | Train Loss: 0.1807 Acc: 0.9504 | Val Loss: 0.1893 Acc: 0.9461
Epoch 13/20 | Train Loss: 0.1676 Acc: 0.9543 | Val Loss: 0.1862 Acc: 0.9464
Epoch 14/20 | Train L