In [2]:
# Colab-only step: call the notebook's file-upload helper and keep whatever it
# returns in `uploaded`.
# NOTE(review): the recorded output of this cell shows the three files being
# saved with a " (1)" suffix, while the loading cell opens the unsuffixed
# names — confirm the unsuffixed copies on disk are the intended data.
from google.colab import files
uploaded = files.upload()


Saving en_ewt-ud-dev.jsonl to en_ewt-ud-dev (1).jsonl
Saving en_ewt-ud-test.jsonl to en_ewt-ud-test (1).jsonl
Saving en_ewt-ud-train.jsonl to en_ewt-ud-train (1).jsonl


In [3]:
# Illustrative single-sentence record: two parallel lists, one of tokens and
# one of part-of-speech labels.
# NOTE(review): the keys shown here ("tokens"/"upos") are not the
# "words"/"tags" keys that load_jsonl indexes in a later cell — confirm which
# schema the uploaded files actually use.
{
 "tokens": ["I","love","NLP"],
 "upos":  ["PRON","VERB","PROPN"]
}


{'tokens': ['I', 'love', 'NLP'], 'upos': ['PRON', 'VERB', 'PROPN']}

In [7]:
import json

def load_jsonl(path):
    """Read a JSON-lines file of tagged sentences.

    Every non-blank line must be a JSON object holding two parallel lists:
    the tokens under ``"words"`` and their tags under ``"tags"``. Records
    that use the ``"tokens"`` / ``"upos"`` key names instead are accepted
    as well.

    Args:
        path: Path of the ``.jsonl`` file; it is opened as UTF-8 text.

    Returns:
        A list with one entry per record, each entry being a list of
        ``(word, tag)`` tuples in sentence order.

    Raises:
        KeyError: If a record provides neither key pair.
        ValueError: If a record's token and tag lists differ in length
            (``json.JSONDecodeError`` for malformed JSON is a subclass).
    """
    sentences = []
    with open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line.strip():
                continue

            obj = json.loads(line)

            # Primary schema first, alternate key names as a fallback.
            words = obj["words"] if "words" in obj else obj["tokens"]
            tags = obj["tags"] if "tags" in obj else obj["upos"]

            # zip() would silently drop the surplus items of the longer list,
            # misaligning the training data without any warning.
            if len(words) != len(tags):
                raise ValueError(
                    f"{path}, line {line_no}: {len(words)} words but {len(tags)} tags"
                )

            # Pair them up as [(word, tag), ...]
            sentences.append(list(zip(words, tags)))

    return sentences

# Parse the three corpus splits in train / dev / test order.
train_sentences, dev_sentences, test_sentences = map(
    load_jsonl,
    ("en_ewt-ud-train.jsonl", "en_ewt-ud-dev.jsonl", "en_ewt-ud-test.jsonl"),
)

# Sentence count per split, then a peek at the first ten (word, tag) pairs
# of the first training sentence.
for label, split in (
    ("Train:", train_sentences),
    ("Dev:", dev_sentences),
    ("Test:", test_sentences),
):
    print(label, len(split))
print(train_sentences[0][:10])


Train: 12544
Dev: 2001
Test: 2077
[('Al', 'PROPN'), ('-', 'PUNCT'), ('Zaman', 'PROPN'), (':', 'PUNCT'), ('American', 'ADJ'), ('forces', 'NOUN'), ('killed', 'VERB'), ('Shaikh', 'PROPN'), ('Abdullah', 'PROPN'), ('al', 'PROPN')]


In [8]:
# Reserved ids: <UNK> and <PAD> for words, <PAD> for tags.
word_to_ix = {"<UNK>": 0, "<PAD>": 1}
tag_to_ix = {"<PAD>": 0}

# Give every previously unseen word / tag the next free id, in order of first
# appearance in the training split. Because every training word receives its
# own id here, the <UNK> id is never produced for training sentences.
for sentence in train_sentences:
    for word, tag in sentence:
        word_to_ix.setdefault(word, len(word_to_ix))
        tag_to_ix.setdefault(tag, len(tag_to_ix))

# Reverse lookup used to turn predicted ids back into tag strings.
ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}

print("Word vocab size:", len(word_to_ix))
print("Tag vocab size:", len(tag_to_ix))


Word vocab size: 19675
Tag vocab size: 18


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class POSDataset(Dataset):
    """Map-style dataset that turns tagged sentences into index tensors.

    Args:
        sentences: Sequence of sentences, each a list of ``(word, tag)`` pairs.
        word_to_ix: Mapping from word string to integer id. Words missing
            from it are mapped to the id stored under ``"<UNK>"`` (or 0 if
            the mapping has no such entry).
        tag_to_ix: Mapping from tag string to integer id.
    """

    def __init__(self, sentences, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        # Look the unknown-word id up in the vocabulary instead of assuming it
        # is 0; 0 remains the fallback for vocabularies without "<UNK>".
        self.unk_ix = word_to_ix.get("<UNK>", 0)

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` for sentence ``idx`` as 1-D long tensors.

        Raises:
            KeyError: If the sentence contains a tag absent from ``tag_to_ix``.
        """
        sent = self.sentences[idx]
        words = [self.word_to_ix.get(w, self.unk_ix) for w, _ in sent]
        tags = [self.tag_to_ix[t] for _, t in sent]
        # An explicit dtype keeps empty sentences integer-typed: torch.tensor([])
        # would otherwise default to float32, which is not a valid index type.
        return (
            torch.tensor(words, dtype=torch.long),
            torch.tensor(tags, dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a list of ``(word_ids, tag_ids)`` pairs into two batch-first tensors.

    Shorter sequences are right-padded with the ``"<PAD>"`` id taken from the
    module-level ``word_to_ix`` / ``tag_to_ix`` vocabularies.
    """
    word_seqs, tag_seqs = zip(*batch)
    pad_word = word_to_ix["<PAD>"]
    pad_tag = tag_to_ix["<PAD>"]
    return (
        pad_sequence(word_seqs, batch_first=True, padding_value=pad_word),
        pad_sequence(tag_seqs, batch_first=True, padding_value=pad_tag),
    )

# Wrap each split in a dataset that shares the training vocabularies.
train_dataset = POSDataset(train_sentences, word_to_ix, tag_to_ix)
dev_dataset = POSDataset(dev_sentences, word_to_ix, tag_to_ix)
test_dataset = POSDataset(test_sentences, word_to_ix, tag_to_ix)

# Only the training batches are reshuffled; dev and test are not.
train_loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn
)
dev_loader = DataLoader(
    dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)


In [10]:
import torch.nn as nn

class SimpleRNNForTokenClassification(nn.Module):
    """Embedding -> single-layer RNN -> linear layer, scoring every token.

    Args:
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output classes per token.
        emb_dim: Embedding width.
        hidden_dim: RNN hidden-state width.
        padding_idx: Word id treated as padding by the embedding layer. When
            omitted, the ``"<PAD>"`` id of the module-level ``word_to_ix``
            vocabulary is used, so existing call sites behave as before.
    """

    def __init__(self, vocab_size, tagset_size, emb_dim=128, hidden_dim=256,
                 padding_idx=None):
        super().__init__()
        if padding_idx is None:
            # Backward-compatible default: fall back to the notebook vocabulary.
            padding_idx = word_to_ix["<PAD>"]
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        """Return per-token tag scores for a batch-first tensor of word ids."""
        emb = self.embedding(x)
        # Only the per-step outputs are needed; the final hidden state is dropped.
        out, _ = self.rnn(emb)
        logits = self.fc(out)
        return logits


In [11]:
# Use the GPU when one is visible, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Size the model from the vocabularies, then move it to the chosen device.
model = SimpleRNNForTokenClassification(
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
model = model.to(device)

# Padded tag positions are excluded from the loss.
pad_tag_ix = tag_to_ix["<PAD>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_tag_ix)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [12]:
def evaluate(model, loader):
    """Compute token-level accuracy of ``model`` over ``loader``.

    Positions whose gold tag equals the ``"<PAD>"`` id of the module-level
    ``tag_to_ix`` are excluded. Batches are moved to the module-level
    ``device`` before the forward pass.

    Args:
        model: Module mapping a batch of word ids to per-token tag scores.
        loader: Iterable yielding ``(words, tags)`` tensor pairs.

    Returns:
        Fraction of non-padding tokens whose arg-max prediction matches the
        gold tag, or ``0.0`` when the loader yields no such tokens.

    Note:
        The model is switched to eval mode and left that way; callers that
        continue training must call ``model.train()`` again.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for words, tags in loader:
            words, tags = words.to(device), tags.to(device)
            logits = model(words)
            preds = logits.argmax(-1)

            # Score only real tokens, never padding positions.
            mask = tags != tag_to_ix["<PAD>"]
            correct += (preds[mask] == tags[mask]).sum().item()
            total += mask.sum().item()

    # An empty loader (or one holding only padding) has nothing to score;
    # report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0
    return correct / total


In [13]:
EPOCHS = 5

for epoch in range(EPOCHS):
    # evaluate() at the end of each epoch leaves the model in eval mode,
    # so training mode is re-enabled at the top of every epoch.
    model.train()
    total_loss = 0

    for batch_words, batch_tags in train_loader:
        batch_words = batch_words.to(device)
        batch_tags = batch_tags.to(device)

        optimizer.zero_grad()
        scores = model(batch_words)

        # Flatten the batch and time axes so each token is one loss sample.
        num_classes = scores.shape[-1]
        loss = criterion(scores.reshape(-1, num_classes), batch_tags.reshape(-1))
        loss.backward()
        optimizer.step()

        # Running sum of per-batch losses (not averaged over batches).
        total_loss += loss.item()

    train_acc = evaluate(model, train_loader)
    dev_acc = evaluate(model, dev_loader)

    print(f"Epoch {epoch+1} | Loss={total_loss:.3f} | Train={train_acc:.4f} | Dev={dev_acc:.4f}")


Epoch 1 | Loss=385.760 | Train=0.7886 | Dev=0.7572
Epoch 2 | Loss=224.991 | Train=0.8519 | Dev=0.8060
Epoch 3 | Loss=166.991 | Train=0.8896 | Dev=0.8281
Epoch 4 | Loss=129.099 | Train=0.9172 | Dev=0.8376
Epoch 5 | Loss=102.488 | Train=0.9338 | Dev=0.8505


In [17]:
def predict(sentence):
    """Tag a raw sentence with the trained module-level ``model``.

    The sentence is tokenised by whitespace only, so punctuation stays
    attached to the neighbouring word. Tokens absent from ``word_to_ix`` are
    looked up as ``"<UNK>"``.

    Args:
        sentence: Text to tag.

    Returns:
        A list of ``(token, tag)`` pairs, one per whitespace-separated token;
        an empty list when the sentence contains no tokens.
    """
    tokens = sentence.split()
    # Nothing to tag: an empty index list would also become a float tensor,
    # which the embedding layer cannot use as indices.
    if not tokens:
        return []

    unk_ix = word_to_ix["<UNK>"]
    idxs = [word_to_ix.get(w, unk_ix) for w in tokens]
    # Batch of one sentence, shape (1, number of tokens).
    x = torch.tensor([idxs], dtype=torch.long, device=device)

    # Inference must not depend on whatever mode the model was last left in.
    model.eval()
    with torch.no_grad():
        preds = model(x).argmax(-1)[0]

    return list(zip(tokens, [ix_to_tag[p] for p in preds.tolist()]))

# Demo call; the returned (token, tag) list is this cell's displayed output.
# predict() splits on whitespace only, so the final token is "assignment."
# with the period still attached.
predict("I will get 10 points for this assignment.")


[('I', 'PRON'),
 ('will', 'AUX'),
 ('get', 'VERB'),
 ('10', 'NUM'),
 ('points', 'NOUN'),
 ('for', 'ADP'),
 ('this', 'DET'),
 ('assignment.', 'NOUN')]