In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

Start with BiLSTM + softmax: Learns per-token labels with context

Then try BiLSTM + CRF: Adds label dependency structure

Then try BERT: Learns contextual embeddings from huge corpora

Then BERT + CRF: Combines power of attention + structured decoding

In [20]:
import os
from pathlib import Path
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

In [23]:
import pandas as pd
import chardet

In [30]:
data_file = os.path.join(PROJECT_ROOT, 'local_only', 'ner_dataset.csv')
with open(data_file, 'rb') as f:
    result = chardet.detect(f.read())
    print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [31]:
df = pd.read_csv(data_file, encoding='Windows-1252')

In [32]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [33]:
df['Sentence #'] = df['Sentence #'].ffill()

In [34]:
df.head(4)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O


In [45]:
data = []
for index, sentence_list in df.groupby('Sentence #'):
    record = [(word, tag) for word,tag in sentence_list[['Word','Tag']].itertuples(index=False)]
    data.append(record)

In [43]:
record

[('Thousands', 'O'),
 ('of', 'O'),
 ('demonstrators', 'O'),
 ('have', 'O'),
 ('marched', 'O'),
 ('through', 'O'),
 ('London', 'B-geo'),
 ('to', 'O'),
 ('protest', 'O'),
 ('the', 'O'),
 ('war', 'O'),
 ('in', 'O'),
 ('Iraq', 'B-geo'),
 ('and', 'O'),
 ('demand', 'O'),
 ('the', 'O'),
 ('withdrawal', 'O'),
 ('of', 'O'),
 ('British', 'B-gpe'),
 ('troops', 'O'),
 ('from', 'O'),
 ('that', 'O'),
 ('country', 'O'),
 ('.', 'O')]

In [46]:
data[2]

[('Helicopter', 'O'),
 ('gunships', 'O'),
 ('Saturday', 'B-tim'),
 ('pounded', 'O'),
 ('militant', 'O'),
 ('hideouts', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Orakzai', 'B-geo'),
 ('tribal', 'O'),
 ('region', 'O'),
 (',', 'O'),
 ('where', 'O'),
 ('many', 'O'),
 ('Taliban', 'B-org'),
 ('militants', 'O'),
 ('are', 'O'),
 ('believed', 'O'),
 ('to', 'O'),
 ('have', 'O'),
 ('fled', 'O'),
 ('to', 'O'),
 ('avoid', 'O'),
 ('an', 'O'),
 ('earlier', 'O'),
 ('military', 'O'),
 ('offensive', 'O'),
 ('in', 'O'),
 ('nearby', 'O'),
 ('South', 'B-geo'),
 ('Waziristan', 'I-geo'),
 ('.', 'O')]

In [48]:
len(data)

47959

# ---------------------
# 1. Sample Dataset
# ---------------------
data = [
    [("Barack", "B-PER"), ("Obama", "I-PER"), ("visited", "O"), ("Paris", "B-LOC")],
    [("Google", "B-ORG"), ("is", "O"), ("in", "O"), ("California", "B-LOC")]
]


In [58]:
# Build vocab and label mappings
word2idx = {"<PAD>": 0, "<UNK>": 1}
#tag2idx = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4, "B-ORG": 5, "I-ORG": 6}
tag2idx = {tag: index for index, tag in enumerate(df['Tag'].unique())}
idx2tag = {v: k for k, v in tag2idx.items()}

for sentence in data:
    for word, tag in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

In [65]:
tag2idx

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [81]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [102]:
len(word2idx)

35180

In [103]:
# ---------------------
# 2. Dataset Class
# ---------------------
class NERDataset(Dataset):
    def __init__(self, data, word2idx, tag2idx, max_len=10):
        self.data = data
        self.word2idx = word2idx
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]
        words = [word2idx.get(w, word2idx["<UNK>"]) for w, _ in sentence]
        tags = [tag2idx[t] for _, t in sentence]
        length = len(words)

        # Padding
        words += [0] * (self.max_len - length)
        tags += [0] * (self.max_len - length)

        return torch.tensor(words), torch.tensor(tags), length

# ---------------------
# 3. Model Definition
# ---------------------
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim=64, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x, lengths):
        embeds = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        logits = self.fc(unpacked)
        return logits

In [113]:
def collate_fn_old(batch):
    inputs, labels, lengths = zip(*batch)
    lengths = [int(l) for l in lengths]  # Ensure all are plain ints
    max_len = max(lengths)

    padded_inputs = [torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)]) for x in inputs]
    padded_labels = [torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)]) for y in labels]

    return torch.stack(padded_inputs), torch.stack(padded_labels), torch.tensor(lengths)


In [114]:
def collate_fn(batch):
    inputs, labels, _ = zip(*batch)  # discard passed lengths
    lengths = [len(x) for x in inputs]  # recompute from inputs
    max_len = max(lengths)

    padded_inputs = [
        torch.cat([x, torch.zeros(max_len - len(x), dtype=torch.long)])
        for x in inputs
    ]
    padded_labels = [
        torch.cat([y, torch.zeros(max_len - len(y), dtype=torch.long)])
        for y in labels
    ]

    return torch.stack(padded_inputs), torch.stack(padded_labels), torch.tensor(lengths)


In [115]:
# ---------------------
# 4. Training
# ---------------------


In [116]:
dataset = NERDataset(data, word2idx, tag2idx)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

In [117]:
len(dataset)

47959

In [118]:
len(data)

47959

In [119]:
model = BiLSTM_NER(len(word2idx), len(tag2idx)).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)  # ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [120]:
model

BiLSTM_NER(
  (embedding): Embedding(35180, 64)
  (lstm): LSTM(64, 32, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=64, out_features=17, bias=True)
)

In [121]:
%%time
for epoch in range(20):
    print('epoch ->', epoch)
    model.train()
    for batch in loader:        
        inputs, labels, lengths = batch
        inputs = inputs.to(device)
        labels = labels.to(device)
        #lengths = lengths.to(device)
        
        logits = model(inputs, lengths)
        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")

epoch -> 0
Epoch 1 Loss: 0.0769
epoch -> 1
Epoch 2 Loss: 0.0549
epoch -> 2
Epoch 3 Loss: 0.1799
epoch -> 3
Epoch 4 Loss: 0.0291
epoch -> 4
Epoch 5 Loss: 0.5463
epoch -> 5
Epoch 6 Loss: 3.9230
epoch -> 6
Epoch 7 Loss: 1.4852
epoch -> 7
Epoch 8 Loss: nan
epoch -> 8
Epoch 9 Loss: 0.0009
epoch -> 9
Epoch 10 Loss: 0.0934
epoch -> 10
Epoch 11 Loss: 0.0126
epoch -> 11
Epoch 12 Loss: nan
epoch -> 12
Epoch 13 Loss: nan
epoch -> 13
Epoch 14 Loss: 0.3253
epoch -> 14
Epoch 15 Loss: 0.4818
epoch -> 15
Epoch 16 Loss: 0.0690
epoch -> 16
Epoch 17 Loss: 0.0027
epoch -> 17
Epoch 18 Loss: nan
epoch -> 18
Epoch 19 Loss: 0.7862
epoch -> 19
Epoch 20 Loss: nan
CPU times: user 26min 47s, sys: 5min 8s, total: 31min 56s
Wall time: 31min 55s


In [123]:
# ---------------------
# 5. Prediction Example
# ---------------------
model.eval()
with torch.no_grad():
    test_input, _, test_len = dataset[0]
    
    logits = model(test_input.unsqueeze(0), [test_len])
    pred = torch.argmax(logits, dim=-1).squeeze().tolist()[:test_len]
    words = [w for (w, t) in data[0]]
    print("\nPredictions:")
    for w, t_idx in zip(words, pred):
        print(f"{w:10} -> {idx2tag[t_idx]}")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [122]:
15 // 2

7