In [133]:
from typing import Dict, List, Optional
from collections import Counter
import os
import csv
!pip install torchmetrics
!pip install pytorch-metric-learning
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
!pip install pytorch-lightning
import torch.optim as optim
import torchmetrics
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [165]:
class Tokenizer:
    """Whitespace tokenizer over a lower-cased, frequency-capped vocabulary.

    Ids 0 and 1 are reserved for the ``<pad>`` and ``<unk>`` special tokens.
    All other ids are assigned by :meth:`fit`, most frequent token first.
    """

    def __init__(self):
        # two special tokens for padding and unknown
        self.token2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2token = ["<pad>", "<unk>"]
        self.is_fit = False

    @property
    def pad_id(self):
        """Id of the padding token."""
        return self.token2idx["<pad>"]

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.idx2token)

    def fit(self, train_texts: List[str], vocab_size: int = 20000):
        """Build the vocabulary from the training texts.

        Args:
            train_texts: raw training sentences; each is lower-cased and split on whitespace.
            vocab_size: maximum vocabulary size, counting the two special tokens.
        """
        counter = Counter()
        for text in train_texts:
            counter.update(text.lower().split())

        specials = self.idx2token[:2]
        # A literal "<pad>"/"<unk>" in the data must not be added a second time,
        # otherwise its later index would overwrite the reserved id in token2idx.
        for special in specials:
            counter.pop(special, None)

        # Rebuild from the special tokens only, so calling fit() again does not
        # append duplicate entries to the vocabulary.
        budget = max(vocab_size - len(specials), 0)
        self.idx2token = specials + [token for token, _ in counter.most_common(budget)]
        self.token2idx = {token: i for i, token in enumerate(self.idx2token)}

        self.is_fit = True

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Convert a sentence to token ids.

        Args:
            text: raw sentence; lower-cased and split on whitespace.
            max_length: when given, the result is truncated or right-padded with
                the padding id to exactly this length. When ``None`` the ids are
                returned at their natural length.

        Returns:
            The list of token ids; out-of-vocabulary words map to ``<unk>``.

        Raises:
            Exception: if the tokenizer has not been fit yet.
            ValueError: if ``max_length`` is negative.
        """
        if not self.is_fit:
            raise Exception("Please fit the tokenizer on the training tokens")
        if max_length is not None and max_length < 0:
            raise ValueError("max_length must be non-negative")

        unk_id = self.token2idx["<unk>"]
        token_ids = [self.token2idx.get(word, unk_id) for word in text.lower().split()]

        if max_length is None:
            return token_ids

        # Truncate first, then pad: a sentence of exactly max_length words gets
        # zero padding instead of falling through every branch.
        token_ids = token_ids[:max_length]
        return token_ids + [self.pad_id] * (max_length - len(token_ids))

In [166]:
def load_raw_data(filepath: str, with_tags: bool = True):
    """Read one dataset split from disk.

    Args:
        filepath: path of the file to read.
        with_tags: when True the file is parsed as a two-column CSV of
            ``text, tags`` rows; when False it is read as plain text with one
            sentence per line.

    Returns:
        A dict with a ``'text'`` list and, when ``with_tags`` is True, a
        parallel ``'tags'`` list.

    Raises:
        ValueError: if a non-empty CSV row does not have exactly two columns.
    """
    data = {'text': []}
    if with_tags:
        data['tags'] = []
        # newline='' is what the csv module expects: it performs its own newline
        # handling, so line breaks inside quoted fields are parsed correctly.
        with open(filepath, newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for a blank line; skip it rather than
                    # failing on the tuple unpack below.
                    continue
                text, tags = row
                data['text'].append(text)
                data['tags'].append(tags)
    else:
        with open(filepath) as f:
            for line in f:
                data['text'].append(line.strip())
    return data

In [167]:
tokenizer = Tokenizer()
# Directory holding the dataset files (Colab workspace root)
data_dir = "/content/"

# train/val are read with tags; the test split is read as text only
train_raw = load_raw_data(filepath=os.path.join(data_dir, "train.csv"))
val_raw = load_raw_data(filepath=os.path.join(data_dir, "val.csv"))
test_raw = load_raw_data(filepath=os.path.join(data_dir, "test_tokens.txt"), with_tags=False)

# the vocabulary is built from the training text only
tokenizer.fit(train_texts=train_raw['text'])

In [153]:
# Variant of the data-loading cell that reads from the current working
# directory -- adjust the paths below to match your workspace.
tokenizer = Tokenizer()

# train/val are read with tags; the test split is read as text only
train_raw = load_raw_data("train.csv")
val_raw = load_raw_data("val.csv")
test_raw = load_raw_data("test_tokens.txt", with_tags=False)

# the vocabulary is built from the training text only
tokenizer.fit(train_raw['text'])


In [168]:
class NERDataset:
    """Map-style dataset of fixed-length token-id (and optional tag-id) sequences.

    Tag id 0 is reserved for padding; real tags start at 1.
    """

    tag2idx = {'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-ORG': 4, 'I-ORG': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8, 'I-MISC': 9}
    idx2tag = ['<pad>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG','B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

    def __init__(self, raw_data: Dict[str, List[str]], tokenizer: Tokenizer, max_length: int = 128):
        """Encode every sentence (and tag line, when present) to ``max_length`` ids."""
        self.tokenizer = tokenizer
        self.token_ids = [
            tokenizer.encode(sentence, max_length=max_length)
            for sentence in raw_data['text']
        ]
        # Tags are only present for the labelled (train / validation) splits.
        self.with_tags = 'tags' in raw_data
        self.tag_ids = []
        if self.with_tags:
            self.tag_ids = [
                self.encode_tags(tag_line, max_length=max_length)
                for tag_line in raw_data['tags']
            ]

    def encode_tags(self, tags: str, max_length: Optional[int] = None):
        """Map a whitespace-separated tag string to ids, truncated/padded to ``max_length``."""
        encoded = list(map(self.tag2idx.__getitem__, tags.split()))
        if max_length is None:
            return encoded
        shortfall = max_length - len(encoded)
        if shortfall < 0:
            # longer than max_length: keep the leading part only
            return encoded[:max_length]
        # shorter (or equal): right-pad with the padding tag id 0
        return encoded + [0] * shortfall

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.token_ids)

    def __getitem__(self, idx):
        """Return ``(token_ids, pad_mask)`` plus the tag ids when the split is labelled."""
        token_ids = torch.LongTensor(self.token_ids[idx])
        # True at padding positions
        pad_mask = token_ids == self.tokenizer.pad_id
        sample = (token_ids, pad_mask)
        if self.with_tags:
            # training / validation: also return the gold tags
            sample = sample + (torch.LongTensor(self.tag_ids[idx]),)
        return sample
        

In [169]:
# Encode each split with the shared tokenizer (default max_length)
tr_data = NERDataset(raw_data=train_raw, tokenizer=tokenizer)
va_data = NERDataset(raw_data=val_raw, tokenizer=tokenizer)
# the test split was loaded without tags
te_data = NERDataset(raw_data=test_raw, tokenizer=tokenizer)

In [175]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first inputs, then apply dropout.

    Args:
        d_model: size of the last (feature) dimension; odd values are supported.
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions in the precomputed table.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles to the number of odd-indexed slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: follows the module across devices but is not a
        # parameter, so it is never trained.
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the encoding for the first ``x.size(1)`` positions and apply dropout.

        The table is sliced along dimension 1, so ``x`` is indexed as
        ``(batch, seq_len, d_model)``.
        """
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)

In [176]:
class TransformerModel(nn.Module):
    """Transformer-encoder token classifier: embedding -> positions -> encoder -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding / model dimension.
        num_heads: attention heads per encoder layer.
        hidden_size: feed-forward width inside each encoder layer.
        num_layers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
        num_classes: number of output classes per token (default 10, the
            previously hard-coded value).
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers, dropout = 0.1, num_classes = 10):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(embed_size, dropout)
        encoder_layers = TransformerEncoderLayer(embed_size, num_heads, hidden_size, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers)
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.d_model = embed_size
        self.decoder = nn.Linear(embed_size, num_classes)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output layer; zero the output bias."""
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor) -> torch.Tensor:
        """Compute per-token class logits.

        Args:
            src: token ids, indexed batch-first (dimension 1 is the sequence).
            src_mask: passed to the encoder as ``src_key_padding_mask``.

        Returns:
            Logits with the class scores in the last dimension, batch-first.
        """
        # scale embeddings by sqrt(d_model) before adding positions
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        # The encoder layers use the default sequence-first layout, so transpose
        # on the way in and back on the way out.
        output = self.transformer_encoder(src.transpose(1, 0), src_key_padding_mask=src_mask)
        output = self.decoder(output.transpose(1, 0))
        return output


In [177]:
def validate(
    model: nn.Module, 
    dataloader: DataLoader, 
    device: torch.device,
):
    """Evaluate the model on a labelled dataloader and print the mean loss and token accuracy.

    Args:
        model: called as ``model(input_ids, input_mask)``, returning per-token logits.
        dataloader: yields ``(input_ids, input_mask, tags)`` batches.
        device: device the batches and metrics are moved to.
    """
    # compute_on_step is intentionally not passed: it is a deprecated torchmetrics
    # argument and has no effect here, because only update()/compute() are called.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=10).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # ignore padding index 0 when calculating loss
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), tags.reshape(-1), ignore_index=0)

            # weight the batch loss by its number of non-padding tokens
            loss_metric.update(loss, input_mask.numel() - input_mask.sum())
            is_active = torch.logical_not(input_mask)  # non-padding elements
            # only consider non-padded tokens when calculating accuracy
            acc_metric.update(logits[is_active], tags[is_active])

    print(f"| Validate | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")

In [178]:
def train(
    model: nn.Module, 
    dataloader: DataLoader, 
    optimizer: optim.Optimizer,
    device: torch.device,
    epoch: int,
):
    """Run one training epoch and print the mean loss and token accuracy.

    Args:
        model: called as ``model(input_ids, input_mask)``, returning per-token logits.
        dataloader: yields ``(input_ids, input_mask, tags)`` batches.
        optimizer: optimizer stepped once per batch.
        device: device the batches and metrics are moved to.
        epoch: epoch number, used only in the printed summary.
    """
    # compute_on_step is intentionally not passed: it is a deprecated torchmetrics
    # argument and has no effect here, because only update()/compute() are called.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=10).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.train()

    # loop through all batches in the training
    for batch in tqdm(dataloader):
        input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        # output shape: (batch_size, max_length, num_classes)
        logits = model(input_ids, input_mask)
        # ignore padding index 0 when calculating loss
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), tags.reshape(-1), ignore_index=0)

        loss.backward()
        optimizer.step()

        # weight the batch loss by its number of non-padding tokens
        loss_metric.update(loss, input_mask.numel() - input_mask.sum())
        is_active = torch.logical_not(input_mask)  # non-padding elements
        # only consider non-padded tokens when calculating accuracy
        acc_metric.update(logits[is_active], tags[is_active])

    print(f"| Epoch {epoch} | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")
    

In [179]:
# Seed torch's RNG before anything random happens (model initialisation,
# shuffling), then pick the GPU when one is available.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data loaders: only the training loader shuffles
train_dataloader = DataLoader(tr_data, batch_size=32, shuffle=True)
val_dataloader = DataLoader(va_data, batch_size=32)
test_dataloader = DataLoader(te_data, batch_size=32)

# build the model (embedding table sized to the tokenizer vocabulary) and move it to the device
model = TransformerModel(vocab_size = len(tokenizer), 
    embed_size = 256, 
    num_heads = 4, 
    hidden_size = 256,
    num_layers = 2,).to(device)

# Adam with its default hyper-parameters
optimizer = optim.Adam(model.parameters())

# train for 5 epochs; validation runs once, after the final epoch
# (the validate call sits outside the loop)
for epoch in range(5):
    train(model, train_dataloader, optimizer, device, epoch)
validate(model, val_dataloader, device)

100%|██████████| 439/439 [05:09<00:00,  1.42it/s]


| Epoch 0 | loss 0.3279 | acc 0.9115 |


100%|██████████| 439/439 [05:06<00:00,  1.43it/s]


| Epoch 1 | loss 0.1050 | acc 0.9683 |


100%|██████████| 439/439 [05:06<00:00,  1.43it/s]


| Epoch 2 | loss 0.0591 | acc 0.9815 |


100%|██████████| 439/439 [05:07<00:00,  1.43it/s]


| Epoch 3 | loss 0.0431 | acc 0.9862 |


100%|██████████| 439/439 [05:07<00:00,  1.43it/s]


| Epoch 4 | loss 0.0388 | acc 0.9875 |


100%|██████████| 102/102 [00:21<00:00,  4.83it/s]

| Validate | loss 0.2947 | acc 0.9426 |





In [183]:
# TODO: implement the predict function
from cmath import inf

def predict(model: nn.Module, dataloader: DataLoader, device: torch.device) -> List[List[str]]:
    """Predict IOB tag strings for every sentence in ``dataloader``.

    Args:
        model: called as ``model(input_ids, input_mask)``, returning per-token logits.
        dataloader: yields batches whose first two items are the token ids and
            the padding mask (True at padding positions).
        device: device the batches are moved to.

    Returns:
        One list of tag strings per sentence, covering the positions before the
        first padding token (so inputs truncated by the dataset yield
        correspondingly fewer tags).
    """
    # Use the tag table of the dataset being predicted on rather than a global
    # dataset instance; fall back to the class-level table.
    idx2tag = getattr(dataloader.dataset, "idx2tag", None) or NERDataset.idx2tag

    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask = batch[0].to(device), batch[1].to(device)
            logits = model(input_ids, input_mask)
            # Class 0 is the '<pad>' tag, which is never a valid label for a real
            # token: take the argmax over classes 1.. and shift the index back.
            tag_ids = (logits[..., 1:].argmax(dim=-1) + 1).tolist()
            pad_flags = input_mask.tolist()
            for sentence_tags, sentence_pads in zip(tag_ids, pad_flags):
                # stop at the first padding position
                length = sentence_pads.index(True) if True in sentence_pads else len(sentence_pads)
                preds.append([idx2tag[tag_id] for tag_id in sentence_tags[:length]])
    return preds

In [184]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
from conlleval import evaluate

--2023-03-16 04:28:47--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py.11’


2023-03-16 04:28:47 (86.9 MB/s) - ‘conlleval.py.11’ saved [7502/7502]



In [185]:
# Entity-level F1 on the validation split, measured with the conlleval script.
# Each sentence is followed by an extra 'O' (presumably a sentence separator)
# in both the predicted and the gold sequence.
pred_tags = [
    tag
    for sentence_tags in predict(model, val_dataloader, device)
    for tag in sentence_tags + ['O']
]

true_tags = [
    tag
    for tag_line in val_raw['tags']
    for tag in tag_line.strip().split() + ['O']
]

evaluate(true_tags, pred_tags)

100%|██████████| 102/102 [00:21<00:00,  4.81it/s]


processed 54612 tokens with 5942 phrases; found: 5613 phrases; correct: 4185.
accuracy:  69.27%; (non-O)
accuracy:  94.61%; precision:  74.56%; recall:  70.43%; FB1:  72.44
              LOC: precision:  87.82%; recall:  83.18%; FB1:  85.43  1740
             MISC: precision:  78.91%; recall:  73.86%; FB1:  76.30  863
              ORG: precision:  66.11%; recall:  65.18%; FB1:  65.64  1322
              PER: precision:  65.28%; recall:  59.83%; FB1:  62.44  1688


(74.55905932656334, 70.43083136990911, 72.43617481609694)

Example output from the above codeblock. We will take the overall test F1 score (69.24 in this example) and grade accordingly.
```
processed 54612 tokens with 5942 phrases; found: 5554 phrases; correct: 3980.
accuracy:  65.78%; (non-O)
accuracy:  93.88%; precision:  71.66%; recall:  66.98%; FB1:  69.24
              LOC: precision:  84.58%; recall:  77.03%; FB1:  80.63  1673
             MISC: precision:  77.31%; recall:  71.69%; FB1:  74.40  855
              ORG: precision:  58.71%; recall:  63.83%; FB1:  61.16  1458
              PER: precision:  66.84%; recall:  56.89%; FB1:  61.47  1568
(71.66006481814908, 66.98081454055873, 69.24147529575504)
```
If the codeblock above errors out, check your implementation of the `predict` function. It should return a nested list of lists, each containing predicted tags in their IOB string forms.

In [187]:
# YOU SHOULD NOT CHANGE THIS CODEBLOCK
# make prediction on the test set and save to submission.txt
# Output format: one line per test sentence, predicted tags separated by single spaces.
preds = predict(model, test_dataloader, device)
with open("submission.txt", "w") as f:
    for tags in preds:
        f.write(" ".join(tags) + "\n")

100%|██████████| 108/108 [00:24<00:00,  4.34it/s]
