In [None]:
from typing import Dict, List, Optional
from collections import Counter
import os
import csv
!pip install torchmetrics
!pip install pytorch-metric-learning
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
!pip install pytorch-lightning
import torch.optim as optim
import torchmetrics
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

## Todo Part 1
Complete the implementation of the encode method of the Tokenizer class:

`encode`: encode a given space-separated text into a list of token ids according to the `self.token2idx` property. For tokens not present in the mapping, use the id of the `<unk>` token. If `max_length` is set, pad the output with the `<pad>` id up to `max_length` when it is shorter than `max_length`, and truncate it to `max_length` when it is longer.

Examples
```python
text = "hello transformers !"
tokenizer.encode(text)                  # example output: [3, 4, 5]
tokenizer.encode(text, max_length=5)    # example output: [3, 4, 5, 0, 0]
tokenizer.encode(text, max_length=2)    # example output: [3, 4]
```

In [None]:
class Tokenizer:
    """Whitespace tokenizer backed by a lower-cased, frequency-capped vocabulary.

    Ids 0 and 1 are reserved for the ``<pad>`` and ``<unk>`` special tokens.
    ``fit`` builds the vocabulary from training texts; ``encode`` maps a text
    to token ids, optionally padded / truncated to a fixed length.
    """

    def __init__(self):
        # two special tokens for padding and unknown
        self.token2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2token = ["<pad>", "<unk>"]
        self.is_fit = False

    @property
    def pad_id(self):
        """Id of the ``<pad>`` token."""
        return self.token2idx["<pad>"]

    @property
    def unk_id(self):
        """Id of the ``<unk>`` token."""
        return self.token2idx["<unk>"]

    def __len__(self):
        """Vocabulary size, special tokens included."""
        return len(self.idx2token)

    def fit(self, train_texts: List[str], vocab_size: int = 20000):
        """Build the vocabulary from the most frequent lower-cased tokens.

        Args:
            train_texts: space-separated training sentences.
            vocab_size: maximum vocabulary size, including the two special tokens.
        """
        counter = Counter()
        for text in train_texts:
            counter.update(text.lower().split())

        specials = ["<pad>", "<unk>"]
        # Rebuild from the special tokens so that calling fit() a second time
        # does not append duplicate entries to the vocabulary.
        self.idx2token = list(specials)
        budget = max(vocab_size - len(specials), 0)
        # A literal "<pad>"/"<unk>" in the corpus must not be re-added, otherwise
        # it would overwrite the reserved ids 0 and 1 in token2idx.
        self.idx2token.extend(
            token for token, _ in counter.most_common(budget) if token not in specials
        )
        self.token2idx = {token: i for i, token in enumerate(self.idx2token)}

        self.is_fit = True

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        """Convert a space-separated text into a list of token ids.

        The text is lower-cased before lookup because ``fit`` builds a
        lower-cased vocabulary. Tokens missing from the vocabulary map to the
        ``<unk>`` id.

        Args:
            text: space-separated input text.
            max_length: if given, the result is truncated to ``max_length`` or
                right-padded with the ``<pad>`` id up to ``max_length``.

        Returns:
            The list of token ids.

        Raises:
            Exception: if the tokenizer has not been fit yet.
            ValueError: if ``max_length`` is negative.
        """
        if not self.is_fit:
            raise Exception("Please fit the tokenizer on the training tokens")
        if max_length is not None and max_length < 0:
            raise ValueError("max_length must be non-negative")

        pad_id, unk_id = self.pad_id, self.unk_id
        token_ids = []
        for token in text.lower().split():
            token_id = self.token2idx.get(token, unk_id)
            # A real token must never receive the padding id: downstream code
            # derives the padding mask from `token_id == pad_id`.
            token_ids.append(unk_id if token_id == pad_id else token_id)

        if max_length is None:
            return token_ids
        token_ids = token_ids[:max_length]
        return token_ids + [pad_id] * (max_length - len(token_ids))


In [None]:
def load_raw_data(filepath: str, with_tags: bool = True):
    """Load raw sentences (and optionally their tag strings) from disk.

    Args:
        filepath: path of the file to read.
        with_tags: when True the file is parsed as a two-column CSV
            (text, tags); otherwise it is read as plain text with one
            sentence per line.

    Returns:
        A dict with a ``'text'`` list and, when ``with_tags`` is True, a
        parallel ``'tags'`` list.
    """
    data = {'text': []}
    if with_tags:
        data['tags'] = []
        # newline='' is what the csv module expects so that it can handle line
        # endings itself; the encoding is explicit so that the result does not
        # depend on the platform default.
        with open(filepath, newline='', encoding='utf-8') as f:
            for row in csv.reader(f):
                if not row:
                    # blank line (e.g. trailing newline at end of file)
                    continue
                text, tags = row
                data['text'].append(text)
                data['tags'].append(tags)
    else:
        with open(filepath, encoding='utf-8') as f:
            for line in f:
                # blank lines are kept as empty sentences so that the number
                # of entries matches the number of lines in the file
                data['text'].append(line.strip())
    return data

In [None]:
# Directory that holds train.csv, val.csv and test_tokens.txt.
# It must be defined before use so the notebook survives "Restart & Run All";
# modify as per workspace (e.g. a Kaggle input folder).
data_dir = "."

tokenizer = Tokenizer()
train_raw = load_raw_data(os.path.join(data_dir, "train.csv"))
val_raw = load_raw_data(os.path.join(data_dir, "val.csv"))
test_raw = load_raw_data(os.path.join(data_dir, "test_tokens.txt"), with_tags=False)
# fit the tokenizer on the training tokens
tokenizer.fit(train_raw['text'])

In [None]:
#upload the dataset
# for Google Colab, use this
#from google.colab import files
#uploaded = files.upload()

In [None]:
#modify as per workspace
# Read the three splits from the current working directory.
train_raw = load_raw_data("train.csv")
val_raw = load_raw_data("val.csv")
test_raw = load_raw_data("test_tokens.txt", with_tags=False)

# Build a fresh tokenizer whose vocabulary comes from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit(train_raw['text'])


In [None]:
class NERDataset:
    """Map-style dataset of encoded sentences and, when available, their tags.

    Index 0 of ``idx2tag`` is the padding class; ``tag2idx`` covers only the
    real tags (ids 1-9).
    """

    idx2tag = [
        '<pad>', 'O',
        'B-PER', 'I-PER',
        'B-ORG', 'I-ORG',
        'B-LOC', 'I-LOC',
        'B-MISC', 'I-MISC',
    ]
    # inverse of idx2tag without the padding entry
    tag2idx = {tag: idx for idx, tag in enumerate(idx2tag[1:], start=1)}

    def __init__(self, raw_data: Dict[str, List[str]], tokenizer: Tokenizer, max_length: int = 128):
        """Encode every sentence (and tag string, if present) to fixed length.

        Args:
            raw_data: dict with a 'text' list and optionally a 'tags' list.
            tokenizer: fitted tokenizer used to encode the sentences.
            max_length: length every encoded sequence is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.token_ids = [
            tokenizer.encode(sentence, max_length=max_length)
            for sentence in raw_data['text']
        ]
        self.with_tags = 'tags' in raw_data
        self.tag_ids = (
            [self.encode_tags(tag_str, max_length=max_length) for tag_str in raw_data['tags']]
            if self.with_tags
            else []
        )

    def encode_tags(self, tags: str, max_length: Optional[int] = None):
        """Turn a space-separated tag string into tag ids.

        With ``max_length`` set, the ids are cut to ``max_length`` or
        right-padded with 0 (the padding class) up to ``max_length``.
        """
        ids = [self.tag2idx[name] for name in tags.split()]
        if max_length is None:
            return ids
        missing = max_length - len(ids)
        # a negative `missing` yields an empty padding list, i.e. pure truncation
        return ids[:max_length] + [0] * missing

    def __len__(self):
        """Number of sentences."""
        return len(self.token_ids)

    def __getitem__(self, idx):
        """Return (token_ids, padding_mask[, tag_ids]) for sentence ``idx``."""
        token_ids = torch.LongTensor(self.token_ids[idx])
        # True where the position holds a padding token
        mask = token_ids.eq(self.tokenizer.pad_id)
        if not self.with_tags:
            # for testing
            return token_ids, mask
        # for training and validation
        return token_ids, mask, torch.LongTensor(self.tag_ids[idx])
        

In [None]:
tr_data = NERDataset(train_raw, tokenizer)
va_data = NERDataset(val_raw, tokenizer)
te_data = NERDataset(test_raw, tokenizer)

## Todo Part 2
Implement and experiment with transformer models. The implementation should include **at least** the following:
- `nn.Embedding` layer to embed input token ids to the embedding space
- `nn.TransformerEncoder` layer to perform transformer operations
- `nn.Linear` layer as the output layer to map the output to the number of classes

As we will be using the cross-entropy loss, an `nn.Softmax` or `nn.LogSoftmax` layer is not needed.

You can refer to the following links for transformer Docs and examples:

https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

You can modify the `__init__` method including the signature needed. For the `forward` method, the method signature is given as follows:

- `src`: a `torch.LongTensor` of shape (batch_size, max_length) holding the input token ids.

- `src_mask`: a `torch.BoolTensor` of shape (batch_size, max_length) indicating whether an input position is padded. This is needed to prevent the transformer model attending to padded tokens.

The output from the `forward` method should be of shape (batch_size, max_length, num_classes). Note that the number of classes should be 10 instead of 9 because of an additional padding class.


In [None]:
class TransformerModel(nn.Module):
    """Transformer-encoder token classifier.

    Pipeline: token embedding -> sinusoidal positional encoding ->
    ``nn.TransformerEncoder`` -> linear projection to per-token class logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_size: embedding / model dimension (must be divisible by ``num_heads``).
        num_heads: number of attention heads.
        hidden_size: width of the feed-forward sub-layer in each encoder layer.
        num_layers: number of stacked encoder layers.
        num_classes: number of output classes (10 = 9 tags + padding class).
        dropout: dropout probability.
        max_length: longest sequence the positional encoding supports.
        pad_id: token id used for padding; its embedding is kept at zero.
    """

    def __init__(
        self,
        vocab_size: int = 20000,
        embed_size: int = 256,
        num_heads: int = 4,
        hidden_size: int = 256,
        num_layers: int = 2,
        num_classes: int = 10,
        dropout: float = 0.1,
        max_length: int = 512,
        pad_id: int = 0,
    ):
        super().__init__()
        self.embed_size = embed_size
        self.max_length = max_length

        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_id)

        # Fixed sinusoidal positional encoding, stored as a (non-trainable) buffer
        # so that it follows the module across devices.
        position = torch.arange(max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_size, 2, dtype=torch.float) * (-math.log(10000.0) / embed_size)
        )
        pos_encoding = torch.zeros(max_length, embed_size)
        pos_encoding[:, 0::2] = torch.sin(position * div_term)
        # for an odd embed_size there is one cosine column fewer than sine columns
        pos_encoding[:, 1::2] = torch.cos(position * div_term[: embed_size // 2])
        self.register_buffer("pos_encoding", pos_encoding.unsqueeze(0), persistent=False)

        self.dropout = nn.Dropout(dropout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_size,
            dropout=dropout,
            batch_first=True,
        )
        # Nested tensors are disabled so that the output always keeps the full
        # (batch_size, max_length) layout, also in eval mode with a padding mask.
        self.encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )
        self.classifier = nn.Linear(embed_size, num_classes)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor) -> torch.Tensor:
        """Compute per-token class logits.

        Args:
            src: LongTensor of token ids, shape (batch_size, seq_len).
            src_mask: BoolTensor of shape (batch_size, seq_len), True at padded
                positions.

        Returns:
            Float tensor of logits, shape (batch_size, seq_len, num_classes).

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_length`` given at
                construction time.
        """
        seq_len = src.size(1)
        if seq_len > self.max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds the supported maximum {self.max_length}"
            )

        # A row that consists only of padding would mask every attention key and
        # produce NaNs; for such rows the mask is dropped entirely.
        fully_padded = src_mask.all(dim=1, keepdim=True)
        key_padding_mask = src_mask & ~fully_padded

        hidden = self.embedding(src) * math.sqrt(self.embed_size)
        hidden = self.dropout(hidden + self.pos_encoding[:, :seq_len])
        hidden = self.encoder(hidden, src_key_padding_mask=key_padding_mask)
        return self.classifier(hidden)

In [None]:
#modify as required
def validate(
    model: nn.Module, 
    dataloader: DataLoader, 
    device: torch.device,
    num_classes: int = 10,
):
    """Evaluate ``model`` on ``dataloader`` and print token-level loss/accuracy.

    Args:
        model: model mapping (input_ids, padding_mask) to logits of shape
            (batch_size, max_length, num_classes).
        dataloader: yields (input_ids, padding_mask, tags) batches.
        device: device the batches and metrics are moved to.
        num_classes: number of output classes, padding class included.

    Returns:
        Tuple ``(loss, accuracy)`` as Python floats.
    """
    # `compute_on_step` is not passed: the torchmetrics releases that accept
    # `task=` reject it as an unexpected keyword argument.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # ignore padding index 0 when calculating loss
            loss = F.cross_entropy(logits.reshape(-1, num_classes), tags.reshape(-1), ignore_index=0)

            is_active = torch.logical_not(input_mask)  # non-padding elements
            # weight the batch loss by its number of non-padded tokens
            loss_metric.update(loss, is_active.sum())
            # only consider non-padded tokens when calculating accuracy
            acc_metric.update(logits[is_active], tags[is_active])
    
    val_loss = loss_metric.compute().item()
    val_acc = acc_metric.compute().item()
    print(f"| Validate | loss {val_loss:.4f} | acc {val_acc:.4f} |")
    return val_loss, val_acc

In [None]:
#modify as required
def train(
    model: nn.Module, 
    dataloader: DataLoader, 
    optimizer: optim.Optimizer,
    device: torch.device,
    epoch: int,
    num_classes: int = 10,
):
    """Train ``model`` for one epoch and print token-level loss/accuracy.

    Args:
        model: model mapping (input_ids, padding_mask) to logits of shape
            (batch_size, max_length, num_classes).
        dataloader: yields (input_ids, padding_mask, tags) batches.
        optimizer: optimizer updating the parameters of ``model``.
        device: device the batches and metrics are moved to.
        epoch: epoch number, used only in the printed summary.
        num_classes: number of output classes, padding class included.

    Returns:
        Tuple ``(loss, accuracy)`` as Python floats.
    """
    # `compute_on_step` is not passed: the torchmetrics releases that accept
    # `task=` reject it as an unexpected keyword argument.
    acc_metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes).to(device)
    loss_metric = torchmetrics.MeanMetric().to(device)
    model.train()
    
    # loop through all batches in the training
    for batch in tqdm(dataloader):
        input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        # output shape: (batch_size, max_length, num_classes)
        logits = model(input_ids, input_mask)
        # ignore padding index 0 when calculating loss
        loss = F.cross_entropy(logits.reshape(-1, num_classes), tags.reshape(-1), ignore_index=0)
        
        loss.backward()
        optimizer.step()
        
        is_active = torch.logical_not(input_mask)  # non-padding elements
        # metrics only need values, so the tensors are detached from the graph;
        # the batch loss is weighted by its number of non-padded tokens
        loss_metric.update(loss.detach(), is_active.sum())
        # only consider non-padded tokens when calculating accuracy
        acc_metric.update(logits.detach()[is_active], tags[is_active])
    
    train_loss = loss_metric.compute().item()
    train_acc = acc_metric.compute().item()
    print(f"| Epoch {epoch} | loss {train_loss:.4f} | acc {train_acc:.4f} |")
    return train_loss, train_acc
    

In [None]:
#modify as required
# Run configuration, gathered in one place.
SEED = 42
BATCH_SIZE = 32
NUM_EPOCHS = 5
MODEL_CONFIG = dict(
    vocab_size=len(tokenizer),
    embed_size=256,
    num_heads=4,
    hidden_size=256,
    num_layers=2,
)

# Seed first so that model initialisation and shuffling are reproducible.
torch.manual_seed(SEED)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Data loaders: only the training split is shuffled.
train_dataloader = DataLoader(tr_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(va_data, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(te_data, batch_size=BATCH_SIZE)

# Build the model and move it to the selected device.
model = TransformerModel(**MODEL_CONFIG).to(device)

optimizer = optim.Adam(model.parameters())

epoch = 0
while epoch < NUM_EPOCHS:
    train(model, train_dataloader, optimizer, device, epoch)
    epoch += 1
# Validation runs once, after the last training epoch.
validate(model, val_dataloader, device)

## Todo Part 3
Make predictions on the validation data and evaluate entity-level F1 scores using conlleval script.

`predict`: taking inputs of a trained model, a dataloader, and a torch device, predict the tags for all tokens in the data set. The output should be a nested list of lists, each containing tag predictions for a single sentence.

    Input texts in the dataloader (2 sentences):
    EU rejects German call
    Only France and Britain backed Fischler 's proposal .
    
    Example output:
    [['B-ORG', 'O', 'B-MISC', 'O'], ['O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O']]
        

In [None]:
def predict(model: nn.Module, dataloader: DataLoader, device: torch.device) -> List[List[str]]:
    """Predict a tag string for every non-padded token in ``dataloader``.

    Args:
        model: trained model mapping (input_ids, padding_mask) to logits of
            shape (batch_size, max_length, num_classes).
        dataloader: yields batches whose first two items are the token ids and
            the padding mask (True at padded positions); extra items such as
            gold tags are ignored.
        device: device the batches are moved to.

    Returns:
        One list of tag strings per sentence, in dataloader order. Each list
        has as many entries as the sentence has non-padded tokens, so a
        sentence that was truncated during encoding yields a truncated list.
    """
    model.eval()
    idx2tag = NERDataset.idx2tag
    preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask = batch[0].to(device), batch[1].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # Class 0 is the padding class and is never a valid tag for a real
            # token, so it is excluded from the argmax.
            tag_ids = logits[..., 1:].argmax(dim=-1) + 1
            # padding is appended at the end, so the real tokens form a prefix
            lengths = torch.logical_not(input_mask).sum(dim=1)
            for sentence_ids, length in zip(tag_ids.tolist(), lengths.tolist()):
                preds.append([idx2tag[tag_id] for tag_id in sentence_ids[:length]])
                    
    return preds

In [None]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
from conlleval import evaluate

In [None]:
# use the conlleval script to measure the entity-level f1
# Flatten the per-sentence predictions into one sequence; an 'O' is appended
# after every sentence so that entities cannot span sentence boundaries.
pred_tags = []
for tags in predict(model, val_dataloader, device):
    pred_tags += [*tags, 'O']

# Same flattening for the gold tags of the validation set.
true_tags = []
for tags in val_raw['tags']:
    true_tags += tags.strip().split()
    true_tags += ['O']

evaluate(true_tags, pred_tags)

Example output from the above codeblock. We will take the overall test F1 score (69.24 in this example) and grade accordingly.
```
processed 54612 tokens with 5942 phrases; found: 5554 phrases; correct: 3980.
accuracy:  65.78%; (non-O)
accuracy:  93.88%; precision:  71.66%; recall:  66.98%; FB1:  69.24
              LOC: precision:  84.58%; recall:  77.03%; FB1:  80.63  1673
             MISC: precision:  77.31%; recall:  71.69%; FB1:  74.40  855
              ORG: precision:  58.71%; recall:  63.83%; FB1:  61.16  1458
              PER: precision:  66.84%; recall:  56.89%; FB1:  61.47  1568
(71.66006481814908, 66.98081454055873, 69.24147529575504)
```
If the codeblock above errors out, check your implementation of the `predict` function. It should return a nested list of lists, each containing predicted tags in their IOB string forms.

## Todo Part 4
Once you finish all previous todos and are satisfied with the model performance on the validation set, make predictions on the test set and keep a copy of the `submission.txt` file by downloading it to your local machine. You can find `submission.txt` under Output > `/kaggle/working`.

In [None]:
# YOU SHOULD NOT CHANGE THIS CODEBLOCK
# make prediction on the test set and save to submission.txt
# `preds` holds one list of tag strings per test sentence.
preds = predict(model, test_dataloader, device)
# The file is written relative to the current working directory.
with open("submission.txt", "w") as f:
    for tags in preds:
        # one line per sentence, tags separated by single spaces
        f.write(" ".join(tags) + "\n")

In [None]:
# IPython automagic (not plain Python): shows the current working directory,
# i.e. the folder the relative path "submission.txt" resolves against.
pwd

In [None]:
# IPython automagic (not plain Python): lists the files in the current
# working directory.
ls