In [39]:
from typing import Dict, List, Optional
from collections import Counter
import os
import csv
!pip install torchmetrics
!pip install pytorch-metric-learning
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
!pip install pytorch-lightning
import torch.optim as optim
import torchmetrics
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Todo Part 1
Complete the implementation of the encode method of the Tokenizer class:

`encode`: encode a given space-separated text into list of token ids according to the `self.token2idx` property. For tokens not present in the mapping, use the id of the `<unk>` token. If `max_length` is set, pad the input to `max_length` if it is less than `max_length` and truncate to `max_length` if it exceeds the length.

Examples
```python
text = "hello transformers !"
tokenizer.encode(text)                  # example output: [3, 4, 5]
tokenizer.encode(text, max_length=5)    # example output: [3, 4, 5, 0, 0]
tokenizer.encode(text, max_length=2)    # example output: [3, 4]
```

In [61]:
class Tokenizer:
    def __init__(self):
        # two special tokens for padding and unknown
        self.token2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2token = ["<pad>", "<unk>"]
        self.is_fit = False
    
    @property
    def pad_id(self):
        return self.token2idx["<pad>"]
    
    def __len__(self):
        return len(self.idx2token)
    
    def fit(self, train_texts: List[str]):
        counter = Counter()
        for text in train_texts:
            counter.update(text.lower().split())
        
        # manually set a vocabulary size for the data set
        vocab_size = 20000
        self.idx2token.extend([token for token, count in counter.most_common(vocab_size - 2)])
        for (i, token) in enumerate(self.idx2token):
            self.token2idx[token] = i
            
        self.is_fit = True
                

    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        if not self.is_fit:
            raise Exception("Please fit the tokenizer on the training tokens")
        # 将文本转换为小写，然后分割成单词
        words = text.lower().split() 
        # 将单词转换为对应的ID
        ids = []
        for word in words:
          if word in self.token2idx:
              ids.append(self.token2idx[word])
          else:
              ids.append(self.token2idx["<unk>"])
        # 将序列截断或填充到指定的长度
        if max_length is not None:
          if len(ids) > max_length:
            ids = ids[:max_length]
          else:  
            ids = ids + [self.pad_id] * (max_length - len(ids))
        return ids


In [62]:
def load_raw_data(filepath: str, with_tags: bool = True):
    data = {'text': []}
    if with_tags:
        data['tags'] = []
        with open(filepath) as f:
            reader = csv.reader(f)
            for text, tags in reader:
                data['text'].append(text)
                data['tags'].append(tags)
    else:
        with open(filepath) as f:
            for line in f:
                data['text'].append(line.strip())
    return data

In [63]:
#手动上传data到colab

In [64]:
tokenizer = Tokenizer()
# 定义数据库路径
data_dir = "/content/"

train_raw = load_raw_data(os.path.join(data_dir, "train.csv"))
val_raw = load_raw_data(os.path.join(data_dir, "val.csv"))
test_raw = load_raw_data(os.path.join(data_dir, "test_tokens.txt"), with_tags=False)
# fit the tokenizer on the training tokens
tokenizer.fit(train_raw['text'])

In [65]:
class NERDataset: 
    tag2idx = {'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-ORG': 4, 'I-ORG': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8, 'I-MISC': 9}
    idx2tag = ['<pad>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG','B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
  
    def __init__(self, raw_data: Dict[str, List[str]], tokenizer: Tokenizer, max_length: int = 128):
        self.tokenizer = tokenizer
        self.token_ids = []
        self.tag_ids = []
        self.with_tags = False
        for text in raw_data['text']:
            self.token_ids.append(self.tokenizer.encode(text, max_length=max_length))
        if 'tags' in raw_data:
            self.with_tags = True
            for tags in raw_data['tags']:
                self.tag_ids.append(self.encode_tags(tags, max_length=max_length))
    
    def encode_tags(self, tags: str, max_length: Optional[int] = None):
        tag_ids = [self.tag2idx[tag] for tag in tags.split()]
        if max_length is None:
            return tag_ids
        # truncate the tags if longer than max_length
        if len(tag_ids) > max_length:
            return tag_ids[:max_length]
        # pad with 0s if shorter than max_length
        else:
            return tag_ids + [0] * (max_length - len(tag_ids))  # 0 as padding for tags
        
    def __len__(self):
        return len(self.token_ids)
    
    def __getitem__(self, idx):
        token_ids = torch.LongTensor(self.token_ids[idx])
        mask = token_ids == self.tokenizer.pad_id  # padding tokens
        if self.with_tags:
            # for training and validation
            return token_ids, mask, torch.LongTensor(self.tag_ids[idx])
        else:
            # for testing
            return token_ids, mask
        

In [66]:
tr_data = NERDataset(train_raw, tokenizer)
va_data = NERDataset(val_raw, tokenizer)
te_data = NERDataset(test_raw, tokenizer)

## Todo Part 2
Implement and experiment with transformer models. The implementation should include **at least** the following:
- `nn.Embedding` layer to embed input token ids to the embedding space
- `nn.TransformerEncoder` layer to perform transformer operations
- `nn.Linear` layer as the output layer to map the output to the number of classes

As we will be using the cross-entropy loss, an `nn.Softmax` or `nn.LogSoftmax` layer is not needed.

You can refer to the following links for transformer Docs and examples:

https://pytorch.org/docs/stable/_modules/torch/nn/modules/transformer.html

https://pytorch.org/tutorials/beginner/transformer_tutorial.html

You can modify the `__init__` method including the signature needed. For the `forward` method, the method signature is given as follows:

- `src`: a `torch.LongTensor` of shape (batch_size, max_length, vocab_size) representing the input text tokens.

- `src_mask`: a `torch.BoolTensor` of shape (batch_size, max_length) indicating whether an input position is padded. This is needed to prevent the transformer model attending to padded tokens.

The output from the `forward` method should be of shape (batch_size, max_length, num_classes). Note that the number of classes should be 10 instead of 9 because of an additional padding class.


In [67]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, hidden_size, num_layers):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.transformer_encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(embed_size, num_heads, hidden_size), num_layers)
        self.fc = nn.Linear(embed_size, 10) # 10 classes including padding

    def forward(self, src, src_mask):
        src_embed = self.embedding(src)
        src_embed = src_embed.permute(1,0,2) # (max_length, batch_size, embed_size)
        output = self.transformer_encoder(src_embed, src_key_padding_mask=src_mask)
        output = output.permute(1,0,2) # (batch_size, max_length, embed_size)
        output = self.fc(output)
        return output


In [68]:
 #modify as required
def validate(
    model: nn.Module, 
    dataloader: DataLoader, 
    device: torch.device,
):
    acc_metric = torchmetrics.Accuracy(task = 'multiclass', num_classes = 10, compute_on_step=False).to(device)
    loss_metric = torchmetrics.MeanMetric(compute_on_step=False).to(device)
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # ignore padding index 0 when calculating loss
            loss = F.cross_entropy(logits.reshape(-1, 10), tags.reshape(-1), ignore_index=0)
                
            loss_metric.update(loss, input_mask.numel() - input_mask.sum())
            is_active = torch.logical_not(input_mask)  # non-padding elements
            # only consider non-padded tokens when calculating accuracy
            acc_metric.update(logits[is_active], tags[is_active])
    
    acc = acc_metric.compute()
    print(f"| Validate | loss {loss_metric.compute():.4f} | acc {acc:.4f} |")
    return acc

In [69]:
#modify as required
def train(
    model: nn.Module, 
    dataloader: DataLoader, 
    optimizer: optim.Optimizer,
    device: torch.device,
    epoch: int,
):
    acc_metric = torchmetrics.Accuracy(task = 'multiclass', num_classes = 10, compute_on_step=False).to(device)
    loss_metric = torchmetrics.MeanMetric(compute_on_step=False).to(device)
    model.train()
    
    # loop through all batches in the training
    for batch in tqdm(dataloader):
        input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        # output shape: (batch_size, max_length, num_classes)
        logits = model(input_ids, input_mask)
        # ignore padding index 0 when calculating loss
        loss = F.cross_entropy(logits.reshape(-1, 10), tags.reshape(-1), ignore_index=0)
        
        loss.backward()
        optimizer.step()
        
        loss_metric.update(loss, input_mask.numel() - input_mask.sum())
        is_active = torch.logical_not(input_mask)  # non-padding elements
        # only consider non-padded tokens when calculating accuracy
        acc_metric.update(logits[is_active], tags[is_active])
    
    print(f"| Epoch {epoch} | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")
    

In [79]:
#modify as required
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data loaders
train_dataloader = DataLoader(tr_data, batch_size=16, shuffle=True)
val_dataloader = DataLoader(va_data, batch_size=16)
test_dataloader = DataLoader(te_data, batch_size=16)

# move the model to device
model = TransformerModel(vocab_size = len(tokenizer), 
    embed_size = 256, 
    num_heads = 4, 
    hidden_size = 256,
    num_layers = 2,).to(device)

optimizer = optim.Adam(model.parameters())

for epoch in range(20):
    train(model, train_dataloader, optimizer, device, epoch)
validate(model, val_dataloader, device)

100%|██████████| 878/878 [04:52<00:00,  3.01it/s]


| Epoch 0 | loss 0.4305 | acc 0.8856 |


100%|██████████| 878/878 [04:50<00:00,  3.03it/s]


| Epoch 1 | loss 0.2405 | acc 0.9297 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 2 | loss 0.1702 | acc 0.9484 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 3 | loss 0.1335 | acc 0.9587 |


100%|██████████| 878/878 [04:48<00:00,  3.05it/s]


| Epoch 4 | loss 0.1115 | acc 0.9647 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 5 | loss 0.0968 | acc 0.9689 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 6 | loss 0.0857 | acc 0.9727 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 7 | loss 0.0777 | acc 0.9750 |


100%|██████████| 878/878 [04:48<00:00,  3.05it/s]


| Epoch 8 | loss 0.0705 | acc 0.9769 |


100%|██████████| 878/878 [04:48<00:00,  3.04it/s]


| Epoch 9 | loss 0.0693 | acc 0.9778 |


100%|██████████| 878/878 [04:47<00:00,  3.05it/s]


| Epoch 10 | loss 0.0643 | acc 0.9793 |


100%|██████████| 878/878 [04:50<00:00,  3.02it/s]


| Epoch 11 | loss 0.0605 | acc 0.9804 |


100%|██████████| 878/878 [04:50<00:00,  3.02it/s]


| Epoch 12 | loss 0.0570 | acc 0.9814 |


100%|██████████| 878/878 [04:51<00:00,  3.02it/s]


| Epoch 13 | loss 0.0545 | acc 0.9823 |


100%|██████████| 878/878 [04:51<00:00,  3.01it/s]


| Epoch 14 | loss 0.0536 | acc 0.9828 |


100%|██████████| 878/878 [04:51<00:00,  3.01it/s]


| Epoch 15 | loss 0.0515 | acc 0.9829 |


100%|██████████| 878/878 [04:51<00:00,  3.01it/s]


| Epoch 16 | loss 0.0488 | acc 0.9842 |


100%|██████████| 878/878 [04:52<00:00,  3.00it/s]


| Epoch 17 | loss 0.0444 | acc 0.9856 |


100%|██████████| 878/878 [04:52<00:00,  3.00it/s]


| Epoch 18 | loss 0.0455 | acc 0.9851 |


100%|██████████| 878/878 [04:52<00:00,  3.00it/s]


| Epoch 19 | loss 0.0416 | acc 0.9864 |


100%|██████████| 204/204 [00:18<00:00, 11.31it/s]

| Validate | loss 0.3429 | acc 0.9266 |





tensor(0.9266)

In [None]:
print()

## Todo Part 3
Make predictions on the validation data and evaluate entity-level F1 scores using conlleval script.

`predict`: taking inputs of a trained model, a dataloader, and a torch device, predict the tags for all tokens in the data set. The output should be a nested list of lists, each containing tag predictions for a single sentence.

    Input texts in the dataloader (2 sentences):
    EU rejects German call
    Only France and Britain backed Fischler 's proposal .
    
    Example output:
    [['B-ORG', 'O', 'B-MISC', 'O'], ['O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O']]
        

In [80]:
# TODO: implement the predict function

def predict(model: nn.Module, dataloader: DataLoader, device: torch.device) -> List[List[str]]:
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask = batch[0].to(device), batch[1].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # apply softmax to obtain probabilities
            probs = F.softmax(logits, dim=-1)
            # find the predicted class labels (indices)
            pred_indices = torch.argmax(probs, dim=-1).cpu().numpy()

            # convert the predicted indices back to their corresponding tags and ignore padding tokens
            for i, seq in enumerate(pred_indices):
                preds.append([NERDataset.idx2tag[idx] for idx in seq[:sum(~input_mask[i].cpu().numpy())]])
                    
    return preds


In [81]:
tag2idx = {'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-ORG': 4, 'I-ORG': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8, 'I-MISC': 9}

In [82]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
from conlleval import evaluate

--2023-03-21 06:02:04--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py.4’


2023-03-21 06:02:04 (44.7 MB/s) - ‘conlleval.py.4’ saved [7502/7502]



In [83]:
# use the conlleval script to measure the entity-level f1
pred_tags = []
for tags in predict(model, val_dataloader, device):
    pred_tags.extend(tags)
    pred_tags.append('O')
    
true_tags = []
for tags in val_raw['tags']:
    true_tags.extend(tags.strip().split())
    true_tags.append('O')

evaluate(true_tags, pred_tags)

100%|██████████| 204/204 [00:19<00:00, 10.46it/s]

processed 54612 tokens with 5942 phrases; found: 5802 phrases; correct: 3843.
accuracy:  63.14%; (non-O)
accuracy:  93.10%; precision:  66.24%; recall:  64.68%; FB1:  65.45
              LOC: precision:  80.06%; recall:  78.44%; FB1:  79.24  1800
             MISC: precision:  73.16%; recall:  72.13%; FB1:  72.64  909
              ORG: precision:  54.58%; recall:  59.51%; FB1:  56.94  1462
              PER: precision:  57.57%; recall:  50.98%; FB1:  54.07  1631





(66.23578076525337, 64.67519353752945, 65.44618528610356)

Example output from the above codeblock. We will take the overall test F1 score (69.24 in this example) and grade accordingly.
```
processed 54612 tokens with 5942 phrases; found: 5554 phrases; correct: 3980.
accuracy:  65.78%; (non-O)
accuracy:  93.88%; precision:  71.66%; recall:  66.98%; FB1:  69.24
              LOC: precision:  84.58%; recall:  77.03%; FB1:  80.63  1673
             MISC: precision:  77.31%; recall:  71.69%; FB1:  74.40  855
              ORG: precision:  58.71%; recall:  63.83%; FB1:  61.16  1458
              PER: precision:  66.84%; recall:  56.89%; FB1:  61.47  1568
(71.66006481814908, 66.98081454055873, 69.24147529575504)
```
If the codeblock above errors out, check your implementation of the `predict` function. It should return a nested list of lists, each containing predicted tags in their IOB string forms.

## Todo Part 4
Once you finish all previous todos and are satisfied with the model performance on the validation set, make predictions on the test set and keep a copy of the `submission.txt` file by downloading it to your local machine. You can find `submission.txt` under Output > `/kaggle/working`.

In [84]:
# YOU SHOULD NOT CHANGE THIS CODEBLOCK
# make prediction on the test set and save to submission.txt
preds = predict(model, val_dataloader, device)
with open("submission.txt", "w") as f:
    for tags in preds:
        f.write(" ".join(tags) + "\n")

100%|██████████| 204/204 [00:19<00:00, 10.21it/s]


In [85]:
pwd

'/content'

In [86]:
ls

conlleval.py    conlleval.py.3  [0m[01;34msample_data[0m/     train.csv
conlleval.py.1  conlleval.py.4  submission.txt   val.csv
conlleval.py.2  [01;34m__pycache__[0m/    test_tokens.txt
