In [1]:
!pip3 install -U datasets transformers seqeval
!pip3 install -U pytorch-lightning torch
!pip3 install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 23.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 66.4 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 51.1 MB/s 
[?25hCollecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.3 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |███████████████

In [2]:
import os, csv
from itertools import compress
import warnings

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler, random_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
import transformers
from datasets import load_dataset, load_metric

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import sklearn.preprocessing
from argparse import ArgumentParser

In [3]:
# download the original CONLL2003 dataset in tgz compressed format
!wget 'https://data.deepai.org/conll2003.zip'

--2022-05-23 18:59:57--  https://data.deepai.org/conll2003.zip
Resolving data.deepai.org (data.deepai.org)... 138.201.36.183
Connecting to data.deepai.org (data.deepai.org)|138.201.36.183|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 982975 (960K) [application/x-zip-compressed]
Saving to: ‘conll2003.zip’


2022-05-23 18:59:58 (2.03 MB/s) - ‘conll2003.zip’ saved [982975/982975]



In [4]:
# Unzip the file
!unzip conll2003.zip

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


In [5]:
def get_conll_data(file_path):
    """Read a CoNLL-style file into parallel lists of sentences and tags.

    Each non-blank line is expected to hold whitespace-separated columns; the
    first column is taken as the word and the last column as its tag. Blank
    lines separate sentences and ``-DOCSTART-`` marker lines are skipped.

    Args:
        file_path: Path to the CoNLL-formatted text file.

    Returns:
        A dict with two keys:
            'sentences': list of sentences, each a list of word strings.
            'tags': list of tag lists, aligned one-to-one with 'sentences'.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (raised by ``open``).
    """
    sentences = []
    entities = []
    sentence = []
    tags = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Plain whitespace splitting is used instead of csv.reader: the csv
            # default quote character turned literal `"` tokens into a single
            # space, which then had to be filtered out (dropping real tokens).
            fields = line.split()

            if fields:
                if fields[0] != '-DOCSTART-':
                    sentence.append(fields[0])
                    tags.append(fields[-1])
                continue

            # Blank line: close the sentence collected so far, if any.
            if sentence:
                sentences.append(sentence)
                entities.append(tags)
            sentence = []
            tags = []

    # Files that do not end with a blank line still carry a pending sentence.
    if sentence:
        sentences.append(sentence)
        entities.append(tags)

    return {'sentences': sentences, 'tags': entities}

In [6]:
# Single checkpoint identifier shared by the config, tokenizer and encoder below.
# NOTE(review): an uncased checkpoint lower-cases input, which removes capitalisation
# cues — confirm this is intended for NER.
MODEL_NAME_OR_PATH = 'bert-base-uncased'

# Module-level objects that the dataset, data module and model classes read directly.
transformer_config = transformers.AutoConfig.from_pretrained(MODEL_NAME_OR_PATH)
transformer_tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
transformer_model = transformers.AutoModel.from_pretrained(MODEL_NAME_OR_PATH)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:
class NERDataSet(Dataset):
    """Dataset that tokenizes one pre-split sentence per item and aligns its tags.

    Every item is the tokenizer output for a sentence, extended with a
    ``"target_tags"`` entry holding one label id per produced token. Tokens
    that must not contribute to the loss carry the label ``-100``.
    """

    def __init__(self,
                examples,
                tokenizer: transformers.PreTrainedTokenizer,
                tag_encoder: sklearn.preprocessing.LabelEncoder,
                label_all_tokens: bool = False
                ):
        """Store the raw examples and the objects used to encode them.

        Args:
            examples: Mapping with 'sentences' (lists of words) and 'tags'
                (lists of tag strings), indexed in parallel.
            tokenizer: Called with ``is_split_into_words=True``; its result
                must expose ``word_ids()``.
            tag_encoder: Object whose ``transform`` maps tag strings to ids.
            label_all_tokens: When True, every sub-token of a word receives
                the word's label; when False only the first sub-token does.
        """
        self.sentences = examples['sentences']
        self.tags = examples['tags']
        self.tokenizer = tokenizer
        self.tag_encoder = tag_encoder
        self.label_all_tokens = label_all_tokens

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Tokenize sentence ``item`` and attach token-aligned label ids."""
        word_level_ids = self.tag_encoder.transform(self.tags[item])
        encoding = self.tokenizer(self.sentences[item], truncation=True, is_split_into_words=True)

        word_ids = encoding.word_ids()
        # Pair every token's word id with the word id of the token before it
        # (None for the first token) to detect where a new word starts.
        preceding_word_ids = [None] + word_ids[:-1]

        def aligned_label(current, preceding):
            # Special tokens have no word id and are ignored by the loss.
            if current is None:
                return -100
            # First sub-token of a word always gets the word's label; later
            # sub-tokens get it only when label_all_tokens is enabled.
            if current != preceding or self.label_all_tokens:
                return word_level_ids[current]
            return -100

        encoding["target_tags"] = [
            aligned_label(current, preceding)
            for current, preceding in zip(word_ids, preceding_word_ids)
        ]
        return encoding

In [8]:
class NERDataModule(pl.LightningDataModule):
    """Lightning data module serving the CoNLL train/valid/test splits for NER.

    The tag vocabulary and its encoder are built in ``__init__`` and the split
    files are read in ``setup``. ``prepare_data`` deliberately assigns no
    state: Lightning does not run that hook on every process, so attributes
    created there would be missing on the others.
    """

    def __init__(self, batch_size: int = 16, num_workers: int = 2):
        """Configure loaders and build the tag encoder.

        Args:
            batch_size: Number of sentences per batch for all three loaders.
            num_workers: Worker processes used by each DataLoader.
        """
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.tokenizer = transformer_tokenizer
        # Label value used to pad target sequences; matches the ignore value
        # the datasets assign to tokens that should not be scored.
        self.label_pad_token_id = -100

        # The tag set is static, so the encoder is available as soon as the
        # module is constructed.
        self.tag_complete = self.get_tag_scheme()
        self.tag_encoder = sklearn.preprocessing.LabelEncoder()
        self.tag_encoder.fit(self.tag_complete)

        # Filled in by setup().
        self.train_data = None
        self.val_data = None
        self.test_data = None

    def prepare_data(self):
        """Fail early if a split file is missing; assigns no state.

        Raises:
            FileNotFoundError: If train.txt, valid.txt or test.txt is absent
                from the working directory.
        """
        missing = [path for path in ('train.txt', 'valid.txt', 'test.txt') if not os.path.isfile(path)]
        if missing:
            raise FileNotFoundError(f"Missing CoNLL split file(s): {', '.join(missing)}")

    def setup(self, stage=None):
        """Read the split files (once) and wrap them in NERDataSet objects.

        Args:
            stage: Stage name passed by the trainer; unused, all three
                datasets are always built.
        """
        # setup() may be invoked once per stage; parse the files only once.
        if self.train_data is None:
            self.train_data = get_conll_data('train.txt')
            self.val_data = get_conll_data('valid.txt')
            self.test_data = get_conll_data('test.txt')

        # NOTE(review): validation/test also label every sub-token, so metrics
        # are computed over sub-tokens rather than words — confirm intended.
        self.train_dataset = NERDataSet(self.train_data, tokenizer=self.tokenizer, tag_encoder=self.tag_encoder, label_all_tokens=True)
        self.val_dataset = NERDataSet(self.val_data, tokenizer=self.tokenizer, tag_encoder=self.tag_encoder, label_all_tokens=True)
        self.test_dataset = NERDataSet(self.test_data, tokenizer=self.tokenizer, tag_encoder=self.tag_encoder, label_all_tokens=True)

    def get_tag_scheme(self):
        """Return the full tag list: 'O' followed by the B-/I- entity tags."""
        tag_scheme = [
            'B-PER',
            'I-PER',
            'B-ORG',
            'I-ORG',
            'B-LOC',
            'I-LOC',
            'B-MISC',
            'I-MISC'
        ]
        tag_outside = 'O'
        tag_complete = [tag_outside] + tag_scheme
        return tag_complete

    def custom_collate(self, features):
        """Pad a list of dataset items into one batch of int64 tensors.

        Args:
            features: Items produced by the dataset; when they carry a
                "target_tags" entry it is padded alongside the inputs.

        Returns:
            The padded batch. Without labels this is the tokenizer's padded
            output as tensors; with labels it is a dict of int64 tensors that
            also contains "target_tags".
        """
        label_name = "target_tags"
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        batch = self.tokenizer.pad(
            features,
            padding=True,
            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
            return_tensors="pt" if labels is None else None,
        )

        if labels is None:
            return batch

        # After padding every input row has the same length.
        sequence_length = len(batch["input_ids"][0])
        padding_side = self.tokenizer.padding_side
        # Pad the labels on the same side the tokenizer padded the inputs.
        if padding_side == "right":
            batch[label_name] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
        else:
            batch[label_name] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]

        batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}

        return batch

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return DataLoader(self.train_dataset, shuffle=True, batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.custom_collate)

    def val_dataloader(self):
        """Return the validation loader."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.custom_collate)

    def test_dataloader(self):
        """Return the test loader."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, collate_fn=self.custom_collate)

In [9]:
class NERModel(torch.nn.Module):
    """Transformer encoder followed by dropout and a linear per-token tag head.

    The encoder and its config are the module-level ``transformer_model`` and
    ``transformer_config`` objects, so every instance shares the same encoder.
    """

    def __init__(self,
                  n_tags: int, dropout: float = 0.1,
                 **kwargs):
        """Build the classification head on top of the shared encoder.

        Args:
            n_tags: Number of output classes per token.
            dropout: Dropout probability applied to the encoder output.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__()

        self.n_tags = n_tags
        self.transformer = transformer_model
        self.transformer_name = self.transformer.name_or_path
        self.transformer_config = transformer_config
        # The original first stored the float in self.dropout and immediately
        # replaced it with this module; only the module is kept.
        self.dropout = torch.nn.Dropout(dropout)
        self.tags = torch.nn.Linear(self.transformer_config.hidden_size, n_tags)

    def forward(self, batch) -> torch.Tensor:
        """Compute per-token tag logits.

        Args:
            batch: Mapping providing 'input_ids' and 'attention_mask'.

        Returns:
            Output of the linear head applied to the first element of the
            encoder output; its last dimension has size ``n_tags``.
        """
        encoder_outputs = self.transformer(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        hidden_state = encoder_outputs[0]
        logits = self.tags(self.dropout(hidden_state))

        return logits

In [10]:
class NERTokenClassifier(pl.LightningModule):
    """LightningModule that fine-tunes ``NERModel`` for token classification.

    Handles the training/validation/test steps, seqeval metric computation and
    the optimizer plus one-cycle learning-rate schedule.
    """

    def __init__(self, n_tags: int, learning_rate: float = 0.0001 * 8, **kwargs):
        """Create the model and metric and record hyperparameters.

        Args:
            n_tags: Number of tag classes.
            learning_rate: Initial Adam learning rate and peak of the
                one-cycle schedule. NOTE(review): the signature default is
                much larger than the 2e-5 argparse default — pass it explicitly.
            **kwargs: Additional hyperparameters; stored via
                ``save_hyperparameters`` and otherwise unused here.
        """
        super().__init__()
        self.save_hyperparameters()
        self.n_tags = n_tags
        self.metric = load_metric("seqeval")
        self.model = NERModel(n_tags=self.n_tags)

    def _forward_with_loss(self, batch):
        """Run the model on ``batch`` and return ``(logits, loss)``.

        Labels at positions where the attention mask is not 1 are replaced by
        the loss function's ignore index before the cross-entropy is computed.
        """
        target_tags = batch['target_tags']
        logits = self.model(batch)
        loss_fct = torch.nn.CrossEntropyLoss()
        is_attended = batch['attention_mask'].view(-1) == 1

        flat_logits = logits.view(-1, self.n_tags)
        flat_labels = torch.where(
            is_attended,
            target_tags.view(-1),
            torch.tensor(loss_fct.ignore_index).type_as(target_tags)
        )
        return logits, loss_fct(flat_logits, flat_labels)

    def _evaluation_step(self, batch, prefix):
        """Shared validation/test logic: log loss and seqeval metrics under ``prefix``.

        NOTE(review): metrics are computed per batch and then averaged by the
        logger, which is not the same as corpus-level seqeval scores.
        """
        logits, loss = self._forward_with_loss(batch)
        metrics = self.compute_metrics([logits, batch['target_tags']])
        self.log_dict({f'{prefix}_loss': loss, f'{prefix}_f1': metrics['f1'], f'{prefix}_accuracy': metrics['accuracy'],
                       f'{prefix}_precision': metrics['precision'], f'{prefix}_recall': metrics['recall']}, prog_bar=True)
        return loss

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss for one batch."""
        _, loss = self._forward_with_loss(batch)
        self.log_dict({'train_loss': loss}, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_nb):
        """Log val_loss, val_f1, val_accuracy, val_precision and val_recall."""
        return self._evaluation_step(batch, 'val')

    def test_step(self, batch, batch_nb):
        """Log test_loss, test_f1, test_accuracy, test_precision and test_recall."""
        return self._evaluation_step(batch, 'test')

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = None):
        """Return the logits together with the batch fields needed to decode them."""
        y_hat = self.model(batch)
        return {'logits': y_hat,
                'target_tags': batch['target_tags'],
                'input_ids': batch['input_ids'],
                'attention_mask': batch['attention_mask']
                }

    def configure_optimizers(self):
        """Build Adam and a per-step OneCycleLR schedule.

        The schedule peaks at ``hparams.learning_rate`` (previously a
        hard-coded value, which made the hyperparameter ineffective) and its
        length comes from the attached trainer's ``max_epochs`` (previously
        read from hparams, which failed when ``max_epochs`` was not passed to
        the constructor).
        """
        optimizer = torch.optim.Adam([p for p in self.parameters() if p.requires_grad], lr=self.hparams.learning_rate, eps=1e-08)
        scheduler = {
            'scheduler': torch.optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=self.hparams.learning_rate,
                steps_per_epoch=len(self.trainer.datamodule.train_dataloader()),
                epochs=self.trainer.max_epochs,
            ),
            # Step the scheduler after every optimizer step, not once per epoch.
            'interval': 'step'
        }
        return [optimizer], [scheduler]

    @staticmethod
    def add_model_specific_args(parent_parser, root_dir):
        """Extend ``parent_parser`` with --data_root and --learning_rate.

        Args:
            parent_parser: Parser whose arguments are inherited.
            root_dir: Base directory used for the --data_root default.

        Returns:
            The new ArgumentParser.
        """
        parser = ArgumentParser(parents=[parent_parser])
        parser.add_argument('--data_root', default=os.path.join(root_dir, 'train_val_data'), type=str)
        parser.add_argument('--learning_rate', default=2e-5, type=float, help = "type (default: %(default)f)")
        return parser

    def compute_metrics(self, p):
        """Compute seqeval precision/recall/F1/accuracy for one batch.

        Args:
            p: Pair ``(logits, labels)``; the arg-max over dimension 2 of the
                logits is taken as the prediction. Positions whose label is
                -100 are excluded.

        Returns:
            Dict with keys 'precision', 'recall', 'f1' and 'accuracy'.
        """
        predictions, labels = p
        # Convert to plain Python lists once; indexing device tensors element
        # by element forces a host/device round-trip per token.
        predicted_ids = torch.argmax(predictions, dim=2).tolist()
        gold_ids = labels.tolist()

        datamodule = self.trainer.datamodule
        label_len = len(datamodule.tag_complete)
        label_list = datamodule.tag_encoder.inverse_transform(np.arange(label_len)).tolist()

        true_predictions = [
            [label_list[pred] for (pred, gold) in zip(pred_row, gold_row) if gold != -100]
            for pred_row, gold_row in zip(predicted_ids, gold_ids)
        ]
        true_labels = [
            [label_list[gold] for gold in gold_row if gold != -100]
            for gold_row in gold_ids
        ]

        results = self.metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [11]:
from pytorch_lightning.loggers import WandbLogger

# Weights & Biases logger handed to the Trainer below; runs are grouped under this project.
wandb_logger = WandbLogger(
    project="BERT NER Sample",
)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
# Seed before anything is constructed: the classifier's linear head is randomly
# initialised when the model is built, so seeding afterwards (as before) left the
# initial weights different on every run.
seed_everything(42, workers=True)

root_dir = os.getcwd()
parent_parser = ArgumentParser(add_help=False)
parent_parser = pl.Trainer.add_argparse_args(parent_parser)
parser = NERTokenClassifier.add_model_specific_args(parent_parser,root_dir)

# Only the length of this list is used (as n_tags).
# NOTE(review): this duplicates NERDataModule.get_tag_scheme(); keep both in sync.
tag_scheme = [
          'B-PER',
          'I-PER',
          'B-ORG',
          'I-ORG',
          'B-LOC',
          'I-LOC',
          'B-MISC',
          'I-MISC',
          'O'
      ]

parser.set_defaults(
    #profiler='simple',
    deterministic=True,
    max_epochs=3,
    gpus=1,
    distributed_backend=None,
    fast_dev_run=False,
    model_load=False,
    model_name='best_model',
    n_tags = len(tag_scheme)
)

# parse_known_args tolerates the extra arguments the notebook kernel is started with.
args, extra = parser.parse_known_args()


# Either restore a saved classifier or build a fresh one from the parsed arguments.
if (vars(args)['model_load']):
  model = NERTokenClassifier.load_from_checkpoint(vars(args)['model_name'])
else:
  model = NERTokenClassifier(**vars(args))
print('n_tags',model.n_tags)


# Stop when val_loss has not improved for `patience` validation runs.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=3,
    verbose=True,
    mode='min',
    strict=True,
)

lr_monitor = LearningRateMonitor(logging_interval='step')

# Keep the checkpoint with the lowest val_loss.
checkpoint_callback = ModelCheckpoint(
     monitor='val_loss',
     #dirpath='my/path/',
     filename='conll-ner-epoch{epoch:02d}-val_loss{val_loss:.2f}',
     auto_insert_metric_name=False
)

trainer = Trainer.from_argparse_args(args,
    callbacks=[early_stop,lr_monitor,checkpoint_callback],
    logger=wandb_logger
    )

conll_dm = NERDataModule()

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Global seed set to 42


n_tags 9


In [None]:
# Train, then evaluate on the validation and test splits.
# The data module and checkpoint are passed explicitly instead of relying on state
# the trainer kept from fit(); "best" selects the checkpoint that checkpoint_callback
# saved for the lowest val_loss.
trainer.fit(model, conll_dm)
trainer.validate(datamodule=conll_dm, ckpt_path="best")
trainer.test(datamodule=conll_dm, ckpt_path="best")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type     | Params
-----------------------------------
0 | model | NERModel | 109 M 
-----------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.957   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Metric val_loss improved. New best score: 0.077


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: 0.056


In [None]:
import gradio as gr

## LOAD MODEL
# The demo uses the in-memory `model`, `transformer_tokenizer` and `conll_dm`
# objects created by the cells above, so those cells must have been run first.

def predict(sentence):
    """Tag a whitespace-separated sentence and return it as 'word/TAG' pairs.

    Args:
        sentence: Raw input text; it is split on whitespace into words.

    Returns:
        A single string of space-separated ``word/TAG`` items, one per input
        word that survived tokenizer truncation. The tag of a word is the
        prediction for its first sub-token. Empty input yields an empty string.
    """
    words = sentence.split()
    if not words:
        return ""

    encoding = transformer_tokenizer(words, truncation=True, is_split_into_words=True, return_tensors="pt")
    # Move the inputs to wherever the model's parameters currently live.
    batch = {name: tensor.to(model.device) for name, tensor in encoding.items()}

    model.eval()
    with torch.no_grad():
        logits = model.model(batch)

    predicted_ids = logits.argmax(dim=-1)[0].tolist()
    predicted_tags = conll_dm.tag_encoder.inverse_transform(predicted_ids)

    tagged_words = []
    seen_word_ids = set()
    for word_idx, tag in zip(encoding.word_ids(0), predicted_tags):
        # Skip special tokens and every sub-token after the first of a word.
        if word_idx is None or word_idx in seen_word_ids:
            continue
        seen_word_ids.add(word_idx)
        tagged_words.append(f"{words[word_idx]}/{tag}")

    return " ".join(tagged_words)

# The class is `gr.Interface`; the lowercase `gr.interface` used before does not exist.
iface = gr.Interface(fn=predict, inputs="text", outputs="text")
iface.launch()