In [None]:
from google.colab import drive

drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/MyDrive/custom-EM-BERT/prof_entity/lightning

/content/gdrive/MyDrive/custom-EM-BERT/prof_entity/lightning


In [None]:
!nvidia-smi

Mon May 24 09:40:15 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Set-up

In [1]:
pip install jsonlines datasets pytorch_lightning transformers lightning_transformers ipython-autotime deepspeed

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
import jsonlines
from typing import Any, Dict, List, Optional
from datasets import Dataset, load_dataset, DatasetDict
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import BackboneFinetuning, BaseFinetuning
from torch.optim.optimizer import Optimizer
from pytorch_lightning.callbacks import ModelPruning, EarlyStopping, ModelCheckpoint
from lightning_transformers.core.nlp import HFBackboneConfig, HFTransformerDataConfig, HFDataModule
from lightning_transformers.task.nlp.text_classification import (
    TextClassificationDataModule, TextClassificationTransformer)
from transformers import AutoTokenizer, PreTrainedTokenizerBase, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# Lightning Modules

In [None]:
# utilize Lightning Transformer's data module
# train, val and test datasets are processed together
# tokenizing is done within the module

class EntityMatchingDataModule(TextClassificationDataModule):
    def __init__(self,
                 cfg: HFTransformerDataConfig,
                 tokenizer: PreTrainedTokenizerBase,
                 train_data: pd.DataFrame,
                 val_data: pd.DataFrame,
                 test_data: pd.DataFrame):
        super().__init__(tokenizer, cfg)
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data

    def load_dataset(self) -> DatasetDict:
        return DatasetDict({
            'train': Dataset.from_pandas(train_data),
            'validation': Dataset.from_pandas(val_data),
            'test': Dataset.from_pandas(test_data)})

    def process_data(self, dataset, stage: Optional[str] = None) -> Dataset:
        dataset = EntityMatchingDataModule.preprocess(
            dataset,
            tokenizer=self.tokenizer,
            padding=self.cfg.padding,
            truncation=self.cfg.truncation,
            max_length=self.cfg.max_length,
        )
        cols_to_keep = [
            x for x in ["input_ids", "attention_mask", "token_type_ids", "labels"] if x in dataset["train"].features
        ]
        dataset.set_format("torch", columns=cols_to_keep)
        self.labels = dataset["train"].features["labels"]
        self.labels.num_classes = len(dataset['train']['labels'].unique())
        return dataset

    @staticmethod
    def convert_to_features(
        example_batch: Any, _, tokenizer: PreTrainedTokenizerBase, **tokenizer_kwargs
    ):
    # for our use case, we will have to tokenize our 2 examples for entity matching
        return tokenizer(example_batch['descA'],
                         example_batch['descB'],
                         padding=True,
                         truncation=True)

    @staticmethod
    def preprocess(ds: Dataset, **fn_kwargs) -> Dataset:
        ds = ds.map(
            # todo: change this to self.convert_to_features for users to override
            EntityMatchingDataModule.convert_to_features,
            batched=True,
            with_indices=True,
            fn_kwargs=fn_kwargs,
        )
        ds.rename_column_("label", "labels")
        return ds

In [None]:
# utilize Lightning Transformer's auto-model for transformers
# choose single learning rate or cyclic finder

class EntityMatcher(TextClassificationTransformer):
    def __init__(self, learning_rate=1e-5, max_lr=1e-3,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)

        # this is to initialize the backbone in this instance
        for k,v in kwargs.items():
            if k == 'backbone': 
                self.backbone = v

        self.lr = learning_rate
        self.max_lr = max_lr
    
    def forward(self, x): # for inference
        # import pdb; pdb.set_trace()
        input_ids = x['input_ids']
        token_type_ids = x['token_type_ids']
        attention_mask = x['attention_mask']
        return self.model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )

    def predict_step(self, batch, batch_idx: int, dataloader_idx: int = None):
        return self(batch)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-5)

        # cyclic learning rate finder
        # optimizer = torch.optim.AdamW(self.parameters())
        # return {
        #     'optimizer': optimizer,
        #     # cyclic LR not really necessary in this use-case, base LR is better
        #     'lr_scheduler': {
        #         'scheduler': torch.optim.lr_scheduler.CyclicLR(
        #             optimizer,
        #             base_lr=self.lr,
        #             max_lr=self.max_lr,
        #             mode='triangular',
        #             cycle_momentum=False),
        #         'interval': 'step',
        #         'frequency': 500,
        #         'monitor': 'val_loss'
        #     }
        # }

# change this line to utilize different models without the need to change tokenization
# BERT, roBERTa, XLNet, T5, GPT, etc.....
# refer to https://huggingface.co/transformers/pretrained_models.html
model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name)

# lightning uses huggingface backbone to accelerate training
model = EntityMatcher(
    backbone=HFBackboneConfig(pretrained_model_name_or_path=model_name)
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# create a custom BERT fine-tuning call-back 
# specify when to unfreeze backbone, etc....

# class MyBackboneFinetuning(BaseFinetuning):

#     def __init__(self, unfreeze_backbone_at_epoch: int = 5, train_bn: bool = True, backbone_lr: float = 1e-5):
#         self._unfreeze_backbone_at_epoch = unfreeze_backbone_at_epoch
#         self._train_bn = train_bn
#         self._backbone_lr = backbone_lr

#     def freeze_before_training(self, pl_module: pl.LightningModule):
#         self.freeze(pl_module.backbone, train_bn=self._train_bn)

#     def finetune_function(self, pl_module: pl.LightningModule, epoch: int, optimizer: Optimizer, opt_idx: int):
#         """Called on every epoch starts."""
#         if epoch == self.unfreeze_backbone_at_epoch:
#             self.unfreeze_and_add_param_group(
#                 pl_module.backbone,
#                 optimizer,
#                 lr=self._backbone_lr,
#                 train_bn=self.train_bn,
#             )

In [None]:
# Load data and push into dataloader
train_data = pd.read_csv('data/train.csv') 
val_data = pd.read_csv('data/val.csv')
test_data = pd.read_csv('data/test.csv')

data_loader = EntityMatchingDataModule(
    cfg=HFTransformerDataConfig(
        # num_workers=12,
        batch_size=8, # keep to max of 8, only use 16 with colab pro
        max_length=512),
    tokenizer=tokenizer,
    train_data=train_data,
    val_data=val_data,
    test_data= test_data
)

# Model Training

In [None]:
%load_ext autotime
# display time for each cell execution

time: 107 µs (started: 2021-05-24 08:41:07 +00:00)


In [None]:
#call-back functions

# stop training if accuracy don't improve in 5 epochs
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
cp = ModelCheckpoint(filename='best',monitor='val_accuracy')

# eliminate weights that contribute little to performance
prune = pl.callbacks.ModelPruning(
    pruning_fn='l1_unstructured',
    amount=0.05,
    use_global_unstructured=True,
    use_lottery_ticket_hypothesis=True)

time: 4.59 ms (started: 2021-05-24 08:41:09 +00:00)


In [None]:
trainer = pl.Trainer(
    gpus=1,
    precision=16, # change from fp32 to 16 for faster run-time
    max_epochs=15,
    progress_bar_refresh_rate=20, # slow down refresh rate for colab
    # auto_lr_find=True, # automatically find best lr
    # auto_scale_batch_size=True, # auto find largest batch size for the model
    stochastic_weight_avg=True, # similar to ensembling

    # plugins='deepspeed',
    # callbacks = [BackboneFinetuning(unfreeze_backbone_at_epoch=5, train_bn=True)]
    # callbacks = [BackboneFinetuning(unfreeze_backbone_at_epoch=5)]
    callbacks = [early_stopping, cp, prune]
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Using native 16bit precision.


time: 27.3 ms (started: 2021-05-24 08:41:14 +00:00)


In [None]:
trainer.tune(model, data_loader)

{}

time: 3.19 ms (started: 2021-05-24 08:41:16 +00:00)


In [None]:
trainer.fit(model, data_loader)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]






  | Name   | Type                          | Params
---------------------------------------------------------
0 | model  | BertForSequenceClassification | 109 M 
1 | prec   | Precision                     | 0     
2 | recall | Recall                        | 0     
3 | acc    | Accuracy                      | 0     
---------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

  value = torch.tensor(value, device=device, dtype=torch.float)






HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


time: 22min 3s (started: 2021-05-24 08:41:16 +00:00)


In [None]:
trainer.validate(model, data_loader)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

  value = torch.tensor(value, device=device, dtype=torch.float)


--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_accuracy': 0.8248772621154785,
 'val_loss': 0.6124130487442017,
 'val_precision': 0.8248772621154785,
 'val_recall': 0.8248772621154785}
--------------------------------------------------------------------------------


[{'val_accuracy': 0.8248772621154785,
  'val_loss': 0.6124130487442017,
  'val_precision': 0.8248772621154785,
  'val_recall': 0.8248772621154785}]

time: 15.1 s (started: 2021-05-24 09:03:21 +00:00)


In [None]:
# save weights to a checkpoint file for loading during inference
trainer.save_checkpoint('bert_final.ckpt')

time: 18.8 s (started: 2021-05-24 09:03:37 +00:00)


# Validate and Predict

In [None]:
# load test dataset and model saved
test_loader = data_loader.test_dataloader()
# new_model = EntityMatcher.load_from_checkpoint(checkpoint_path = 'lightning.ckpt')

predicted_values = trainer.predict(model, test_loader)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]







HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Predicting', layout=Layout(flex='2'), m…


time: 19.1 s (started: 2021-05-24 08:30:16 +00:00)


In [None]:
# Check classification report
predictions = torch.cat([p.logits for p in predicted_values], dim=0)
preds = predictions.softmax(dim=1).argmax(dim=1).cpu()
actuals = test_data['label'].to_list()

print(classification_report(actuals, preds))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87       526
           1       0.70      0.76      0.73       238

    accuracy                           0.83       764
   macro avg       0.80      0.81      0.80       764
weighted avg       0.83      0.83      0.83       764

time: 8.12 ms (started: 2021-05-24 08:30:35 +00:00)
