In [None]:
# Mount Google Drive at /content/gdrive so the project files are reachable from this runtime.
from google.colab import drive

drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
# Work from the project folder; later cells use relative paths such as 'data/test.csv' and 'bert_final.ckpt'.
%cd /content/gdrive/MyDrive/custom-EM-BERT/prof_entity/lightning

/content/gdrive/MyDrive/custom-EM-BERT/prof_entity/lightning


In [None]:
# Report the GPU attached to this runtime.
!nvidia-smi

Thu May 27 04:45:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Set-up

In [None]:
pip install jsonlines datasets pytorch_lightning transformers lightning_transformers ipython-autotime



In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
import jsonlines
from typing import Any, Dict, List, Optional
from datasets import Dataset, load_dataset, DatasetDict
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import BackboneFinetuning, BaseFinetuning
from torch.optim.optimizer import Optimizer
from pytorch_lightning.callbacks import ModelPruning, EarlyStopping, ModelCheckpoint
from lightning_transformers.core.nlp import HFBackboneConfig, HFTransformerDataConfig, HFDataModule
from lightning_transformers.task.nlp.text_classification import (
    TextClassificationDataModule, TextClassificationTransformer)
from transformers import AutoTokenizer, PreTrainedTokenizerBase, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# Lightning Modules

In [None]:
class EntityMatchingDataModule(TextClassificationDataModule):
    """Text-classification data module backed by in-memory pandas DataFrames.

    The frames are read for the columns ``descA`` and ``descB`` (tokenized
    together as a pair) and ``label`` (renamed to ``labels`` in ``preprocess``).
    """

    def __init__(self,
                 cfg: HFTransformerDataConfig,
                 tokenizer: PreTrainedTokenizerBase,
                 train_data: pd.DataFrame,
                 val_data: pd.DataFrame,
                 test_data: pd.DataFrame):
        """Keep the three splits on the instance and pass ``tokenizer``/``cfg`` to the parent."""
        super().__init__(tokenizer, cfg)
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data

    def load_dataset(self) -> DatasetDict:
        """Return the stored frames as a ``DatasetDict`` with train/validation/test splits."""
        # Use the frames given to __init__. Reading the bare names `train_data`,
        # `val_data`, `test_data` would resolve to notebook globals, which may be
        # undefined or hold different data than this instance was built with.
        return DatasetDict({
            'train': Dataset.from_pandas(self.train_data),
            'validation': Dataset.from_pandas(self.val_data),
            'test': Dataset.from_pandas(self.test_data)})

    def process_data(self, dataset, stage: Optional[str] = None) -> Dataset:
        """Tokenize ``dataset``, expose it as torch tensors and record label metadata.

        Args:
            dataset: the splits to process; must contain a ``"train"`` split.
            stage: accepted for interface compatibility; not used here.

        Returns:
            The processed dataset, formatted as torch tensors over the model
            input columns that are present.
        """
        dataset = EntityMatchingDataModule.preprocess(
            dataset,
            tokenizer=self.tokenizer,
            padding=self.cfg.padding,
            truncation=self.cfg.truncation,
            max_length=self.cfg.max_length,
        )
        # Only keep columns the tokenizer actually produced (e.g. token_type_ids
        # is absent for some tokenizers).
        cols_to_keep = [
            x for x in ["input_ids", "attention_mask", "token_type_ids", "labels"] if x in dataset["train"].features
        ]
        dataset.set_format("torch", columns=cols_to_keep)
        self.labels = dataset["train"].features["labels"]
        # Number of classes = number of distinct label values in the train split.
        self.labels.num_classes = len(dataset['train']['labels'].unique())
        return dataset

    @staticmethod
    def convert_to_features(
        example_batch: Any, _, tokenizer: PreTrainedTokenizerBase, **tokenizer_kwargs
    ):
        """Tokenize a batch of (descA, descB) pairs.

        Args:
            example_batch: batch providing the ``descA`` and ``descB`` columns.
            _: row indices (``preprocess`` maps with ``with_indices=True``); unused.
            tokenizer: tokenizer applied to the pairs.
            **tokenizer_kwargs: accepted but NOT forwarded; padding and
                truncation are fixed to ``True`` below.
        """
        return tokenizer(example_batch['descA'],
                         example_batch['descB'],
                         padding=True,
                         truncation=True)

    @staticmethod
    def preprocess(ds: Dataset, **fn_kwargs) -> Dataset:
        """Tokenize every split and rename the ``label`` column to ``labels``.

        ``fn_kwargs`` are forwarded to ``convert_to_features`` by ``map``.
        """
        ds = ds.map(
            # todo: change this to self.convert_to_features for users to override
            EntityMatchingDataModule.convert_to_features,
            batched=True,
            with_indices=True,
            fn_kwargs=fn_kwargs,
        )
        # rename_column returns a new object; it replaces the deprecated
        # in-place rename_column_.
        ds = ds.rename_column("label", "labels")
        return ds

In [None]:
class EntityMatcher(TextClassificationTransformer):
    """Text-classification transformer with its own forward pass and optimizer setup.

    Args:
        learning_rate: learning rate used by the AdamW optimizer.
        max_lr: stored as ``self.max_lr``; only referenced by the disabled
            cyclic-LR configuration kept as a comment in ``configure_optimizers``.
        *args, **kwargs: forwarded unchanged to ``TextClassificationTransformer``.
    """

    def __init__(self, learning_rate=1e-5, max_lr=1e-3,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)

        # this is to initialize the backbone in this instance
        if 'backbone' in kwargs:
            self.backbone = kwargs['backbone']

        self.lr = learning_rate
        self.max_lr = max_lr

    def forward(self, x): # for inference
        """Run the wrapped model on a tokenized batch.

        Args:
            x: mapping with ``input_ids`` and ``attention_mask``; ``token_type_ids``
                is passed along only when present.

        Returns:
            Whatever ``self.model`` returns for these inputs.
        """
        model_inputs = {
            'input_ids': x['input_ids'],
            'attention_mask': x['attention_mask'],
        }
        # token_type_ids is optional: the data module only keeps that column
        # when the tokenizer produced it.
        if 'token_type_ids' in x:
            model_inputs['token_type_ids'] = x['token_type_ids']
        return self.model(**model_inputs)

    def predict_step(self, batch, batch_idx: int, dataloader_idx: Optional[int] = None):
        """Prediction step: delegate to ``forward``."""
        return self(batch)

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters using ``self.lr``."""
        # Use the configured learning rate (default 1e-5) instead of a literal,
        # so the `learning_rate` constructor argument actually takes effect.
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

        # cyclic learning rate finder
        # optimizer = torch.optim.AdamW(self.parameters())
        # return {
        #     'optimizer': optimizer,
        #     # cyclic LR not really necessary in this use-case, base LR is better
        #     'lr_scheduler': {
        #         'scheduler': torch.optim.lr_scheduler.CyclicLR(
        #             optimizer,
        #             base_lr=self.lr,
        #             max_lr=self.max_lr,
        #             mode='triangular',
        #             cycle_momentum=False),
        #         'interval': 'step',
        #         'frequency': 500,
        #         'monitor': 'val_loss'
        #     }
        # }

# Predict

In [None]:
# Load the autotime extension: the "time: ..." line under each cell below comes from it.
%load_ext autotime
# display time for each cell execution

time: 62 µs (started: 2021-05-27 04:45:32 +00:00)


In [None]:
# Load the test split from CSV (path is relative to the current working directory).
test_data = pd.read_csv('data/test.csv')

test_data.shape

(764, 4)

time: 32.6 ms (started: 2021-05-27 04:45:32 +00:00)


In [None]:
# data_loader = EntityMatchingDataModule(
#     cfg=HFTransformerDataConfig(
#         # num_workers=12,
#         batch_size=8, # keep to max of 8, only use 16 with colab pro
#         max_length=512),
#     tokenizer=tokenizer,
#     train_data=train_data,
#     val_data=val_data,
#     test_data= test_data)

time: 1.05 ms (started: 2021-05-27 04:45:32 +00:00)


In [None]:
# Restore an EntityMatcher from the saved checkpoint file.
model = EntityMatcher.load_from_checkpoint(checkpoint_path = 'bert_final.ckpt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

time: 50.2 s (started: 2021-05-27 04:45:32 +00:00)


In [None]:
# call the pipeline from the model -- for inference
# https://huggingface.co/transformers/main_classes/pipelines.html#transformers.TextClassificationPipeline

# Read the property once and display the object that later cells will use.
pipeline = model.hf_pipeline
pipeline

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7fb6f2bdb3d0>

time: 5.29 ms (started: 2021-05-27 04:46:22 +00:00)


In [None]:
# # call setup to initiate data loader without training step -- for inference
# data_loader.setup()

# test_loader = data_loader.test_dataloader()

time: 1.28 ms (started: 2021-05-27 04:46:22 +00:00)


In [None]:
# Build one input string per test row from the two descriptions.
# - A single space separates descA and descB so the last word of one is not
#   fused with the first word of the other.
# - Missing descriptions are treated as empty strings instead of raising.
# - No manual length cut here: truncation is requested when the pipeline is called.
# NOTE(review): EntityMatchingDataModule tokenizes (descA, descB) as a pair,
# while this cell feeds one concatenated string per row -- confirm this
# difference between training and inference inputs is intended.
text_list = (
    test_data['descA'].fillna('').astype(str)
    + ' '
    + test_data['descB'].fillna('').astype(str)
).tolist()

print(len(text_list))

764
time: 69.4 ms (started: 2021-05-27 04:46:22 +00:00)


In [None]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    Args:
        lst: any sequence supporting ``len()`` and slicing (list, str, ...).
        n: chunk size; must be a positive integer.

    Yields:
        Slices ``lst[i:i + n]``; the last one may be shorter than ``n``.

    Raises:
        ValueError: if ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently drop all data.
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

# Materialize the batches: a list whose elements are lists of up to 8 texts.
dataloader = list(chunks(text_list, 8))

time: 8.78 ms (started: 2021-05-27 04:48:52 +00:00)


In [None]:
# Sanity check: size of the first batch.
print(len(dataloader[0]))

8
time: 1.08 ms (started: 2021-05-27 04:49:16 +00:00)


In [None]:
# https://huggingface.co/transformers/task_summary.html#sequence-classification
# Run the pipeline batch by batch and collect its per-text results.
# NOTE(review): the call no longer passes tokenizer='bert-base-uncased' /
# model='bert-base-uncased'. Those are options for constructing a pipeline;
# `pipeline` here was already obtained from the loaded model -- confirm.
pred_list = []
for batch in dataloader:
  # extend() appends in place; `pred_list = pred_list + ...` copied the
  # whole accumulated list on every iteration.
  pred_list.extend(pipeline(batch, truncation=True, padding=True))

time: 3min 42s (started: 2021-05-27 04:50:59 +00:00)


In [None]:
# Keep only the 'label' entry of each pipeline result, in order.
labels = [prediction['label'] for prediction in pred_list]

time: 2.91 ms (started: 2021-05-27 04:57:22 +00:00)


In [None]:
# Attach the raw pipeline labels as a new 'pred' column (modifies test_data in place).
test_data['pred'] = labels
test_data.head()

Unnamed: 0.1,Unnamed: 0,descA,descB,label,pred
0,133,These clinical symptoms commonly occur in mult...,“BlackThorn is focused on developing a new gen...,1,LABEL_0
1,2774,"The innovation labs we have funded, have creat...",Approximately 60% of these jobs are held by Si...,0,LABEL_0
2,3102,"Mobile by design, Tabit is bringing the smartp...",At the 400 restaurants and cafes in Israel and...,0,LABEL_0
3,537,LeaseLock secured $10M in Series A financing f...,"MARINA DEL REY, Calif., Aug 22, 2019 /PRNewswi...",1,LABEL_1
4,1338,"Same-day services, including order pickups and...",And offline store sales actually grew 10% at a...,0,LABEL_0


time: 29.7 ms (started: 2021-05-27 04:57:25 +00:00)


In [None]:
# Translate the pipeline's label names into the integer classes used in the 'label' column.
label_dict = {'LABEL_0':0, 'LABEL_1':1}
test_data['preds'] = test_data['pred'].map(label_dict)
# .map() yields NaN for any label missing from label_dict; stop here rather
# than carrying NaN predictions into the saved file and the report.
assert test_data['preds'].notna().all(), "test_data['pred'] contains labels missing from label_dict"
test_data.head()

Unnamed: 0.1,Unnamed: 0,descA,descB,label,pred,preds
0,133,These clinical symptoms commonly occur in mult...,“BlackThorn is focused on developing a new gen...,1,LABEL_0,0
1,2774,"The innovation labs we have funded, have creat...",Approximately 60% of these jobs are held by Si...,0,LABEL_0,0
2,3102,"Mobile by design, Tabit is bringing the smartp...",At the 400 restaurants and cafes in Israel and...,0,LABEL_0,0
3,537,LeaseLock secured $10M in Series A financing f...,"MARINA DEL REY, Calif., Aug 22, 2019 /PRNewswi...",1,LABEL_1,1
4,1338,"Same-day services, including order pickups and...",And offline store sales actually grew 10% at a...,0,LABEL_0,0


time: 21 ms (started: 2021-05-27 04:57:28 +00:00)


In [None]:
# Persist the test rows together with the 'pred' and 'preds' columns added above.
test_data.to_csv('data/predictions.csv')

time: 230 ms (started: 2021-05-27 04:57:30 +00:00)


In [None]:
# Compare the ground-truth 'label' column with the integer predictions in 'preds'.
actuals = test_data['label']
preds = test_data['preds']
print(classification_report(actuals, preds))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       526
           1       0.74      0.57      0.64       238

    accuracy                           0.80       764
   macro avg       0.78      0.74      0.75       764
weighted avg       0.80      0.80      0.80       764

time: 10.8 ms (started: 2021-05-27 04:57:32 +00:00)
