In [1]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer
import torch
import pytorch_lightning as pl

In [2]:
# corpus of linguistic acceptability
cola_dataset = load_dataset("glue", "cola")
cola_dataset

Reusing dataset glue (/home/ben/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [3]:
for i in range(200, 210):
    print(cola_dataset["train"][i])

{'sentence': 'The more he reads, the more books I wonder to whom he will give.', 'label': 0, 'idx': 200}
{'sentence': 'The more he reads, the more people I wonder what he will give to.', 'label': 0, 'idx': 201}
{'sentence': 'The sooner you call, the more carefully I know a man that will word the letter.', 'label': 0, 'idx': 202}
{'sentence': 'The richer John gets, the more geniuses John meets.', 'label': 1, 'idx': 203}
{'sentence': 'The richer he gets, the more John meets geniuses.', 'label': 0, 'idx': 204}
{'sentence': 'The more articles he reads, the fewer people he thinks will go into linguistics.', 'label': 1, 'idx': 205}
{'sentence': 'The more articles he reads, the fewer people he thinks that will go into linguistics.', 'label': 0, 'idx': 206}
{'sentence': 'The more articles he reads, the fewer people he thinks that under the current circumstances will go into linguistics.', 'label': 1, 'idx': 207}
{'sentence': 'The more articles he reads, the fewer people he thinks under the cur

In [4]:
class COLADataModule(pl.LightningDataModule):
    
    def __init__(self, model_name, batch_size=32):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        datasets = load_dataset("glue", "cola")
        self.train_data = datasets["train"]
        self.valid_data = datasets["validation"]
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
    def tokenize(self, sentence):
        return self.tokenizer(
            sentence["sentence"],
            truncation=True,
            padding="max_length",
            max_length=256,
        )
    
    def setup(self, stage=None):
        self.train_data = self.train_data.map(self.tokenize, batched=True)
        self.valid_data = self.valid_data.map(self.tokenize, batched=True)
        self.train_data.set_format(
            type="torch", columns=["input_ids", "attention_mask", "label"]
        )
        self.valid_data.set_format(
            type="torch", columns=["input_ids", "attention_mask", "label"]
        )
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True)
    
    def valid_dataloader(self):
        return torch.utils.data.DataLoader(self.valid_data, batch_size=self.batch_size, shuffle=True)

In [5]:
class COLAModel(pl.LightningModule):
    
    def __init__(self, model_name, lr=1e-2):
        super().__init__()
        self.save_hyperparameters()
        self.model_name = model_name
        self.lr = lr
        self.lm = AutoModel.from_pretrained(model_name)
        self.final_layer = torch.nn.Linear(self.lm.config.hidden_size, 2)
        self.num_classes = 2
        
    def forward(self, input_ids, attention_mask):
        embeddings = self.lm(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = embeddings.last_hidden_state[:, 0]  # classifier token
        logits = self.final_layer(last_hidden_state)
        return logits
    
    def training_step(self, batch, batch_idx):
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        loss = torch.nn.functional.cross_entropy(logits, batch["label"])
        self.log("train_loss", loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch["input_ids"], batch["attention_mask"])
        loss = torch.nn.functional.cross_entropy(logits, batch["label"])
        _, preds = torch.max(logits, dim=1)
        val_acc = accuracy_score(preds.cpu(), batch["label"].cpu())
        val_acc = torch.tensor(val_acc)
        self.log("valid_loss", loss, prog_bar=True)
        self.log("valid_acc", val_acc, prog_bar=True)
    
    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

In [6]:
model_name = "google/bert_uncased_L-2_H-128_A-2"
cola_data = COLADataModule(model_name=model_name)
cola_model = COLAModel(model_name=model_name)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="./models", monitor="train_loss", mode="min"
)

trainer = pl.Trainer(
    default_root_dir="logs",
    logger=pl.loggers.TensorBoardLogger("logs/", name="cola", version=1),
    gpus=(1 if torch.cuda.is_available() else 0),
    max_epochs=5,
    fast_dev_run=False,
    callbacks=[checkpoint_callback]
)
trainer.fit(cola_model, cola_data)


Reusing dataset glue (/home/ben/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPU

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type      | Params
------------------------------------------
0 | lm          | BertModel | 4.4 M 
1 | final_layer | Linear    | 258   
------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.545    Total estimated model params size (MB)
2022-05-16 21:58:22.938148: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-16 21:58:22.938170: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  rank_zero_warn(


Training: 0it [00:00, ?it/s]



In [7]:
class COLAPredictor:
    
    def __init__(self, model_path):
        self.model_path = model_path
        self.model = COLAModel.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()
        self.processor = COLADataModule(model_name=model_name)
        self.softmax = torch.nn.Softmax(dim=0)
        self.labels = ["unacceptable", "acceptable"]
        
    def predict(self, text):
        inference_sample = {"sentence": text}
        processed = self.processor.tokenize(inference_sample)
        logits = self.model(
            torch.tensor([processed["input_ids"]]),
            torch.tensor([processed["attention_mask"]]),
        )
        scores = self.softmax(logits[0]).tolist()
        predictions = []
        for score, label in zip(scores, self.labels):
            predictions.append({"label": label, "score": score})
        return predictions

In [8]:
predictor = COLAPredictor("models/epoch=1-step=536.ckpt")
predictor.predict("Thiasdfs iasdfs asdf asdfasentence.")

Some weights of the model checkpoint at google/bert_uncased_L-2_H-128_A-2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Reusing dataset glue (/home/ben/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72

  0%|          | 0/3 [00:00<?, ?it/s]

[{'label': 'unacceptable', 'score': 0.254064679145813},
 {'label': 'acceptable', 'score': 0.745935320854187}]

In [9]:
cola_data

<__main__.COLADataModule at 0x7fb7df990a30>