In [21]:
import lightning as L
import torch
import torchmetrics


In [22]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [23]:
# Use the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")


In [24]:
# Register four extra special tokens with the tokenizer.
# NOTE(review): presumably these mark the start/end of the two entity spans in each
# sentence — confirm against how the CSV sentences were annotated.
special_tokens_dict = {'additional_special_tokens': ['[E1]', '[/E1]', '[E2]', '[/E2]']}
# Last expression of the cell: the return value (number of tokens added) is displayed.
tokenizer.add_special_tokens(special_tokens_dict)

4

In [25]:
from transformers import AutoModelForSequenceClassification

# Build a sequence classifier with 4 output labels from the pretrained checkpoint.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
# Resize the embedding matrix to the tokenizer's current length so the special tokens
# added to the tokenizer have embedding rows.
model.resize_token_embeddings(len(tokenizer))
# Last expression of the cell, so the full module repr is displayed as the cell output.
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin

In [27]:
class CustomLightningModule(L.LightningModule):
  """Lightning wrapper around a sequence-classification model.

  The wrapped ``model`` is called as
  ``model(input_ids, attention_mask=..., labels=...)`` and its result is indexed
  with ``"loss"`` and ``"logits"``. Batches are expected to be mappings with
  the keys ``"input_ids"``, ``"attention_mask"`` and ``"relations_id"`` (the
  class labels).

  Args:
    model: The classifier to train and evaluate.
    learning_rate: Step size passed to Adam in ``configure_optimizers``.
  """

  def __init__(self, model, learning_rate=10e-12):
    super().__init__()
    # NOTE: save_hyperparameters() is not called, so the constructor arguments
    # are not recorded as hyperparameters.
    # NOTE(review): the default 10e-12 equals 1e-11, which is far below common
    # fine-tuning rates. It is left unchanged so callers relying on the default
    # see the same behaviour — confirm that it is intended.
    self.learning_rate = learning_rate
    self.model = model

    # Separate metric objects so validation and test accuracy never share state.
    self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=4)
    self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=4)

  def forward(self, input_ids, attention_mask, labels=None):
    """Run the wrapped model and return its output unchanged.

    ``labels`` is optional so the module can also be called for label-free
    inference; it is forwarded to the wrapped model as-is.
    """
    return self.model(input_ids, attention_mask=attention_mask, labels=labels)

  def _forward_batch(self, batch):
    """Call ``forward`` on a batch mapping, using ``relations_id`` as labels."""
    return self(batch["input_ids"], attention_mask=batch["attention_mask"],
                labels=batch["relations_id"])

  def _log_accuracy(self, metric, name, outputs, batch):
    """Update ``metric`` with argmax predictions and log it under ``name``."""
    predicted_labels = torch.argmax(outputs["logits"], 1)
    metric(predicted_labels, batch["relations_id"])
    # The metric object itself is logged, not a computed value.
    self.log(name, metric, prog_bar=True)

  def training_step(self, batch, batch_idx):
    """Log ``train_loss`` and return the loss for optimisation."""
    outputs = self._forward_batch(batch)
    loss = outputs["loss"]
    self.log("train_loss", loss)
    return loss  # this is passed to the optimizer for training

  def validation_step(self, batch, batch_idx):
    """Log ``val_loss`` and ``val_acc`` for one validation batch."""
    outputs = self._forward_batch(batch)
    self.log("val_loss", outputs["loss"], prog_bar=True)
    self._log_accuracy(self.val_acc, "val_acc", outputs, batch)

  def test_step(self, batch, batch_idx):
    """Log ``accuracy`` for one test batch (the loss is not logged)."""
    outputs = self._forward_batch(batch)
    self._log_accuracy(self.test_acc, "accuracy", outputs, batch)

  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters at ``self.learning_rate``."""
    optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
    return optimizer
  



In [28]:
# Wrap the classifier in the Lightning module; learning_rate keeps its default value.
lightning_model = CustomLightningModule(model)

In [29]:
# Bare expression: displays the wrapped module's structure as the cell output.
lightning_model

CustomLightningModule(
  (model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30526, 768)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FF

In [44]:
# Checkpoint file whose weights are restored by trainer.test(ckpt_path=path);
# the path is relative to the notebook's working directory.
path = "logs/my-model/version_0/checkpoints/epoch=0-step=18.ckpt"

In [31]:
from datasets import load_dataset

In [32]:
# Only a "test" split is loaded here; it is read from the local file test.csv.
annotated_dataset = load_dataset("csv", data_files={"test": "test.csv"})

Found cached dataset csv (/home/programmer/.cache/huggingface/datasets/csv/default-11cb6aee51d14a62/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
# Raw (untokenized) "test" split. NOTE: `test_dataset` is rebound in a later cell to
# the torch Dataset wrapper built from `data_tokenized`.
test_dataset = annotated_dataset["test"] 

In [35]:
def tokenize_text(batch):
  """Tokenize the ``"sentences"`` entry of ``batch`` with the notebook's tokenizer.

  Truncation and padding are both switched on; whatever the tokenizer returns
  is handed back unchanged.
  """
  sentences = batch["sentences"]
  encoded = tokenizer(sentences, truncation=True, padding=True)
  return encoded


# Apply tokenize_text to every split. With batched=True and batch_size=None each split
# is passed to the function as a single batch, so padding=True pads to the longest
# sentence of the whole split.
data_tokenized = annotated_dataset.map(tokenize_text, batched=True, batch_size=None)

Loading cached processed dataset at /home/programmer/.cache/huggingface/datasets/csv/default-11cb6aee51d14a62/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-c12a3a05d9c11446.arrow


In [36]:
# Have indexing return torch tensors for these three columns — the ones the
# Lightning module reads from each batch.
data_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "relations_id"])

In [37]:
import os
# Turn off the tokenizers library's own parallelism.
# NOTE(review): presumably set because the DataLoader uses worker processes — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [38]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset


# NOTE: this class reuses the name `annotated_dataset`, which an earlier cell binds to
# the loaded dataset dict; after this cell runs, the name refers to the class.
class annotated_dataset(Dataset):
  """Map-style torch dataset that exposes a single split of a split-keyed mapping."""

  def __init__(self, dataset_dict, partition_key="train"):
    """Keep a reference to ``dataset_dict[partition_key]``."""
    selected_split = dataset_dict[partition_key]
    self.partition = selected_split

  def __len__(self):
    """Report the split's ``num_rows``."""
    selected_split = self.partition
    return selected_split.num_rows

  def __getitem__(self, index):
    """Delegate item lookup to the underlying split."""
    selected_split = self.partition
    return selected_split[index]


In [39]:
# Wrap the tokenized "test" split so a torch DataLoader can index it.
test_dataset = annotated_dataset(data_tokenized, partition_key="test")


In [40]:
# Batches of 32 examples, loaded by 4 worker processes; shuffle is left at its default.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    num_workers=4
)

In [42]:
# Trainer with all-default settings; it is only used for evaluation in this notebook.
trainer = L.Trainer()

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [45]:
# Evaluate on the test loader after restoring the weights stored in the checkpoint at
# `path`; the returned list of metric dicts is shown as the cell output.
trainer.test(lightning_model, 
             dataloaders=test_loader, 
             ckpt_path=path, 
             verbose=True
            )

You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=0-step=18.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=0-step=18.ckpt


Testing: 0it [00:00, ?it/s]

[{'accuracy': 0.2222222238779068}]