In [25]:
!pip install lightning-bolts



In [26]:
!pip install pytorch-lightning



In [27]:
!pip install torch torchvision torchaudio



In [28]:
!pip install torchmetrics



In [29]:
# Mount Google Drive at /content/drive (Colab-only API).
# NOTE(review): none of the cells visible in this notebook read from
# /content/drive -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
!gdown --id 1P-v4kMVqxIXs1CYjEkQgiYGw24L06HXp -O local_dataset_utilities.py

Downloading...
From (original): https://drive.google.com/uc?id=1P-v4kMVqxIXs1CYjEkQgiYGw24L06HXp
From (redirected): https://drive.google.com/uc?id=1P-v4kMVqxIXs1CYjEkQgiYGw24L06HXp&confirm=t&uuid=55116552-3807-42cb-89d6-ff708e8d4200
To: /content/local_dataset_utilities.py
100% 2.98k/2.98k [00:00<00:00, 15.6MB/s]


In [31]:
!gdown --id 1invn8q15DgSSjzVW4RlfPiyCqUTeqdfq -O local_utilities.py

Downloading...
From (original): https://drive.google.com/uc?id=1invn8q15DgSSjzVW4RlfPiyCqUTeqdfq
From (redirected): https://drive.google.com/uc?id=1invn8q15DgSSjzVW4RlfPiyCqUTeqdfq&confirm=t&uuid=65fa34ab-edbf-442e-b3f4-da24d9c676fa
To: /content/local_utilities.py
100% 2.46k/2.46k [00:00<00:00, 10.3MB/s]


In [32]:
!pip install datasets



In [33]:
import os.path as op

from datasets import load_dataset

import pytorch_lightning as L
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import numpy as np
import pandas as pd
import torch

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [34]:
# Run the three helpers imported from local_dataset_utilities in order.
# NOTE(review): judging by their names they download the raw data, collect it
# into one DataFrame and split it; the train.csv / val.csv / test.csv files
# read by the following cells are presumably written by partition_dataset --
# confirm in local_dataset_utilities.py.
download_dataset()

df = load_dataset_into_to_dataframe()
partition_dataset(df)

100%|██████████| 50000/50000 [00:53<00:00, 941.24it/s]


Class distribution: [25000 25000]


In [35]:
# Load the three partition files as DataFrames, in train / val / test order.
df_train, df_val, df_test = map(pd.read_csv, ('train.csv', 'val.csv', 'test.csv'))

**2. Tokenization**

In [36]:
# Read the three CSV partitions into a single dataset object keyed by split.
imdb_dataset = load_dataset(
    'csv',
    data_files=dict(train='train.csv', val='val.csv', test='test.csv'),
)

print(imdb_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    val: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


In [37]:
from transformers import AutoTokenizer

In [38]:
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

In [39]:
# Report the tokenizer's model_max_length and vocab_size attributes.
print("tokenizer input max length:", tokenizer.model_max_length)
print("tokenizer vocabulary size:", tokenizer.vocab_size)

tokenizer input max length: 512
tokenizer vocabulary size: 30522


In [40]:
def tokenize_text(batch):
    """Tokenize the ``text`` entry of ``batch`` with the notebook's ``tokenizer``.

    Padding and truncation are both switched on in the tokenizer call; the
    tokenizer's result is returned unchanged.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [41]:
# Tokenize every split in batched mode.
imdb_tokenized = imdb_dataset.map(
    tokenize_text,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [42]:
# Drop the notebook's reference to the untokenized dataset; the cells below
# work with imdb_tokenized only.
del imdb_dataset

In [43]:
# Expose only the three columns the training steps read, in "torch" format.
imdb_tokenized.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [44]:
import os
# Set before the multi-worker DataLoaders are created.
# NOTE(review): presumably this avoids the tokenizers library's
# fork-after-parallelism warning once worker processes start -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

**3. Set up dataloaders**

In [45]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Expose one split of a dataset dictionary through the ``Dataset`` interface.

    ``dataset_dict[partition_key]`` is stored as ``self.partition``; that object
    must support indexing and provide a ``num_rows`` attribute.

    Note: this definition replaces the ``IMDBDataset`` name imported from
    ``local_dataset_utilities`` earlier in the notebook.
    """

    def __init__(self, dataset_dict, partition_key="train"):
        selected_split = dataset_dict[partition_key]
        self.partition = selected_split

    def __len__(self):
        """Return the ``num_rows`` of the wrapped split."""
        split = self.partition
        return split.num_rows

    def __getitem__(self, index):
        """Return whatever the wrapped split yields for ``index``."""
        split = self.partition
        return split[index]

In [46]:
# Wrap each tokenized split so it can be handed to a DataLoader.
train_dataset, val_dataset, test_dataset = (
    IMDBDataset(imdb_tokenized, partition_key=split_name)
    for split_name in ("train", "val", "test")
)

In [47]:
# Settings shared by all three loaders.
_common_loader_args = {"batch_size": 12, "num_workers": 4}

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **_common_loader_args)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, **_common_loader_args)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, **_common_loader_args)

**4. Initializing DistilBERT**

In [48]:
from transformers import AutoModelForSequenceClassification

# Pretrained DistilBERT checkpoint configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**5. Finetuning**

In [49]:
import torchmetrics

In [50]:
class LightningModel(L.LightningModule):
    """Lightning module that fine-tunes a two-class sequence classifier.

    The wrapped ``model`` is called with ``input_ids``, ``attention_mask`` and
    ``labels``; its output is indexed with ``"loss"`` and ``"logits"``.

    Args:
        model: Network to fine-tune.
        learning_rate: Learning rate passed to the Adam optimizer.
    """

    def __init__(self, model, learning_rate=5e-5):
        super().__init__()

        self.learning_rate = learning_rate
        self.model = model

        # One metric object per stage so their running state never mixes.
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its output unchanged.

        ``labels`` is optional so the module can also be called for
        prediction; it is forwarded to the wrapped model as given.
        NOTE(review): what the wrapped model returns for ``"loss"`` when
        ``labels`` is ``None`` depends on that model -- confirm before use.
        """
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)

    def _forward_batch(self, batch):
        """Unpack a batch dict and run it through ``forward`` with its labels."""
        return self(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["label"],
        )

    def _update_accuracy(self, metric, outputs, batch):
        """Feed ``metric`` the argmax over dim 1 of the logits and the batch labels."""
        predicted_labels = torch.argmax(outputs["logits"], 1)
        metric(predicted_labels, batch["label"])

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` for the batch and return the loss."""
        outputs = self._forward_batch(batch)
        self.log("train_loss", outputs["loss"])
        return outputs["loss"]  # this is passed to the optimizer for training

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` and update/log ``val_acc`` for the batch."""
        outputs = self._forward_batch(batch)
        self.log("val_loss", outputs["loss"], prog_bar=True)

        self._update_accuracy(self.val_acc, outputs, batch)
        # The metric object itself is logged, not a computed value.
        self.log("val_acc", self.val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Update the test accuracy for the batch and log it as ``accuracy``."""
        outputs = self._forward_batch(batch)

        self._update_accuracy(self.test_acc, outputs, batch)
        self.log("accuracy", self.test_acc, prog_bar=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters of this module."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer


# Wrap the classifier created above; learning_rate keeps its default of 5e-5.
lightning_model = LightningModel(model)

In [51]:
# CSV metrics logger rooted at logs/, under the name "my-model".
logger = CSVLogger(name="my-model", save_dir="logs/")

# Keep a single checkpoint: the one with the highest logged "val_acc".
callbacks = [ModelCheckpoint(monitor="val_acc", mode="max", save_top_k=1)]

In [52]:
# Three epochs on a single GPU, logging every 10 steps.
# Mixed precision is left off; passing precision="16-mixed" would enable it.
trainer = L.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=3,
    log_every_n_steps=10,
    logger=logger,
    callbacks=callbacks,
)

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name     | Type                                | Params
-----------------------------------------------------------------
0 | model    | DistilBertF

Sanity Checking: 0it [00:00, ?it/s]

  self.pid = os.fork()


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [53]:
# Evaluate on the training split through a non-shuffling loader: train_loader
# was built with shuffle=True, which is meant for training, not evaluation.
# Batch size and worker count are taken from train_loader so they stay in sync.
train_eval_loader = DataLoader(
    dataset=train_dataset,
    batch_size=train_loader.batch_size,
    shuffle=False,
    num_workers=train_loader.num_workers,
)

trainer.test(lightning_model, dataloaders=train_eval_loader, ckpt_path="best")

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'accuracy': 0.9876857399940491}]

In [54]:
# Score the "best" checkpoint on the validation split.
trainer.test(
    lightning_model,
    dataloaders=val_loader,
    ckpt_path="best",
)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt


Testing: 0it [00:00, ?it/s]

[{'accuracy': 0.9258000254631042}]

In [55]:
# Score the "best" checkpoint on the held-out test split.
trainer.test(
    lightning_model,
    dataloaders=test_loader,
    ckpt_path="best",
)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from checkpoint at logs/my-model/version_0/checkpoints/epoch=1-step=5834.ckpt


Testing: 0it [00:00, ?it/s]

[{'accuracy': 0.9236999750137329}]