In [None]:
import transformers

In [None]:
# Fine-tune Hugging Face encoder models to build a classification model for text data (uses a hand-written PyTorch training loop, not the `Trainer` API)

In [None]:
#Dataset class
#Model
#Trainer - training arguments

In [None]:
pip install torch torchvision torchaudio transformers datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import get_scheduler
from tqdm.auto import tqdm


# **Loading and Splitting the IMDB dataset**

In [None]:
# Load the IMDB dataset through Hugging Face `datasets`. The returned object is
# indexed by split name ("train" / "test") in the next cell.
dataset = load_dataset("imdb")

In [None]:
# Pull out the two splits used below: one for fine-tuning, one for evaluation.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [None]:
# Load the tokenizer published with the bert-base-cased checkpoint; it is the
# tokenizer that tokenize_function uses to encode the dataset.
# NOTE(review): presumably only valid for checkpoints that share this
# vocabulary — confirm before reusing its output for other model families.

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def tokenize_function(examples):
  """Encode the ``"text"`` entries of ``examples`` with the module-level ``tokenizer``.

  Padding uses the ``"max_length"`` strategy and truncation is enabled.
  Returns whatever the tokenizer call returns for the batch.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, padding="max_length", truncation=True)
  return encoded

In [None]:
def _encode_split(split):
  """Tokenize one split, expose it as torch tensors and rename `label` to `labels`."""
  encoded = split.map(tokenize_function, batched=True)
  # Only these columns are returned (as PyTorch tensors) when indexing the split.
  encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
  # "labels" is the key the training / evaluation code reads from each batch.
  return encoded.rename_column("label", "labels")


tokenized_train = _encode_split(train_dataset)
tokenized_test = _encode_split(test_dataset)

# Only the training loader is shuffled; both use batches of 8.
train_dataloader = DataLoader(tokenized_train, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(tokenized_test, batch_size=8)

# **Defining Encoder Models**

In [None]:
# Checkpoints that are fine-tuned and compared in the training cell.
model_names = ['bert-base-cased','distilbert-base-cased','roberta-base']


def _load_classifier(checkpoint):
  """Load `checkpoint` as a sequence-classification model with two labels."""
  return AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


# All checkpoints are instantiated up front, in the same order as `model_names`.
models = list(map(_load_classifier, model_names))

# **Training and Evaluation**

In [None]:
def train_epoch(model,dataloader,optimizer,lr_scheduler, device):
  """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

  Args:
    model: module called as ``model(**batch)``; its output must expose ``.loss``.
    dataloader: sized iterable of dict batches whose values support ``.to(device)``.
    optimizer: optimizer stepped once per batch.
    lr_scheduler: scheduler stepped once per batch, right after the optimizer.
    device: device each batch value is moved to before the forward pass.

  Returns:
    Sum of the per-batch losses divided by ``len(dataloader)``.
  """
  model.train()
  batch_losses = []

  for raw_batch in dataloader:
    inputs = {key: value.to(device) for key, value in raw_batch.items()}
    loss = model(**inputs).loss
    batch_losses.append(loss.item())

    # Backward pass, parameter update, schedule update, then clear the
    # gradients so they do not accumulate into the next batch.
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

  return sum(batch_losses)/len(dataloader)

In [None]:
def evaluate(model, dataloader, device):
  """Evaluate ``model`` on ``dataloader`` with gradients disabled.

  Args:
    model: module called as ``model(**batch)``; its output must expose
      ``.loss`` and ``.logits``.
    dataloader: iterable of dict batches whose values support ``.to(device)``;
      every batch must contain a ``"labels"`` entry.
    device: device each batch value is moved to before the forward pass.

  Returns:
    ``(mean_loss, accuracy)``, both averaged over individual examples rather
    than over batches, so a smaller final batch is not over-weighted.

  Raises:
    ValueError: if the dataloader yields no examples.
  """
  model.eval()
  weighted_loss = 0.0
  correct = 0
  total = 0

  with torch.no_grad():
    for batch in dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      batch_size = len(batch["labels"])

      # Assumes outputs.loss is the mean over the batch (TODO confirm for
      # custom models); scaling by the batch size turns it back into a sum.
      weighted_loss += outputs.loss.item() * batch_size

      # Predicted class = index of the largest logit along the last axis.
      predictions = outputs.logits.argmax(dim=-1)
      correct += (predictions == batch["labels"]).sum().item()
      total += batch_size

  if total == 0:
    raise ValueError("evaluate() received a dataloader that yielded no examples")

  return weighted_loss / total, correct / total

# **Training the Models**

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 3


def build_dataloaders_for(model_name, batch_size=8):
  """Build (train, eval) DataLoaders tokenized with `model_name`'s own tokenizer.

  Token ids are only meaningful to the checkpoint whose tokenizer produced
  them, so each model family (BERT, DistilBERT, RoBERTa, ...) needs its own
  encoding of `train_dataset` / `test_dataset` instead of sharing one.

  Args:
    model_name: checkpoint name passed to `AutoTokenizer.from_pretrained`.
    batch_size: batch size used for both loaders.

  Returns:
    `(train_loader, eval_loader)`; only the training loader is shuffled.
  """
  model_tokenizer = AutoTokenizer.from_pretrained(model_name)

  def encode(examples):
    return model_tokenizer(examples["text"], padding="max_length", truncation=True)

  def to_loader(split, shuffle):
    encoded = split.map(encode, batched=True)
    encoded = encoded.rename_column("label", "labels")
    # Keep only the columns the models consume, returned as torch tensors.
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return DataLoader(encoded, shuffle=shuffle, batch_size=batch_size)

  return to_loader(train_dataset, True), to_loader(test_dataset, False)


for model_name, model in zip(model_names, models):
  print(f"Model: {model_name}")

  # Separate names so the module-level train_dataloader / eval_dataloader
  # built earlier are left untouched.
  model_train_dataloader, model_eval_dataloader = build_dataloaders_for(model_name)

  model.to(device)

  # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
  # weight_decay are set explicitly to mirror that implementation's defaults.
  optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
  num_training_steps = num_epochs * len(model_train_dataloader)
  # Linear decay of the learning rate over all optimisation steps, no warm-up.
  lr_scheduler = get_scheduler(
      name="linear",optimizer=optimizer,num_warmup_steps=0,num_training_steps=num_training_steps
  )

  for epoch in range(num_epochs):
    train_loss = train_epoch(model,model_train_dataloader,optimizer,lr_scheduler,device)
    test_loss, test_accuracy = evaluate(model, model_eval_dataloader,device)

    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss: .4f}")
    print(f"Test Loss: {test_loss: .4f}")
    print(f"Test Accuracy: {test_accuracy: .4f}")



Epoch 1
Train Loss:  0.1916
Test Loss:  0.2119
Test Accuracy:  0.9192
