In [None]:
!pip install datasets
!pip install transformers

In [1]:
import sys
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.optim import AdamW

from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset registered under this name; later cells read its
# 'train' and 'test' splits and the 'text' / 'label' columns.
DATASET_NAME = "imdb"
dataset = load_dataset(DATASET_NAME)

# Bare last expression so the notebook displays the loaded object.
dataset

In [81]:
# One checkpoint identifier is shared by the classifier and the tokenizer so
# that the two always come from the same pretrained checkpoint.
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'

cls_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [69]:
class MainData(Dataset):
  """Map-style dataset that tokenizes one text/label pair per lookup.

  `data` must support `data['text']` and `data['label']`, each indexable by
  position. `tokenizer` is called on a single text and must return a mapping
  with 'input_ids' and 'attention_mask' entries.
  """

  def __init__(self, data, tokenizer) -> None:
    super().__init__()
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    """Number of examples, taken from the length of the 'text' column."""
    return len(self.data['text'])

  def __getitem__(self, index):
    """Return `(input_ids, attention_mask, label)` for example `index`.

    The first two entries are handed back exactly as the tokenizer produced
    them; only the label is wrapped in a tensor.
    """
    encoded = self.tokenizer(
        self.data['text'][index],
        padding=True,
        truncation=True,
        max_length=512,
    )
    target = torch.tensor(self.data['label'][index])
    return encoded['input_ids'], encoded['attention_mask'], target

In [70]:
def create_batch(datas):
  """Collate `(input_ids, attention_mask, label)` samples into padded tensors.

  Args:
    datas: non-empty sequence of 3-item samples. The first two items of each
      sample are integer sequences (lists or tensors); the third is a label
      tensor, or None for unlabeled data.

  Returns:
    `(input_ids, attention_mask, labels)`. The first two are `torch.long`
    tensors of shape `(len(datas), longest_sequence)`, right-padded with
    zeros. `labels` is the stacked label tensor, or None when the first
    sample carries no label.
  """
  # Build integer tensors directly. Going through the float32 `torch.Tensor`
  # constructor and casting back to long would round ids above 2**24.
  input_ids = [torch.as_tensor(sample[0], dtype=torch.long) for sample in datas]
  attention_mask = [torch.as_tensor(sample[1], dtype=torch.long) for sample in datas]

  # Only the first sample is inspected to decide whether the batch is labeled.
  if datas[0][2] is not None:
    labels = torch.stack([sample[2] for sample in datas])
  else:
    labels = None

  # pad_sequence pads with 0 by default, so padded positions also get mask 0.
  input_ids_tensors = pad_sequence(input_ids, batch_first=True)
  masks_tensors = pad_sequence(attention_mask, batch_first=True)

  return input_ids_tensors, masks_tensors, labels

In [85]:
def eval(model, data_loader):
  """Compute accuracy and mean per-batch loss of `model` over `data_loader`.

  NOTE: the name shadows Python's built-in `eval`; it is kept because other
  cells call the function by this name.

  Args:
    model: module with at least one parameter, called as
      `model(input_ids, attention_mask, labels=labels)`. Its output must hold
      the loss at index 0 and the logits at index 1.
    data_loader: sized iterable yielding `(input_ids, attention_mask, labels)`
      tensor triples.

  Returns:
    `(accuracy, loss)`: accuracy over all samples, and the loss averaged over
    the number of batches.

  Raises:
    ZeroDivisionError: if `data_loader` contains no batches.
  """
  model.eval()
  # Run on whatever device the model lives on instead of a hard-coded GPU.
  device = next(model.parameters()).device

  total_loss = 0
  predictions = []
  targets = []

  with torch.no_grad():
    for batch in data_loader:
      input_ids, masks, labels = [t.to(device) for t in batch]

      # A single labelled forward pass yields both the loss and the logits,
      # so no second, unlabelled pass is needed.
      output = model(input_ids, masks, labels=labels)
      total_loss += output[0].item()

      predictions.append(output[1].argmax(dim=1).cpu())
      targets.append(labels.cpu())

  mean_loss = total_loss / len(data_loader)
  # Concatenate once at the end rather than growing a tensor every batch.
  acc = accuracy_score(torch.cat(targets), torch.cat(predictions))
  return acc, mean_loss

In [86]:
# Sizes of the random subsets drawn from the 'train' and 'test' splits.
train_count = 2500
val_count = 1000

# Fixed seed so the same subsets are drawn on every run.
torch.random.manual_seed(16)
# randperm yields distinct indices. randint samples with replacement and
# could place the same example in a subset more than once.
train_index = torch.randperm(len(dataset['train']))[:train_count]
val_index = torch.randperm(len(dataset['test']))[:val_count]

SequenceClassifierOutput(loss=tensor(0.6399, grad_fn=<NllLossBackward0>), logits=tensor([[0.4549, 0.7538],
        [0.2158, 0.5265],
        [0.0066, 0.4286]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [65]:
# Wrap the sampled rows of each split; MainData tokenizes per lookup.
train_dataset = MainData(dataset['train'][train_index], tokenizer)
val_dataset = MainData(dataset['test'][val_index], tokenizer)

# Single batch size shared by both loaders.
BATCH_SIZE = 16

# Reshuffle the training batches every epoch so the optimizer does not see the
# same batch composition each time. Validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=create_batch)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=create_batch)

In [None]:
# NOTE: the device is fixed, so this cell requires a CUDA-capable GPU.
device = 'cuda:0'
cls_model = cls_model.to(device)

optimizer = AdamW(cls_model.parameters(), lr=4e-6)

# Per-epoch history, consumed by the plotting cells further down.
Train_loss = []
Val_loss = []
Train_acc = []
Val_acc = []

EPOCH = 5
for epoch in range(EPOCH):
  # eval() leaves the model in eval mode, so switch back once per epoch
  # rather than on every batch.
  cls_model.train()

  for step, data in enumerate(train_loader, start=1):
    input_ids, marks, label = [t.to(device) for t in data]

    optimizer.zero_grad()

    # With labels supplied, the loss is the first element of the output.
    output = cls_model(input_ids, marks, labels=label)
    batch_loss = output[0]
    batch_loss.backward()
    optimizer.step()

    # '\r' rewrites the same line so progress updates in place.
    stats = 'Epoch [%d/%d], Step [%d/%d], Batch-Loss: %.4f' % (epoch+1, EPOCH, step, len(train_loader), batch_loss.item())
    print('\r' + stats, end="", flush=True)

  # Score both subsets after each epoch.
  train_acc, train_loss = eval(cls_model, train_loader)
  val_acc, val_loss = eval(cls_model, val_loader)
  Train_loss.append(train_loss)
  Val_loss.append(val_loss)
  Train_acc.append(train_acc)
  Val_acc.append(val_acc)

  # Terminate the in-place progress line and report the epoch metrics.
  print('\nEpoch [%d/%d] Train-Loss: %.4f, Train-Acc: %.4f, Val-Loss: %.4f, Val-Acc: %.4f'
        % (epoch+1, EPOCH, train_loss, train_acc, val_loss, val_acc))

In [None]:
# Per-epoch loss curves for the training and validation subsets.
plt.figure(figsize=(10,5), dpi=100, linewidth = 2)

# (values, line/marker format, colour, legend label) for each curve.
loss_curves = (
    (Train_loss, 's-', 'r', "Train-Loss"),
    (Val_loss, 'o-', 'g', "Val-Loss"),
)
for values, fmt, colour, legend_name in loss_curves:
    plt.plot(values, fmt, color=colour, label=legend_name)

plt.xlabel("epoch", fontsize=15, labelpad = 15)
plt.ylabel("loss", fontsize=15, labelpad = 20)
plt.legend(loc = "best", fontsize=10)
plt.show()

In [None]:
# Per-epoch accuracy curves for the training and validation subsets.
plt.figure(figsize=(10,5), dpi=100, linewidth = 2)
plt.plot(Train_acc, 's-', color='r', label="Train-Acc")
# Plot the history list `Val_acc`, not `val_acc`, which only holds the last
# epoch's scalar accuracy.
plt.plot(Val_acc, 'o-', color='g', label="Val-Acc")
plt.xlabel("epoch", fontsize=15, labelpad = 15)
plt.ylabel("accuracy", fontsize=15, labelpad = 20)
plt.legend(loc = "best", fontsize=10)
plt.show()