<a href="https://colab.research.google.com/github/bogus1aw/text-classification-benchmark/blob/main/M_herBERT_wiki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# HerBERT benchmark for wiki dataset 

In [1]:
# check available GPU
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

name, driver_version, memory.total [MiB]
Tesla T4, 418.67, 15079 MiB


In [2]:
!pip install datasets transformers



In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.is_available()

True

In [4]:
import pandas as pd

def load_corpora_to_dataframe(corpora):
  data = open(corpora).read()
  labels, texts = [], []
  for i, line in enumerate(data.split("\n")):
      content = line.split()
      labels.append(content[0])
      texts.append(" ".join(content[1:]))
  return texts, labels
  # # create a dataframe using texts and labels
  # trainDF = pd.DataFrame()
  # trainDF['text'] = texts
  # trainDF['label'] = labels
  # return trainDF
    
raw_corpora = '/content/drive/MyDrive/master_datasets/wiki_preprocessed/wikiInOneFileDataset.txt'
texts, labels = load_corpora_to_dataframe(raw_corpora)

In [5]:
from sklearn.model_selection import train_test_split
# create test dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=.2)
# create train and validation dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [6]:
print(len(train_texts), ' ', len(train_labels))
print(len(val_texts), ' ', len(val_labels))
print(len(test_texts), ' ', len(test_labels))

4406   4406
1102   1102
1377   1377


In [7]:
import torch
from transformers import HerbertTokenizer, RobertaForSequenceClassification, EvalPrediction

tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = RobertaForSequenceClassification.from_pretrained("allegro/herbert-klej-cased-v1", num_labels=34)

# encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
# outputs = model(encoded_input)

Some weights of the model checkpoint at allegro/herbert-klej-cased-v1 were not used when initializing RobertaForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream

In [8]:
max_length = 200
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length)

from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
train_labels = encoder.fit_transform(train_labels)
val_labels = encoder.fit_transform(val_labels)
test_labels = encoder.fit_transform(test_labels)

In [9]:
# build pyTorch dataset
import torch

class wikiDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = wikiDataset(train_encodings, train_labels)
val_dataset = wikiDataset(val_encodings, val_labels)
test_dataset = wikiDataset(test_encodings, test_labels)

In [11]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy = "epoch",
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

from datasets import load_metric
import numpy as np
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    print(predictions[:10])
    print(labels[:10])
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Runtime,Samples Per Second
1,0.1907,0.318187,0.92559,13.5806,81.145
2,0.2196,0.358241,0.928312,13.7524,80.131
3,0.1352,0.36513,0.930127,13.7805,79.968
4,0.0398,0.352544,0.940109,13.7811,79.964
5,0.006,0.356182,0.940109,13.7624,80.073


[20 26 32 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]
[20 26 32 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]
[29 26  6 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]
[20 26  6 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]
[20 26  6 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]


TrainOutput(global_step=1380, training_loss=0.1745831430390261, metrics={'train_runtime': 973.5816, 'train_samples_per_second': 1.417, 'total_flos': 3301812463368000, 'epoch': 5.0})

In [12]:
trainer.evaluate()

[20 26  6 19  0  1 17  5  0 15]
[29 26 32 19  0  1 17  5  0 15]


{'epoch': 5.0,
 'eval_accuracy': 0.9401088929219601,
 'eval_loss': 0.35254397988319397,
 'eval_runtime': 13.2114,
 'eval_samples_per_second': 83.413}

In [13]:
trainer.predict(test_dataset)

[28 26  9  6 26 33  5  2 13 26]
[28 26  9  6 26 33  5  2 13 26]


PredictionOutput(predictions=array([[-0.5571416 ,  0.18439111,  0.52761453, ...,  0.9496374 ,
        -0.6464733 , -1.6530906 ],
       [-0.55233616,  0.5646693 , -0.24755786, ..., -0.14540587,
        -0.60143787,  0.34126842],
       [-0.46673205,  0.2631073 , -0.6643272 , ...,  0.03545144,
        -0.82061845, -1.1058676 ],
       ...,
       [-0.6099048 ,  9.369973  ,  0.10133141, ..., -0.20009485,
         0.08358958, -0.32795525],
       [ 1.0136505 , -0.6784751 , -0.17293319, ..., -0.9429146 ,
         0.1761758 , -0.723683  ],
       [-0.31450817, -0.48556608, -0.4629359 , ...,  1.8547591 ,
         1.5557394 , -0.12153225]], dtype=float32), label_ids=array([28, 26,  9, ...,  1, 24, 20]), metrics={'eval_loss': 0.34637635946273804, 'eval_accuracy': 0.9310094408133623, 'eval_runtime': 15.6207, 'eval_samples_per_second': 88.152})

In [None]:
# from torch.utils.data import DataLoader
# from transformers import  AdamW

# # device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')

# model.to(device)
# model.train()

# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# optim = AdamW(model.parameters(), lr=5e-5)

# for epoch in range(3):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs[0]
#         loss.backward()
#         optim.step()

# model.eval()