In [2]:
import os
import pandas as pd
import ast
from datasets import Dataset
from transformers import AutoTokenizer ,AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
filename = "TokenCLS_Dataset.csv"
filepath = os.path.abspath(filename)

df = pd.read_csv(filepath)

df["tokens"] = df["tokens"].apply(ast.literal_eval)
df["labels"] = df["labels"].apply(ast.literal_eval)

dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 30000
})

In [None]:
class BERTTokenClassification:
    def __init__(self, modelname,col_name,num_labels):
        self.modelname = modelname
        self.tokens = col_name
        self.num_labels = num_labels
        self.tokenizer = AutoTokenizer.from_pretrained(modelname)

    def encoding_data(self, dataset, batch_size=2000, batched=True):
        def tokenize(batch):
            tokenized_inputs = self.tokenizer(batch["tokens"], is_split_into_words=True, truncation=True, padding='max_length', return_attention_mask=True)
            
            labels = []
            for i, label in enumerate(batch["labels"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                label_ids = []
                previous_word_idx = None
                for word_idx in word_ids:
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        return dataset.map(tokenize, batched=batched, batch_size=batch_size)
    
        
    def train(self, train, test, learning_rate, epochs):

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        model = AutoModelForTokenClassification.from_pretrained(self.modelname, num_labels=self.num_labels).to(device)

        training_args = TrainingArguments(
            output_dir='./results',
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate= learning_rate,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs= epochs,
            weight_decay=0.01,
        )

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = logits.argmax(axis=-1)
            return {"accuracy": accuracy_score(labels, predictions)}

        self.trainer = Trainer(
            model= model,
            args = training_args,
            train_dataset= train,
            eval_dataset= test,
            compute_metrics= compute_metrics
        )

        self.trainer.train()

    def evaluate(self):
        return self.trainer.evaluate()
    
    def save_model(self, path):
        self.trainer.save_model(path)
        self.tokenizer.save_pretrained(path)

In [19]:
bert_token_cls = BERTTokenClassification("distilbert-base-uncased","tokens",7)

tokenized_data = bert_token_cls.encoding_data(dataset, 5000)


Map: 100%|██████████| 30000/30000 [00:08<00:00, 3608.28 examples/s]


In [20]:
split = tokenized_data.train_test_split(test_size=0.2)

trained = bert_token_cls.train(split["train"],split['test'],2e-5,3)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 6.64 MiB is free. Of the allocated memory 2.49 GiB is allocated by PyTorch, and 29.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)