In [None]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset
import torch

import dataHandler as dh

In [2]:
# Dataset id from huggingface.co/dataset
dataset_id = "argilla/synthetic-domain-text-classification"
 
# Load raw dataset
train_dataset = load_dataset(dataset_id, split='train')

split_dataset = train_dataset.train_test_split(test_size=0.1)
split_dataset['train'][0]
# {'text': 'Recently, there has been an increase in property values within the suburban areas of several cities due to improvements in infrastructure and lifestyle amenities such as parks, retail stores, and educational institutions nearby. Additionally, new housing developments are emerging, catering to different family needs with varying sizes and price ranges. These changes have influenced investment decisions for many looking to buy or sell properties.', 'label': 14}


{'text': 'The impact of regular exercise on mental health has been well-documented in numerous studies. Engaging in physical activities such as running, yoga, and team sports can significantly reduce symptoms of depression and anxiety. Furthermore, exercise promotes the release of endorphins which are chemicals in the brain that act as natural mood lifters. Regular workouts also help maintain a healthy weight, strengthen muscles and bones, and improve cardiovascular health. Additionally, exercising with friends or joining clubs can enhance social interactions and community bonds.',
 'label': 17}

In [3]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 900
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

In [4]:
split_dataset = toxigenDataset('black')

In [5]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 15902
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3976
    })
})

In [6]:
from transformers import AutoTokenizer
 
# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
 
# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True, return_tensors="pt", max_length=140)
 
# Tokenize dataset
if "label" in split_dataset["train"].features.keys():
    split_dataset =  split_dataset.rename_column("label", "labels") # to match Trainer
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text"])
 
tokenized_dataset["train"].features.keys()
# dict_keys(['labels', 'input_ids', 'attention_mask'])


Map: 100%|██████████| 15902/15902 [00:00<00:00, 16509.33 examples/s]
Map: 100%|██████████| 3976/3976 [00:00<00:00, 18026.54 examples/s]


dict_keys(['labels', 'input_ids', 'attention_mask'])

In [7]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15902
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 3976
    })
})

In [8]:
from transformers import AutoModelForSequenceClassification
 
# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"
 
# Prepare model labels - useful for inference
labels = ["no hate", "hate"]
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
 
# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
label2id

{'no hate': '0', 'hate': '1'}

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [11]:
torch.cuda.is_available()

True

In [12]:
print(torch.version.cuda)

12.6


In [13]:
import numpy as np
from sklearn.metrics import f1_score
 
# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}


In [14]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments
 
# Define training args
training_args = TrainingArguments(
    output_dir= "ModernBERT-domain-classifier",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=3,
    bf16=True, # bfloat16 training 
    optim="adamw_torch_fused", # improved optimizer 
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
    # push to hub parameters
    # push_to_hub=True,
    # hub_strategy="every_save",
    # hub_token=HfFolder.get_token(),
)
 
# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()
# {'train_runtime': 3642.7783, 'train_samples_per_second': 1.235, 'train_steps_per_second': 0.04, 'train_loss': 0.535627057634551, 'epoch': 5.0}


Epoch,Training Loss,Validation Loss,F1
1,0.2487,0.213493,0.923107
2,0.1055,0.227431,0.926186
3,0.0227,0.454019,0.925548


TrainOutput(global_step=1491, training_loss=0.14156248947664204, metrics={'train_runtime': 234.3896, 'train_samples_per_second': 203.533, 'train_steps_per_second': 6.361, 'total_flos': 4445052992745120.0, 'train_loss': 0.14156248947664204, 'epoch': 3.0})

In [18]:
model.save_pretrained("ModernBERT-domain-classifier_local")
tokenizer.save_pretrained("ModernBERT-domain-classifier_local_tokenizer")

('ModernBERT-domain-classifier_local_tokenizer\\tokenizer_config.json',
 'ModernBERT-domain-classifier_local_tokenizer\\special_tokens_map.json',
 'ModernBERT-domain-classifier_local_tokenizer\\tokenizer.json')

In [4]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
 
# load model from huggingface.co/models using our repository id
classifier = pipeline(
    task="text-classification", 
    tokenizer = AutoTokenizer.from_pretrained("ModernBERT-domain-classifier_local_tokenizer"),
    model=AutoModelForSequenceClassification.from_pretrained("ModernBERT-domain-classifier_local"),
    device=0 if torch.cuda.is_available() else -1
)
 



Device set to use cuda:0


In [6]:
sample = "you are a destroying stereotypes"
 
classifier(sample)

[{'label': 'no hate', 'score': 0.9976562261581421}]