In [1]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.iterable_dataset import IterableDataset

# Dataset id from huggingface.co/dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_id = "/data/home/wangys/DataSelection-IF/datasets/synthetic"

# Load raw dataset
train_dataset = load_dataset(dataset_id, split='train')

split_dataset = train_dataset.train_test_split(test_size=0.1)
split_dataset['train'][5:7]

{'text': ['The text delves into the intricate storytelling mechanisms of video games across various genres. It explores how developers use narrative techniques such as branching storylines and non-linear progression to engage players deeply in their gaming experience. The analysis includes a comparison with traditional forms of storytelling, like novels and films, highlighting unique features found only in interactive media.',
  'In recent years, there has been a substantial shift in the way we communicate and access information, driven largely by advancements in internet technology and telecommunications. These improvements have fundamentally changed how we interact with each other and with various forms of media, transforming daily activities such as shopping, banking, education, entertainment, and more into experiences that are increasingly digital-centric. This trend is not merely a product of consumer demand but also reflects significant innovations from leading tech companies who

In [3]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "../../model/ModernBERT-base/"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['text'], truncation=True,padding=True, return_tensors="pt")

# Tokenize dataset
if "label" in split_dataset["train"].features.keys():
    split_dataset =  split_dataset.rename_column("label", "labels") # to match Trainer
tokenized_dataset = split_dataset.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/900 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 900/900 [00:00<00:00, 4268.76 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 4397.88 examples/s]


In [4]:
from transformers import AutoModelForSequenceClassification

# Model id to load the tokenizer
# model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference
labels = tokenized_dataset["train"].features["labels"].names
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
)

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in ModernBertForSequenceClassification is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at ../../model/ModernBERT-base/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this

In [5]:
from transformers import Trainer, TrainingArguments
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Define training args
training_args = TrainingArguments(
    output_dir= "ModernBERT-domain-classifier",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    learning_rate=5e-5,
    num_train_epochs=5,
    bf16=False, # bfloat16 training
    optim="adamw_torch_fused", # improved optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    eval_strategy="epoch",
    save_strategy="no",
    save_total_limit=2,
    # load_best_model_at_end=True,
    report_to='none'
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"]
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.511188
2,No log,0.721274
3,0.704200,0.400104
4,0.704200,0.49261
5,0.058800,0.48144


TrainOutput(global_step=1125, training_loss=0.34063020186954074, metrics={'train_runtime': 99.4246, 'train_samples_per_second': 45.26, 'train_steps_per_second': 11.315, 'total_flos': 1120294678932000.0, 'train_loss': 0.34063020186954074, 'epoch': 5.0})

In [19]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
})

In [20]:
import numpy as np
predictions = trainer.predict(tokenized_dataset["test"])

# Process the prediction results (predictions, label_ids, metrics)
predicted_labels = np.argmax(predictions.predictions, axis=1)
for i in range(100):
    predicted_label = id2label[str(predicted_labels[i])]
    example_data = split_dataset['test']


    print(f"Predicted Label: {predicted_label}")
    print(f"Actual Label: {id2label[str(example_data[i]['labels'])]}")

Predicted Label: business-and-industrial
Actual Label: arts-and-entertainment
Predicted Label: business-and-industrial
Actual Label: computers-and-electronics
Predicted Label: business-and-industrial
Actual Label: finance
Predicted Label: business-and-industrial
Actual Label: real-estate
Predicted Label: business-and-industrial
Actual Label: sports
Predicted Label: business-and-industrial
Actual Label: food-and-drink
Predicted Label: business-and-industrial
Actual Label: people-and-society
Predicted Label: business-and-industrial
Actual Label: games
Predicted Label: business-and-industrial
Actual Label: pets-and-animals
Predicted Label: business-and-industrial
Actual Label: real-estate
Predicted Label: business-and-industrial
Actual Label: real-estate
Predicted Label: business-and-industrial
Actual Label: pets-and-animals
Predicted Label: business-and-industrial
Actual Label: internet-and-telecom
Predicted Label: business-and-industrial
Actual Label: autos-and-vehicles
Predicted Label:

In [16]:
len(predictions[1])

100