In [None]:
from transformers import (
    logging, set_seed, DataCollatorWithPadding, Trainer, TrainerCallback, TrainerState, TrainerControl,
    TrainingArguments, DistilBertTokenizer, DistilBertConfig, DistilBertForSequenceClassification
)
import pandas as pd
import matplotlib.pyplot as plt
import torch
import os
from datasets import Dataset, DatasetDict
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, roc_auc_score, recall_score, accuracy_score
import numpy as np
import torch
from typing import Dict, Any


# Choose the compute device: prefer Apple MPS, then CUDA, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Set the Hugging Face tokenizers parallelism switch to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Only let transformers emit error-level log messages.
logging.set_verbosity_error()

class Args:
    """Plain container holding the settings of a fine-tuning run.

    Creating an instance also loads the DistilBERT tokenizer named by
    ``model_name_or_path`` through ``DistilBertTokenizer.from_pretrained``.
    """

    def __init__(self):
        # --- model / tokenizer -------------------------------------------
        self.model_name_or_path = "distilbert-base-uncased"
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name_or_path)
        self.max_seq_length = 512
        # NOTE(review): dropout_rate is not read by the code visible in this
        # file -- confirm whether it is meant to be applied to the model.
        self.dropout_rate = 0.4

        # --- optimisation -------------------------------------------------
        self.learning_rate = 2e-5
        self.weight_decay = 0.1
        self.max_grad_norm = 1.0
        self.gradient_accumulation_steps = 1
        self.num_epochs = 2
        self.per_gpu_batch_size = 16

        # --- learning-rate schedule --------------------------------------
        self.lr_scheduler_type = "linear"
        self.warmup_ratio = 0
        # NOTE(review): num_warmup_steps is not read by the code visible in
        # this file; only warmup_ratio is forwarded to TrainingArguments.
        self.num_warmup_steps = 0

        # --- bookkeeping: logging, evaluation, checkpoints ---------------
        self.logging_steps = 10
        self.eval_steps = 100
        self.save_steps = 100
        self.output_dir = "./results"
        self.seed = 42

        # --- Hugging Face Hub --------------------------------------------
        self.push_to_hub = True
        self.model_hub_name = "a-scarlett/NLCvsBash"

def compute_metrics(eval_pred):
    """Compute weighted F1, accuracy, weighted recall and ROC AUC for one evaluation.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` must be a
            2-D array of per-class scores with one column per class (the
            Trainer is expected to pass model logits here); ``labels`` holds
            the integer class ids.

    Returns:
        Dict with the keys ``"f1"``, ``"accuracy"``, ``"recall"`` and
        ``"roc_auc"``.

    Raises:
        ValueError: If ``predictions`` is not 2-D or has fewer than two
            columns, since neither argmax nor ROC AUC is meaningful then.
    """
    predictions, labels = eval_pred
    # float64 keeps the softmax rows summing to 1 within scikit-learn's tolerance.
    predictions = np.asarray(predictions, dtype=np.float64)
    labels = np.asarray(labels)

    if predictions.ndim != 2 or predictions.shape[1] < 2:
        raise ValueError(
            "compute_metrics expects 2-D predictions with at least two class "
            f"columns, got shape {predictions.shape}"
        )

    num_classes = predictions.shape[1]
    preds = np.argmax(predictions, axis=1)

    # Turn raw scores into probabilities (numerically stable softmax): the
    # multiclass ROC AUC needs rows that sum to 1, and for two classes the
    # positive-class probability ranks samples by the logit margin.
    shifted = predictions - predictions.max(axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(axis=1, keepdims=True)

    if num_classes > 2:
        roc_auc = roc_auc_score(
            labels,
            probabilities,
            multi_class='ovr',
            average='weighted',
            labels=np.arange(num_classes),
        )
    else:
        # Binary case: score is the probability of class id 1.
        roc_auc = roc_auc_score(labels, probabilities[:, 1])

    accuracy = accuracy_score(labels, preds)
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {
        "f1": f1,
        "accuracy": accuracy,
        "recall": recall,
        "roc_auc": roc_auc
    }

def preprocess_function(examples, tokenizer, max_seq_length):
    """Tokenize the ``'value'`` texts of a batch and attach its ``'target'`` values as labels.

    Args:
        examples: Mapping with a ``'value'`` entry (texts to tokenize) and a
            ``'target'`` entry (labels).
        tokenizer: Callable invoked with the texts and the keyword arguments
            ``padding``, ``truncation`` and ``max_length``.
        max_seq_length: Length every sequence is padded or truncated to.

    Returns:
        The object returned by ``tokenizer`` with an added ``'labels'`` entry.
    """
    encoded = tokenizer(
        examples['value'],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length,
    )
    # Store the targets under the 'labels' key alongside the encoded inputs.
    encoded['labels'] = examples['target']
    return encoded

def load_csv_as_dataset(train_path, test_path):
    """Read two CSV files and wrap them in a ``DatasetDict``.

    No column validation happens here; ``preprocess_function`` in this file
    later reads the ``'value'`` and ``'target'`` columns.

    Args:
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        ``DatasetDict`` with the splits ``'train'`` and ``'test'``.
    """
    # Read both files first, then convert each frame to a Dataset.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    train_split, test_split = (Dataset.from_pandas(frame) for frame in frames)
    return DatasetDict({'train': train_split, 'test': test_split})

class CustomCallback(TrainerCallback):
    """Sample predictions from the first evaluation batch and plot them after training.

    After each evaluation, up to ten examples of the first batch of the
    evaluation dataloader are run through the model; their decoded text,
    predicted class, label and the current epoch are appended to parallel
    lists. When training ends, labels and predictions are plotted against
    the recorded epochs.
    """

    def __init__(self, tokenizer):
        # Tokenizer used to decode input ids back into text.
        self.tokenizer = tokenizer
        # Parallel lists: index i of each list describes the same sample.
        self.epochs = []
        self.texts = []
        self.predictions = []
        self.labels = []

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Record up to ten predictions from the first evaluation batch."""
        if not state.is_world_process_zero:
            return

        eval_dataloader = kwargs["eval_dataloader"]
        model = kwargs["model"]

        # Only the first batch is inspected; an empty dataloader records nothing.
        first_batch = next(iter(eval_dataloader), None)
        if first_batch is None:
            return

        target_device = model.device
        input_ids = first_batch['input_ids'].to(target_device)
        mask = first_batch['attention_mask'].to(target_device)
        gold = first_batch['labels'].cpu().numpy()

        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=mask).logits
            predicted = torch.argmax(logits, dim=-1).cpu().numpy()

        texts = self.tokenizer.batch_decode(input_ids, skip_special_tokens=True)

        # Keep at most ten samples of the batch.
        keep = min(10, len(predicted))
        self.epochs.extend([state.epoch] * keep)
        self.texts.extend(texts[:keep])
        self.predictions.extend(predicted[:keep])
        self.labels.extend(gold[:keep])

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Plot the recorded labels and predictions against their epochs."""
        figure, axes = plt.subplots()
        axes.plot(self.epochs, self.labels, 'bo-', label='Labels')
        axes.plot(self.epochs, self.predictions, 'ro-', label='Predictions')
        axes.set_xlabel('Epoch')
        axes.set_ylabel('Label/Predictions')
        axes.legend(loc='upper left')
        plt.show()

class MetricsLoggerCallback(TrainerCallback):
    """Collect the metrics of every evaluation and plot them when training ends.

    The Trainer hands the computed metrics to ``on_evaluate`` through the
    ``metrics`` keyword argument, with keys carrying the evaluation prefix
    (``eval_f1``, ``eval_accuracy``, ...). This callback stores the tracked
    ones under their bare names in ``self.metrics`` and the matching epoch
    in ``self.epochs``.
    """

    # Metric names tracked and plotted by this callback.
    _TRACKED = ("f1", "accuracy", "recall", "roc_auc")

    def __init__(self):
        # One dict per recorded evaluation, keyed by the names in _TRACKED.
        self.metrics = []
        # Epoch (possibly fractional) at which each entry of `metrics` was recorded.
        self.epochs = []

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Store the tracked metrics of the evaluation that just finished."""
        if not state.is_world_process_zero:
            return

        reported = kwargs.get("metrics")
        if not reported:
            return

        record = {}
        for name in self._TRACKED:
            # Prefer the prefixed key; fall back to the bare name.
            for key in (f"eval_{name}", name):
                if key in reported:
                    record[name] = reported[key]
                    break

        if not record:
            # Nothing this callback tracks was reported for this evaluation.
            return

        # Keep every record complete so the plotted series stay aligned.
        for name in self._TRACKED:
            record.setdefault(name, float("nan"))

        self.metrics.append(record)
        # state.epoch is None for an evaluation run before training starts.
        self.epochs.append(state.epoch if state.epoch is not None else 0.0)

    def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Plot every tracked metric against the epoch it was recorded at."""
        if not state.is_world_process_zero or not self.metrics:
            return

        f1_scores = [metric['f1'] for metric in self.metrics]
        accuracies = [metric['accuracy'] for metric in self.metrics]
        recalls = [metric['recall'] for metric in self.metrics]
        roc_aucs = [metric['roc_auc'] for metric in self.metrics]

        plt.figure()
        plt.plot(self.epochs, f1_scores, label='F1 Score', marker='o')
        plt.plot(self.epochs, accuracies, label='Accuracy', marker='x')
        plt.plot(self.epochs, recalls, label='Recall', marker='s')
        plt.plot(self.epochs, roc_aucs, label='ROC AUC', marker='d')
        plt.xlabel('Epoch')
        plt.ylabel('Metric Score')
        plt.legend()
        plt.title('Evaluation Metrics Over Epochs')
        plt.grid()
        plt.show()


def main(train_csv_path="data/train.csv", test_csv_path="data/test.csv"):
    """Fine-tune DistilBERT for two-class sequence classification on CSV data.

    Loads the two CSV files, tokenizes them, evaluates the untrained
    classifier once, trains it with the Hugging Face ``Trainer`` and, when
    ``Args.push_to_hub`` is set, pushes the model to the Hub.

    Args:
        train_csv_path: Path of the training CSV.
        test_csv_path: Path of the CSV used for evaluation.
    """
    args = Args()
    set_seed(args.seed)

    dataset = load_csv_as_dataset(train_csv_path, test_csv_path)

    # Args already loaded the tokenizer for this checkpoint; reuse it rather
    # than loading the same files a second time.
    tokenizer = args.tokenizer

    tokenized_dataset = dataset.map(
        lambda x: preprocess_function(x, tokenizer, args.max_seq_length),
        batched=True
    )

    model = DistilBertForSequenceClassification.from_pretrained(args.model_name_or_path, num_labels=2).to(device)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_ratio=args.warmup_ratio,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=args.eval_steps,
        save_steps=args.save_steps,
        logging_steps=args.logging_steps,
        per_device_train_batch_size=args.per_gpu_batch_size,
        per_device_eval_batch_size=args.per_gpu_batch_size,
        num_train_epochs=args.num_epochs,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        weight_decay=args.weight_decay,
        save_total_limit=2,
        metric_for_best_model='f1',
        load_best_model_at_end=True,
        run_name="NLC2CMD",
        # report_to="wandb",
        max_grad_norm=args.max_grad_norm,
        # Forward the configured seed so the Trainer uses the same one as set_seed above.
        seed=args.seed,
    )

    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class` over `tokenizer` here; kept as-is so the script
    # still runs on the version it was written for -- confirm before changing.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[MetricsLoggerCallback(), CustomCallback(tokenizer)]
    )

    # Baseline evaluation of the model before any training step.
    trainer.evaluate()
    print("Training...")
    trainer.train()

    if args.push_to_hub:
        model.push_to_hub(args.model_hub_name)

if __name__ == "__main__":
    main()

Map:   0%|          | 0/11528 [00:00<?, ? examples/s]

Map:   0%|          | 0/5133 [00:00<?, ? examples/s]

  trainer = Trainer(


{'eval_loss': 0.6507952213287354, 'eval_model_preparation_time': 0.0006, 'eval_f1': 0.8597473736074243, 'eval_accuracy': 0.8988895382817066, 'eval_recall': 0.8988895382817066, 'eval_roc_auc': 0.7268220261731206, 'eval_runtime': 209.0813, 'eval_samples_per_second': 24.55, 'eval_steps_per_second': 1.535}
Training...
{'loss': 0.4148, 'grad_norm': 1.8826617002487183, 'learning_rate': 1.9861303744798893e-05, 'epoch': 0.013869625520110958}
{'loss': 0.3033, 'grad_norm': 1.9901610612869263, 'learning_rate': 1.9722607489597782e-05, 'epoch': 0.027739251040221916}
{'loss': 0.1992, 'grad_norm': 1.9535813331604004, 'learning_rate': 1.9583911234396674e-05, 'epoch': 0.04160887656033287}
{'loss': 0.122, 'grad_norm': 1.777485728263855, 'learning_rate': 1.9445214979195562e-05, 'epoch': 0.05547850208044383}
{'loss': 0.1498, 'grad_norm': 0.4365811347961426, 'learning_rate': 1.9306518723994454e-05, 'epoch': 0.06934812760055478}
{'loss': 0.0617, 'grad_norm': 0.14014537632465363, 'learning_rate': 1.916782246

Как можно увидеть, уже спустя 1 эпоху все eval-метрики больше 0.99.