In [None]:
!pip install -q transformers[torch] datasets pysentimiento accelerate evaluate optuna
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding
from pysentimiento.preprocessing import preprocess_tweet
import optuna
import torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
def get_metrics(predictions, labels):
    """Score binary predictions against references using the first column only.

    Args:
        predictions: 2-D array indexable as ``[:, 0]``; column 0 holds the
            predicted class for each example.
        labels: 2-D array indexable as ``[:, 0]``; column 0 holds the
            reference class for each example.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall`` computed with
        ``average='binary'``; undefined ratios are reported as 0.
    """
    # Only the first column is scored; any other columns are ignored.
    reference = labels[:, 0]
    predicted = predictions[:, 0]

    prec, rec, f_score, _support = precision_recall_fscore_support(
        reference,
        predicted,
        average='binary',
        zero_division=0,
    )

    scores = {
        'accuracy': accuracy_score(reference, predicted),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }
    return scores

In [None]:
def compute_metrics(predictions):
    """Turn raw model outputs into binary decisions and score them.

    Args:
        predictions: object exposing ``predictions`` (model outputs) and
            ``label_ids`` (reference labels), as passed by the ``Trainer``
            to its ``compute_metrics`` callback.

    Returns:
        dict: the metrics produced by ``get_metrics``.
    """
    # Outputs are thresholded at zero (presumably raw logits, where 0 maps to
    # a sigmoid probability of 0.5 -- confirm against the model head).
    thresholded = predictions.predictions > 0
    return get_metrics(thresholded, predictions.label_ids)

In [None]:
def custom_tokenizer(examples):
    """Tokenize a batch of (comment, title) pairs with the module-level tokenizer.

    Args:
        examples: mapping with ``"comment"`` and ``"title"`` entries; the two
            are passed to the tokenizer as first and second sequence.

    Returns:
        The tokenizer output for the batch, padded and truncated to the
        tokenizer's ``model_max_length``.
    """
    first_segment = examples["comment"]
    second_segment = examples["title"]

    # NOTE(review): padding=True pads within each call of this function; the
    # DataCollatorWithPadding used at training time pads again per batch.
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": tokenizer.model_max_length,
    }
    return tokenizer(first_segment, second_segment, **encode_options)

In [None]:
# Identifier of the dataset passed to `load_dataset`. The recorded cell output
# shows train / validation / test splits of 3005 / 438 / 851 examples.
database_checkpoint = "amaiaruvi/news_racist_comments_spanish"
dataset = load_dataset(database_checkpoint)
# Pretrained checkpoint name ("modelo" is Spanish for "model"); the tokenizer
# and the sequence-classification model are both loaded from it.
modelo = "pysentimiento/robertuito-hate-speech"
tokenizer = AutoTokenizer.from_pretrained(modelo)
model = AutoModelForSequenceClassification.from_pretrained(modelo)

Downloading readme:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/406k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3005 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/438 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/851 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/435M [00:00<?, ?B/s]

In [None]:
def _clean_and_label(ex):
    """Normalize the text fields of one example and build its label vector.

    The comment and title are cleaned with ``preprocess_tweet`` (Spanish
    settings). The label is a float vector of length 3 whose first entry is
    the example's ``racist`` value and whose remaining entries are 0
    (presumably to match a three-output classification head -- confirm).
    """
    cleaned_comment = preprocess_tweet(ex["comment"], lang="es")
    cleaned_title = preprocess_tweet(ex["title"], lang="es")
    label_vector = torch.tensor([ex["racist"], 0, 0], dtype=torch.float)
    return {
        "comment": cleaned_comment,
        "title": cleaned_title,
        "labels": label_vector,
    }


# Applied example by example (non-batched map) to every split of the dataset.
preprocessed_data = dataset.map(_clean_and_label)

Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [None]:
# Tokenize every split in batches, then drop the raw columns that are no
# longer needed once the token ids and labels are present.
encoded_data = (
    preprocessed_data
    .map(custom_tokenizer, batched=True)
    .remove_columns(['link', 'title', 'comment', 'racist'])
)

Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [None]:
def find_hyperparameters_robertuito_hate_speech(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return validation F1.

    Each call samples a configuration from ``trial``, fine-tunes a freshly
    loaded copy of the ``modelo`` checkpoint on ``encoded_data['train']`` and
    evaluates it on ``encoded_data['validation']``.

    Args:
        trial: Optuna trial used to sample ``epochs``, ``batch_size``,
            ``learning_rate``, ``weight_decay`` and ``warmup_steps_proportion``.

    Returns:
        The ``eval_f1`` value reported by ``trainer.evaluate`` on the
        validation split (higher is better; the study must maximize it).
    """
    epochs = trial.suggest_int("epochs", 3, 10)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    warmup_steps_proportion = trial.suggest_float("warmup_steps_proportion", 0.0, 0.3)

    # Warmup is expressed in optimizer steps (batches), not in examples, so
    # the number of examples must be divided by the batch size (rounded up,
    # since the last partial batch still counts as a step).
    # Assumes a single device and no gradient accumulation, which matches the
    # arguments below -- adjust if either changes.
    steps_per_epoch = -(-len(encoded_data['train']) // batch_size)
    total_steps = epochs * steps_per_epoch
    warmup_steps = int(warmup_steps_proportion * total_steps)

    # Load fresh pretrained weights for every trial. Reusing one shared model
    # object would make each trial start from the previous trial's fine-tuned
    # weights, so trial scores would not be comparable.
    trial_model = AutoModelForSequenceClassification.from_pretrained(modelo)

    training_args = TrainingArguments(
        output_dir='./output/tuned-robertuito-hate-speech',
        logging_dir='./logs/tuned-robertuito-hate-speech',
        eval_strategy="epoch",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        eval_accumulation_steps=1,
        logging_steps=500,
        save_steps=1000,
        save_total_limit=2,
        fp16=True
    )

    trainer = Trainer(
        model=trial_model,
        train_dataset=encoded_data['train'],
        eval_dataset=encoded_data['validation'],
        args=training_args,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
    )

    # Train the model
    trainer.train()
    # Evaluate the model on the validation split
    eval_result = trainer.evaluate(encoded_data['validation'])
    # F1 is returned as-is: the study is expected to use direction="maximize".
    return eval_result['eval_f1']

In [None]:
# Create an in-memory study that maximizes the objective's return value
# (validation F1) and run 2 trials of the hyperparameter search.
study = optuna.create_study(direction="maximize")
study.optimize(find_hyperparameters_robertuito_hate_speech, n_trials=2)

# Print the best hyperparameters found
print("Best hyperparameters: ", study.best_params)

# Keep the best trial and its hyperparameters for later use (e.g. evaluating
# the best configuration on the test set; no test-set evaluation happens here).
best_trial = study.best_trial
best_hyperparameters = best_trial.params
print(best_hyperparameters)

[I 2024-05-25 22:33:40,016] A new study created in memory with name: no-name-44d0947e-1310-4ef2-87e0-bccb4549dabd
