In [1]:
# Imports
!pip install -q transformers[torch] datasets pysentimiento accelerate evaluate
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, load_dataset, concatenate_datasets, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
from pysentimiento import create_analyzer
from pysentimiento.preprocessing import preprocess_tweet
import random
import torch
import re

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Eliminar saltos de línea y espacios repetidos
def delete_spaces(comment):
    spaces_pattern = r'[\n\r]+|\s+'
    return re.sub(spaces_pattern, ' ', comment)

# Convertir todo a minúsculas
def lower_text(comment):
    return comment.lower()

# Eliminar URL
def delete_urls(comment):
    url_pattern = r'http[s]?://\S+'
    return re.sub(url_pattern, '', comment)

# Eliminar consonantes repetidas y puntos suspensivos
def delete_repeated_consonants(comment):
    repeated_consonant_pattern = r'([^aeiou\s\r\n0-9])\1{1,}'
    def replace(match):
        char = match.group(1)
        if char in 'rcnl':
            return char * 2
        else:
            return char

    return re.sub(repeated_consonant_pattern, replace, comment, flags=re.IGNORECASE)

# Dejar como máximo 2 vocales iguales contiguas
def delete_repeated_vowels(comment):
    repeated_vowels_pattern = r'([aeiouAEIOU])\1{2,}'
    return re.sub(repeated_vowels_pattern, r'\1\1', comment, flags=re.IGNORECASE)

# Eliminar acentos no empleados en Español
def delete_accents (comment):
    comment = re.sub(r"[àâãäå]", "a", comment)
    comment = re.sub(r"ç", "c", comment)
    comment = re.sub(r"[èêë]", "e", comment)
    comment = re.sub(r"[ìîï]", "i", comment)
    comment = re.sub(r"[òôõö]", "o", comment)
    comment = re.sub(r"[ùû]", "u", comment)
    comment = re.sub(r"[ýÿ]", "y", comment)
    return comment

# Eliminar caracteres inusuales
def delete_characters(comment):
    special_characters = r'[ºª|·~¬\^`[\]¨´#\\\'\(\)*\<>_]'
    return re.sub(special_characters, '', comment)

# Eliminar emoticonos
def delete_emoticons(comment):
    emoticon_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F900-\U0001F9FF]'
    return re.sub(emoticon_pattern, '', comment)

# Unificar las distintas formas de expresar la risa
def unify_laughs (comment):
    laugh_pattern = r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+|j+e+j+[ej]*)\b"
    return re.sub(laugh_pattern, 'jaja', comment, flags=re.IGNORECASE)

# Función para preprocesar el texto
def preprocess_comment(comment):
    comment = delete_spaces(comment)
    comment = lower_text(comment)
    comment = delete_urls(comment)
    comment = delete_repeated_consonants(comment)
    comment = delete_repeated_vowels(comment)
    comment = delete_accents(comment)
    comment = delete_characters(comment)
    comment = delete_emoticons(comment)
    comment = unify_laughs(comment)
    return comment

In [3]:
def get_metrics(predictions, labels):
    y_true = labels[:, 0]
    y_pred = predictions[:, 0]
    acc = accuracy_score(y_true, y_pred)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0,
    )

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

def compute_metrics(predictions):
    outputs = predictions.predictions
    labels = predictions.label_ids

    binary_predictions = outputs > 0

    return get_metrics(binary_predictions, labels)

In [4]:
# Cargar el dataset
database_checkpoint = "amaiaruvi/news_racist_comments_spanish"
dataset = load_dataset(database_checkpoint)
dataset

Downloading readme:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/406k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3005 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/438 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/851 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 3005
    })
    validation: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 438
    })
    test: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 851
    })
})

In [5]:
# Cargar el modelo
modelo = "pysentimiento/robertuito-hate-speech"
tokenizer = AutoTokenizer.from_pretrained(modelo)
model = AutoModelForSequenceClassification.from_pretrained(modelo)

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/956 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/435M [00:00<?, ?B/s]

In [6]:
# Configuración del modelo:
model.config

RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-hate-speech",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "hateful",
    "1": "targeted",
    "2": "aggressive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "aggressive": 2,
    "hateful": 0,
    "targeted": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.41.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30002
}

In [7]:
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [8]:
# Este modelo sólo admite 128 tokens por cada sentencia; por lo que hay que truncar
tokenizer.model_max_length

128

In [9]:
tokenizer.get_vocab()

{'▁oficial,': 25977,
 '▁medalla': 5927,
 'mula': 5601,
 '▁he': 723,
 '▁video,': 15824,
 '▁garchar': 29975,
 '▁tocará': 25075,
 '▁serio.': 13725,
 '▁paz': 2066,
 '▁af': 1177,
 "▁c'est": 13172,
 '▁chica': 3669,
 '▁gabriela': 20653,
 '▁ts': 7764,
 '▁explos': 10790,
 'dra': 4620,
 'íso': 11518,
 '▁balonmano': 22839,
 '▁sábado.': 20165,
 '▁paso.': 18353,
 '▁mix': 7457,
 '▁rcn': 25903,
 'late': 20106,
 '▁cerrado': 3506,
 'os"': 16866,
 'bul': 13304,
 '▁pár': 20763,
 '▁ído': 8082,
 '▁inmen': 9688,
 '▁oscura': 23715,
 'nada': 3067,
 'loto': 26793,
 '▁habia': 6903,
 '...¿': 24696,
 '็': 215,
 '▁tamb': 2594,
 'ende': 11564,
 '인': 382,
 '▁brian': 18182,
 '▁admira': 12127,
 'gate': 14061,
 'vaila': 6195,
 'rete': 15554,
 '▁encuentre': 16010,
 'cú': 13448,
 '을': 379,
 '▁bipol': 21745,
 '▁gordito': 20633,
 '▁hor': 1116,
 '▁queroirnoshowak': 28517,
 '▁intent': 4094,
 'entos': 5041,
 'cou': 20237,
 '▁incluidos': 29188,
 '▁い': 9059,
 '▁sevilla,': 28466,
 '▁pésame': 25223,
 '▁ching': 4850,
 'pot': 23453

In [10]:
# Se especifica que para utilizar el modelo ""pysentimiento/robertuito-hate-speech"
# antes hay que preprocesar el texto con su función "preprocess_tweet".

print("Preprocessing data...")
preprocessed_data = dataset.map(lambda ex: {
    "comment": preprocess_comment(preprocess_tweet(ex["comment"], lang="es")),
    "title": preprocess_comment(preprocess_tweet(ex["title"], lang="es")),
    "labels": torch.tensor([ex["racist"], 0, 0], dtype=torch.float)
})

Preprocessing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [11]:
def custom_tokenizer(examples):
    return tokenizer(
        examples["comment"],
        examples["title"],
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
        #max_length=1024
    )

'''
Se puede mofificar el parámetro max_length, pero es importante considerar que
aumentar max_length también puede afectar la precisión del modelo en algunas
tareas, especialmente si el modelo se preentrenó con secuencias más cortas.
Esto se debe a que las secuencias más largas pueden contener más ruido o
información irrelevante, lo que podría dificultar que el modelo aprenda
representaciones significativas.
'''

'\nSe puede mofificar el parámetro max_length, pero es importante considerar que\naumentar max_length también puede afectar la precisión del modelo en algunas\ntareas, especialmente si el modelo se preentrenó con secuencias más cortas.\nEsto se debe a que las secuencias más largas pueden contener más ruido o\ninformación irrelevante, lo que podría dificultar que el modelo aprenda\nrepresentaciones significativas.\n'

In [12]:
print("Tokenizing data...")
encoded_data = preprocessed_data.map(custom_tokenizer, batched=True)
encoded_data = encoded_data.remove_columns(['link', 'title', 'comment', 'racist'])

Tokenizing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [13]:
preprocessed_data['test'][1]

{'link': 'https://okdiario.com/espana/vox-empapela-gerona-carteles-arabe-estas-espana-hombres-mujeres-tienen-mismos-derechos-12797483',
 'title': 'vox empapela gerona con carteles en árabe: "estás en españa, hombres y mujeres tienen los mismos derechos"',
 'comment': 'mira quien habla, los de los tiros en la nuca.',
 'racist': 0,
 'labels': [0.0, 0.0, 0.0]}

In [14]:
tokenized = custom_tokenizer(preprocessed_data['test'][1])
tokens_strings = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

print("Texto a tokenizar:", preprocessed_data['test'][1]['comment'], ' + ', preprocessed_data['test'][1]['title'])
print("Tokens:", tokens_strings)
print("\n\ninput_ids:", tokenized['input_ids'])
print("token_type_ids:", tokenized['token_type_ids'])
print("attention_mask:", tokenized['attention_mask'])

Texto a tokenizar: mira quien habla, los de los tiros en la nuca.  +  vox empapela gerona con carteles en árabe: "estás en españa, hombres y mujeres tienen los mismos derechos"
Tokens: ['<s>', '▁mira', '▁quien', '▁habla,', '▁los', '▁de', '▁los', '▁tiros', '▁en', '▁la', '▁nu', 'ca.', '</s>', '</s>', '▁vox', '▁empa', 'pe', 'la', '▁ger', 'ona', '▁con', '▁carteles', '▁en', '▁ára', 'be', ':', '▁"', 'est', 'ás', '▁en', '▁españa,', '▁hombres', '▁y', '▁mujeres', '▁tienen', '▁los', '▁mismos', '▁derechos', '"', '</s>']


input_ids: [0, 1659, 1005, 24880, 497, 413, 497, 13470, 452, 446, 2255, 4531, 2, 2, 4601, 6103, 724, 486, 4387, 918, 461, 21312, 452, 17837, 773, 30, 576, 2005, 537, 452, 8478, 2976, 445, 2054, 1215, 497, 4212, 3186, 6, 2]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [15]:
# Aquí cambiaríamos los hiperparámetros
epochs = 5  # 8
batch_size = 16  # 8
learning_rate = 2e-5  # 4.5e-5
weight_decay = 0.01  # 0.16
warmup_proportion = 0.1  # 0.2
total_steps = (epochs * len(dataset['train'])) / batch_size
warmup_steps = int(warmup_proportion * total_steps)

training_args = TrainingArguments(
    output_dir='./output',
    logging_dir='./logs',
    evaluation_strategy="epoch",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    eval_accumulation_steps=1,
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
)



In [17]:
# Entrenamiento
print("Training the model...")
trainer.train()

Training the model...
{'eval_loss': 0.12490922957658768, 'eval_accuracy': 0.819634703196347, 'eval_f1': 0.4968152866242038, 'eval_precision': 0.6610169491525424, 'eval_recall': 0.3979591836734694, 'eval_runtime': 2.9111, 'eval_samples_per_second': 150.459, 'eval_steps_per_second': 9.618, 'epoch': 1.0}
{'eval_loss': 0.11551877111196518, 'eval_accuracy': 0.8538812785388128, 'eval_f1': 0.6444444444444445, 'eval_precision': 0.7073170731707317, 'eval_recall': 0.5918367346938775, 'eval_runtime': 3.0041, 'eval_samples_per_second': 145.801, 'eval_steps_per_second': 9.321, 'epoch': 2.0}
{'loss': 0.1058, 'grad_norm': 2.774268388748169, 'learning_rate': 1.0389610389610389e-05, 'epoch': 2.6595744680851063}
{'eval_loss': 0.14854760468006134, 'eval_accuracy': 0.865296803652968, 'eval_f1': 0.6740331491712708, 'eval_precision': 0.7349397590361446, 'eval_recall': 0.6224489795918368, 'eval_runtime': 3.0623, 'eval_samples_per_second': 143.029, 'eval_steps_per_second': 9.143, 'epoch': 3.0}
{'eval_loss': 0

TrainOutput(global_step=940, training_loss=0.0684297835573237, metrics={'train_runtime': 337.0432, 'train_samples_per_second': 44.579, 'train_steps_per_second': 2.789, 'train_loss': 0.0684297835573237, 'epoch': 5.0})

In [18]:
# Evaluar el modelo con el conjunto de validación: se queda con el mejor valor
print("Evaluating with validation set.")
trainer.evaluate()

Evaluating with validation set.
{'eval_loss': 0.18007436394691467, 'eval_accuracy': 0.863013698630137, 'eval_f1': 0.6470588235294118, 'eval_precision': 0.7638888888888888, 'eval_recall': 0.5612244897959183, 'eval_runtime': 2.8996, 'eval_samples_per_second': 151.054, 'eval_steps_per_second': 9.656, 'epoch': 5.0}


{'eval_loss': 0.18007436394691467,
 'eval_accuracy': 0.863013698630137,
 'eval_f1': 0.6470588235294118,
 'eval_precision': 0.7638888888888888,
 'eval_recall': 0.5612244897959183,
 'eval_runtime': 2.8996,
 'eval_samples_per_second': 151.054,
 'eval_steps_per_second': 9.656,
 'epoch': 5.0}

In [19]:
# Predicciones
print("Predictions:")
test_predictions = trainer.predict(encoded_data["test"])
y_true = test_predictions.label_ids[:, 0]
y_pred = (test_predictions.predictions > 0)[:, 0]
reporte = classification_report(y_true, y_pred, output_dict=False)
print(reporte)

Predictions:
              precision    recall  f1-score   support

         0.0       0.90      0.94      0.92       654
         1.0       0.77      0.67      0.72       197

    accuracy                           0.88       851
   macro avg       0.84      0.80      0.82       851
weighted avg       0.87      0.88      0.87       851



In [20]:
# Evaluación del modelo
# Crear el `Trainer` con el conjunto de evaluación y la función para métricas
print("Test evaluating with trainer:")
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_data["test"],
    tokenizer=custom_tokenizer,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
)

# Evaluar el modelo
trainer.evaluate()

Test evaluating with trainer:
{'eval_loss': 0.155148446559906, 'eval_accuracy': 0.8766157461809636, 'eval_f1': 0.7154471544715446, 'eval_precision': 0.7674418604651163, 'eval_recall': 0.6700507614213198, 'eval_runtime': 5.7251, 'eval_samples_per_second': 148.643, 'eval_steps_per_second': 9.432}


{'eval_loss': 0.155148446559906,
 'eval_accuracy': 0.8766157461809636,
 'eval_f1': 0.7154471544715446,
 'eval_precision': 0.7674418604651163,
 'eval_recall': 0.6700507614213198,
 'eval_runtime': 5.7251,
 'eval_samples_per_second': 148.643,
 'eval_steps_per_second': 9.432}