In [1]:
# Imports
!pip install -q transformers[torch] datasets pysentimiento accelerate evaluate
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
import random
import torch
import torch.nn.functional as F
import re

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Eliminar saltos de línea y espacios repetidos
def delete_spaces(comment):
    spaces_pattern = r'[\n\r]+|\s+'
    return re.sub(spaces_pattern, ' ', comment)

# Convertir todo a minúsculas
def lower_text(comment):
    return comment.lower()

# Eliminar URL
def delete_urls(comment):
    url_pattern = r'http[s]?://\S+'
    return re.sub(url_pattern, '', comment)

# Eliminar consonantes repetidas y puntos suspensivos
def delete_repeated_consonants(comment):
    repeated_consonant_pattern = r'([^aeiou\s\r\n0-9])\1{1,}'
    def replace(match):
        char = match.group(1)
        if char in 'rcnl':
            return char * 2
        else:
            return char

    return re.sub(repeated_consonant_pattern, replace, comment, flags=re.IGNORECASE)

# Dejar como máximo 2 vocales iguales contiguas
def delete_repeated_vowels(comment):
    repeated_vowels_pattern = r'([aeiouAEIOU])\1{2,}'
    return re.sub(repeated_vowels_pattern, r'\1\1', comment, flags=re.IGNORECASE)

# Eliminar acentos no empleados en Español
def delete_accents (comment):
    comment = re.sub(r"[àâãäå]", "a", comment)
    comment = re.sub(r"ç", "c", comment)
    comment = re.sub(r"[èêë]", "e", comment)
    comment = re.sub(r"[ìîï]", "i", comment)
    comment = re.sub(r"[òôõö]", "o", comment)
    comment = re.sub(r"[ùû]", "u", comment)
    comment = re.sub(r"[ýÿ]", "y", comment)
    return comment

# Eliminar caracteres inusuales
def delete_characters(comment):
    special_characters = r'[ºª|·~¬\^`[\]¨´#\\\'\(\)*\<>_]'
    return re.sub(special_characters, '', comment)

# Eliminar emoticonos
def delete_emoticons(comment):
    emoticon_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F900-\U0001F9FF]'
    return re.sub(emoticon_pattern, '', comment)

# Unificar las distintas formas de expresar la risa
def unify_laughs (comment):
    laugh_pattern = r"\b(a*ha+h[ha]*|o?l+o+l+[ol]*|x+d+[x*d*]*|a*ja+[j+a+]+|j+e+j+[ej]*)\b"
    return re.sub(laugh_pattern, 'jaja', comment, flags=re.IGNORECASE)

# Función para preprocesar el texto
def preprocess_comment(comment):
    comment = delete_spaces(comment)
    comment = lower_text(comment)
    comment = delete_urls(comment)
    comment = delete_repeated_consonants(comment)
    comment = delete_repeated_vowels(comment)
    comment = delete_accents(comment)
    comment = delete_characters(comment)
    comment = delete_emoticons(comment)
    comment = unify_laughs(comment)
    return comment

In [3]:
def compute_metrics(pred):
    y_true = pred.label_ids                 # son las labels reales
    y_pred = pred.predictions.argmax(-1)    # son las predicciones
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
# Cargar el dataset
database = "amaiaruvi/news_racist_comments_spanish"
dataset = load_dataset(database)
dataset

Downloading readme:   0%|          | 0.00/623 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/406k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/121k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3005 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/438 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/851 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 3005
    })
    validation: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 438
    })
    test: Dataset({
        features: ['link', 'title', 'comment', 'racist'],
        num_rows: 851
    })
})

In [5]:
# Cargar el modelo
modelo = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(modelo)
# BertForSequenceClassification ya incluye una capa de clasificación para tareas como clasificación de texto. Solo necesitas especificar el número de clases (en este caso, 2).
model = AutoModelForSequenceClassification.from_pretrained(modelo, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model.config

BertConfig {
  "_name_or_path": "dccuchile/bert-base-spanish-wwm-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31002
}

In [7]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [8]:
# Este modelo admite 125128 tokens por cada sentencia
tokenizer.model_max_length

512

In [9]:
tokenizer.get_vocab()

{'##rid': 4250,
 'az': 3129,
 'massachusetts': 19339,
 'llamaré': 8172,
 '##máticamente': 9892,
 'griff': 16672,
 'instan': 14928,
 'resultaba': 24949,
 'fundamentos': 20100,
 'hussein': 29367,
 'dong': 19739,
 '[unused887]': 893,
 '##ólogos': 10608,
 'agus': 12574,
 'oraciones': 16729,
 'macar': 22471,
 'activ': 1929,
 '##zano': 21871,
 'hídricos': 20711,
 'detendrá': 25334,
 'formaban': 21853,
 'pine': 23057,
 '900': 12269,
 'sung': 25972,
 'presidenciales': 17224,
 'amos': 20489,
 'escotilla': 29196,
 'ofrecí': 30378,
 'volver': 2294,
 'muévanse': 11008,
 'rode': 5458,
 'agreg': 9588,
 'rami': 16282,
 'solamente': 4635,
 '##emente': 5607,
 'estatuto': 7672,
 'caídos': 27179,
 'barbara': 18264,
 'sisté': 27849,
 'comportarse': 30770,
 'contengan': 20696,
 'society': 17533,
 'mango': 25956,
 'archipiélago': 18435,
 '##bao': 16936,
 'déjame': 3687,
 'culp': 5476,
 'dichos': 7142,
 'gatillo': 17696,
 'revol': 24165,
 'constatar': 30202,
 '[unused614]': 620,
 '##emp': 1374,
 'aplicado': 

In [12]:
print("Preprocessing data...")
preprocessed_data = dataset.map(lambda ex: {
    "comment": preprocess_comment(ex["comment"]),
    "title": preprocess_comment(ex["title"]),
    "label": ex["racist"]
})

Preprocessing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [20]:
def custom_tokenizer(examples):
    return tokenizer(
        examples["comment"],
        examples["title"],
        padding=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

In [21]:
print("Tokenizing data...")
encoded_data = preprocessed_data.map(custom_tokenizer, batched=True)
encoded_data = encoded_data.remove_columns(['link', 'title', 'comment', 'racist'])

Tokenizing data...


Map:   0%|          | 0/3005 [00:00<?, ? examples/s]

Map:   0%|          | 0/438 [00:00<?, ? examples/s]

Map:   0%|          | 0/851 [00:00<?, ? examples/s]

In [22]:
preprocessed_data['test'][1]

{'link': 'https://okdiario.com/espana/vox-empapela-gerona-carteles-arabe-estas-espana-hombres-mujeres-tienen-mismos-derechos-12797483',
 'title': 'vox empapela gerona con carteles en árabe: «estás en españa, hombres y mujeres tienen los mismos derechos»',
 'comment': 'mira quien habla, los de los tiros en la nuca.',
 'racist': 0,
 'label': 0}

In [25]:
tokenized = custom_tokenizer(preprocessed_data['test'][1])
tokens_strings = tokenizer.convert_ids_to_tokens(tokenized['input_ids'])

print("Texto a tokenizar:", preprocessed_data['test'][1]['comment'], ' + ', preprocessed_data['test'][1]['title'])
print("Tokens:", tokens_strings)
print("\n\ninput_ids:", tokenized['input_ids'])
print("token_type_ids:", tokenized['token_type_ids'])
print("attention_mask:", tokenized['attention_mask'])

Texto a tokenizar: mira quien habla, los de los tiros en la nuca.  +  vox empapela gerona con carteles en árabe: «estás en españa, hombres y mujeres tienen los mismos derechos»
Tokens: ['[CLS]', 'mira', 'quien', 'habla', ',', 'los', 'de', 'los', 'tiros', 'en', 'la', 'nuca', '.', '[SEP]', 'vo', '##x', 'empa', '##pel', '##a', 'ger', '##ona', 'con', 'carteles', 'en', 'árabe', ':', '[UNK]', 'estás', 'en', 'españa', ',', 'hombres', 'y', 'mujeres', 'tienen', 'los', 'mismos', 'derechos', '[UNK]', '[SEP]']


input_ids: [4, 2065, 1925, 2892, 1019, 1067, 1009, 1067, 15737, 1035, 1032, 27509, 1008, 5, 2107, 30991, 7431, 13722, 30956, 4601, 1791, 1048, 19949, 1035, 6338, 995, 3, 1499, 1035, 2942, 1019, 2305, 1040, 2209, 2018, 1067, 4549, 1889, 3, 5]
token_type_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [26]:
# Aquí cambiaríamos los hiperparámetros
epochs = 5  # 8
batch_size = 16  # 8
learning_rate = 2e-5  # 4.5e-5
weight_decay = 0.01  # 0.16
warmup_proportion = 0.1  # 0.2
total_steps = (epochs * len(dataset['train'])) / batch_size
warmup_steps = int(warmup_proportion * total_steps)

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    evaluation_strategy="epoch",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    eval_accumulation_steps=1,
    logging_steps=500,
    save_steps=1000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    train_dataset=encoded_data['train'],
    eval_dataset=encoded_data['validation'],
    args=training_args,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

)



In [27]:
# Entrenamiento
print("Training the model...")
trainer.train()

Training the model...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.406916,0.828767,0.686603,0.796353,0.657293
2,No log,0.473297,0.844749,0.762261,0.782475,0.747479
3,0.251700,0.759501,0.847032,0.764786,0.786855,0.74895
4,0.251700,0.874431,0.863014,0.795231,0.807955,0.784664
5,0.251700,0.902995,0.863014,0.79023,0.812483,0.77377


TrainOutput(global_step=940, training_loss=0.14203561569782014, metrics={'train_runtime': 787.327, 'train_samples_per_second': 19.084, 'train_steps_per_second': 1.194, 'total_flos': 2408514489655200.0, 'train_loss': 0.14203561569782014, 'epoch': 5.0})

In [28]:
# Evaluar el modelo con el conjunto de validación: se queda con el mejor valor
print("Evaluating with validation set.")
trainer.evaluate()

Evaluating with validation set.


{'eval_loss': 0.9029948711395264,
 'eval_accuracy': 0.863013698630137,
 'eval_f1': 0.7902298850574712,
 'eval_precision': 0.8124828720197315,
 'eval_recall': 0.7737695078031213,
 'eval_runtime': 5.6872,
 'eval_samples_per_second': 77.014,
 'eval_steps_per_second': 4.923,
 'epoch': 5.0}

In [29]:
# Predicciones
print("Predictions:")
test_predictions = trainer.predict(encoded_data["test"])
y_true = test_predictions.label_ids

logits = test_predictions.predictions
# Convertir los logits a un tensor de PyTorch
logits_tensor = torch.tensor(logits)
# Aplicar la función softmax a los logits para obtener probabilidades
probabilities = F.softmax(logits_tensor, dim=1)
# Obtener las clases predichas (índice de la probabilidad más alta)
y_pred = torch.argmax(probabilities, dim=1)
reporte = classification_report(y_true, y_pred, output_dict=False)
print(reporte)

Predictions:


              precision    recall  f1-score   support

           0       0.91      0.92      0.91       654
           1       0.72      0.69      0.71       197

    accuracy                           0.87       851
   macro avg       0.82      0.81      0.81       851
weighted avg       0.87      0.87      0.87       851



In [30]:
# Evaluación del modelo
# Crear el `Trainer` con el conjunto de evaluación y la función para métricas
print("Test evaluating with trainer:")
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=encoded_data["test"],  # Conjunto de evaluación
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # Función para calcular métricas
)

# Evaluar el modelo
trainer.evaluate()

Test evaluating with trainer:


{'eval_loss': 0.8027085661888123,
 'eval_accuracy': 0.8672150411280846,
 'eval_f1': 0.8103462217357433,
 'eval_precision': 0.8156991110683226,
 'eval_recall': 0.8054223132926621,
 'eval_runtime': 21.8053,
 'eval_samples_per_second': 39.027,
 'eval_steps_per_second': 2.476}