Partimos del dataset `piuba-bigdata/contextualized_hate_speech` y creamos un dataset nuevo no balanceado: tendrá un 20 % de comentarios racistas (`RACISM == 1`) y un 80 % de comentarios no racistas (tomados de los comentarios sin discurso de odio, `HATEFUL == 0`).

In [None]:
!pip install -q datasets
!pip install -q transformers[torch] datasets
!pip install huggingface_hub
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, load_dataset, concatenate_datasets, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import random
import torch



In [None]:
# Hugging Face Hub id of the source corpus.
database = "piuba-bigdata/contextualized_hate_speech"
# Builder handle for the same dataset; it is not referenced again in the cells shown here.
ds_builder = load_dataset_builder(database)
# Load the dataset; later cells index it by the splits 'train', 'test' and 'dev'.
dataset = load_dataset(database)

In [None]:
# Positive class: keep the rows flagged as racist, then drop every column
# that is not needed downstream (only title, text and RACISM remain).
racism_dataset = (
    dataset
    .filter(lambda row: row["RACISM"] == 1)
    .remove_columns([
        "id", "context_tweet", "CALLS", "WOMEN", "LGBTI", "CLASS", "POLITICS",
        "DISABLED", "APPEARANCE", "CRIMINAL", "body", "HATEFUL",
    ])
)
# Negative class: keep the rows that are not hateful at all and drop the
# same set of columns.
# NOTE(review): this relies on HATEFUL == 0 implying RACISM == 0 - confirm.
no_hateful_dataset = (
    dataset
    .filter(lambda row: row["HATEFUL"] == 0)
    .remove_columns([
        "id", "context_tweet", "CALLS", "WOMEN", "LGBTI", "CLASS", "POLITICS",
        "DISABLED", "APPEARANCE", "CRIMINAL", "body", "HATEFUL",
    ])
)

In [None]:
# Rows per split in the racist-only subset.
racism_dataset.num_rows

{'train': 1562, 'test': 485, 'dev': 422}

In [None]:
# Rows per split in the non-hateful subset (before down-sampling).
no_hateful_dataset.num_rows

{'train': 30889, 'test': 9546, 'dev': 7719}

In [None]:
# Randomly down-sample a dataset to a fixed number of records.
def remove_records(dataset, num_keep, seed=42):
    """Return a random subset of ``dataset`` containing at most ``num_keep`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``
            (e.g. a ``datasets.Dataset``).
        num_keep: Number of rows to keep. Values larger than the dataset size
            keep every row; values below zero keep none.
        seed: Seed for the shuffle. The default (42) reproduces the selection
            made by the previous implementation.

    Returns:
        Whatever ``dataset.select`` returns for the chosen indices.
    """
    total = len(dataset)
    # Clamp the request. Without this, num_keep > total produced a negative
    # slice start below and silently kept only (num_keep - total) rows.
    num_keep = max(0, min(num_keep, total))
    num_remove = total - num_keep

    indices = list(range(total))
    # A private generator yields the same permutation as
    # random.seed(seed); random.shuffle(indices), but leaves the global
    # random state untouched for the rest of the notebook.
    random.Random(seed).shuffle(indices)

    # Drop the first `num_remove` shuffled indices and keep the remainder.
    return dataset.select(indices[num_remove:])

# Down-sample the non-hateful examples so that, per split, there are
# 80/20 = 4 of them for every racist example. The source 'dev' split is
# published as 'validation'.
reduced_no_hateful_dataset = DatasetDict({
    target_split: remove_records(
        no_hateful_dataset[source_split],
        round(racism_dataset[source_split].num_rows * 80 / 20),
    )
    for target_split, source_split in (
        ('train', 'train'),
        ('validation', 'dev'),
        ('test', 'test'),
    )
})

In [None]:
# Rows per split after down-sampling; each should be 4x the racist count of that split.
reduced_no_hateful_dataset.num_rows

{'train': 6248, 'validation': 1688, 'test': 1940}

In [None]:
# Merge the down-sampled non-hateful rows with the racist rows for every
# split, then shuffle (fixed seed) so the two classes are interleaved.
unbalanced_dataset = DatasetDict({
    target_split: concatenate_datasets(
        [reduced_no_hateful_dataset[target_split], racism_dataset[source_split]]
    ).shuffle(seed=42)
    for target_split, source_split in (
        ('train', 'train'),
        ('validation', 'dev'),
        ('test', 'test'),
    )
})

In [None]:
# Inspect the merged splits: features and row counts.
unbalanced_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'RACISM'],
        num_rows: 7810
    })
    validation: Dataset({
        features: ['title', 'text', 'RACISM'],
        num_rows: 2110
    })
    test: Dataset({
        features: ['title', 'text', 'RACISM'],
        num_rows: 2425
    })
})

In [None]:
# Renombrar columnas
import datasets

def rename_column(dataset, old_name, new_name):
    """Return ``dataset`` with the column ``old_name`` renamed to ``new_name``.

    Delegates to the dataset's own ``rename_column`` method instead of
    rebuilding the dataset from a Python dict, so the rows are not copied
    into Python objects and the column schema is not re-inferred.

    Args:
        dataset: A ``datasets.Dataset`` (anything exposing ``column_names``
            and ``rename_column``).
        old_name: Current column name.
        new_name: Desired column name.

    Returns:
        The renamed dataset. If ``old_name`` is not a column, ``dataset`` is
        returned unchanged, which keeps the cell safe to re-run.
    """
    # Tolerate a missing source column, as the dict-based version did.
    if old_name not in dataset.column_names:
        return dataset
    return dataset.rename_column(old_name, new_name)

# Apply both renames (label column first, then the text column) to every split.
for old_column, new_column in (('RACISM', 'racist'), ('text', 'comment')):
    for split_name in ('train', 'validation', 'test'):
        unbalanced_dataset[split_name] = rename_column(
            unbalanced_dataset[split_name], old_column, new_column
        )

In [None]:
# Map the string labels 'YES' / 'NO' to the integers 1 / 0.
def replace_classlabel(example):
    """Convert ``example['racist']`` from 'YES'/'NO' to 1/0.

    Any other value is left untouched. The example is modified in place and
    also returned, as expected by ``Dataset.map``.
    """
    label = example['racist']
    if label == 'YES':
        example['racist'] = 1
    elif label == 'NO':
        example['racist'] = 0
    return example

# Normalise the label column in every split.
for split_name in ('train', 'validation', 'test'):
    unbalanced_dataset[split_name] = unbalanced_dataset[split_name].map(replace_classlabel)

Map:   0%|          | 0/7810 [00:00<?, ? examples/s]

Map:   0%|          | 0/2110 [00:00<?, ? examples/s]

Map:   0%|          | 0/2425 [00:00<?, ? examples/s]

In [None]:
# Check the final schema: the columns should now be title, comment and racist.
unbalanced_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'comment', 'racist'],
        num_rows: 7810
    })
    validation: Dataset({
        features: ['title', 'comment', 'racist'],
        num_rows: 2110
    })
    test: Dataset({
        features: ['title', 'comment', 'racist'],
        num_rows: 2425
    })
})

In [None]:
# Spot-check a single training example.
unbalanced_dataset['train'][0]

{'title': 'Quién es Kyle Rittenhouse, el adolescente de 17 años acusado de los homicidios durante las protestas en Wisconsin',
 'comment': '@usuario Y después acá piden que el pueblo este armado para matar delicuentes. Ese tipo de doctrinas, tienen efectos colaterales: este es uno de ellos',
 'racist': 0}

In [None]:
# Publish all splits to the Hugging Face Hub under this repository id.
# NOTE(review): no login/token call is visible in this notebook - presumably
# authentication is done beforehand; confirm before re-running.
unbalanced_dataset.push_to_hub("amaiaruvi/racist_tweets_spanish_rioplatense")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/amaiaruvi/racist_tweets_spanish_rioplatense/commit/a964c59c9ebf24c6d2f822c36ddbfb1ca4301017', commit_message='Upload dataset', commit_description='', oid='a964c59c9ebf24c6d2f822c36ddbfb1ca4301017', pr_url=None, pr_revision=None, pr_num=None)