In [30]:
#!pip install evaluate datasets pyspark transformers 

In [31]:
from pathlib import Path
from shutil import copyfile

from datasets import load_dataset

from evaluate import load as evaluate_load

from numpy import argmax

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, trim, when

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Limpieza de datos con PySpark

In [None]:
# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load both EXIST 2021 splits, keeping only the Spanish-language rows.

def read_spanish_rows(tsv_path):
  """Read a tab-separated file with a header row and return the `language`,
  `text` and `task1` columns of the rows whose `language` is 'es'."""
  full_frame = spark.read.csv(tsv_path, sep=r'\t', header=True)
  wanted_columns = full_frame.select('language', 'text', 'task1')
  return wanted_columns.where(col('language') == 'es')


df_train = read_spanish_rows('./EXIST2021_training.tsv')
df_test = read_spanish_rows('./EXIST2021_test_labeled.tsv')

In [None]:
# Regular expressions used by the Spark cleaning step. Spark evaluates them
# with java.util.regex, so they follow Java (not Python) regex syntax.
# Raw strings keep every backslash intact instead of relying on Python passing
# unknown escapes such as '\/' or '\w' through unchanged (which triggers a
# SyntaxWarning on recent Python versions).

# http(s):// and www. links. Same pattern text as before, now a raw string.
url_pattern = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'

# @mentions and #hashtags. The leading (?U) enables Java's
# UNICODE_CHARACTER_CLASS so that \w also covers accented letters and 'ñ';
# with the ASCII-only default a tag such as '#mañana' is only removed up to
# '#ma', leaving 'ñana' behind in the text.
hash_menc_pattern = r'(?U)(@\w)\w+|(#[\w])\w+'

# Leading/trailing runs of non-word characters. (?U) is needed here as well:
# with the ASCII-only default \W matches 'á', 'é', 'ñ', ... so a text ending
# in 'mamá' would lose its last letter.
non_letters_pattern = r'(?U)(^\W+)|(\W+$)'

def process_df(df):
  """Clean the `text` column and binarise `task1` into a `labels` column.

  The text is trimmed and lower-cased; then every match of `url_pattern`,
  `hash_menc_pattern` and `non_letters_pattern` is replaced by the empty
  string, in that order. `labels` is 0 when `task1` equals 'non-sexist' and
  1 otherwise. Rows whose cleaned text is the empty string are dropped.

  Returns a DataFrame with exactly the columns `text` and `labels`.
  """
  # Build the cleaning pipeline as one nested column expression.
  cleaned_text = lower(trim(col('text')))
  for pattern in (url_pattern, hash_menc_pattern, non_letters_pattern):
    cleaned_text = regexp_replace(cleaned_text, pattern, '')

  binary_label = when(col('task1') == 'non-sexist', 0).otherwise(1)

  projected = df.select(
      cleaned_text.alias('text'),
      binary_label.alias('labels'),
  )
  return projected.where(col("text") != '')

In [None]:
# Replace each raw split with its cleaned (`text`, `labels`) version.
# NOTE: this cell is not idempotent. process_df reads `task1`, a column its
# own output no longer has, so re-running this cell on its own fails; re-run
# the loading cell first.
df_train, df_test = process_df(df_train), process_df(df_test)

In [None]:
def export_tsv(df, spark_dir, csv_path):
  """Write `df` as one tab-separated file with a header row at `csv_path`.

  Spark's CSV writer produces a directory (`spark_dir`) containing a
  `part-*.csv` file rather than a single file, while the dataset-loading
  cell further down reads './train.csv' and './test.csv'. The part file is
  therefore copied to `csv_path` after the write.

  Assumes Spark is writing to the local filesystem (TODO confirm if this
  notebook is ever run against a cluster).
  """
  # mode='overwrite' lets the cell be re-run; the default mode raises when
  # the output directory already exists.
  df.coalesce(1).write.csv(spark_dir, sep=r'\t', header=True, mode='overwrite')

  # coalesce(1) leaves a single partition, so exactly one part file is
  # expected; the tuple-unpacking fails loudly if that is not the case.
  part_file, = Path(spark_dir).glob('part-*.csv')
  copyfile(part_file, csv_path)


export_tsv(df_train, 'train', './train.csv')
export_tsv(df_test, 'test', './test.csv')

# NLP 

In [6]:
# Load the cleaned train/test files (tab-delimited) as a dataset with one
# entry per split.
split_files = {
    'train': './train.csv',
    'test': './test.csv',
}

dataset = load_dataset("csv", data_files=split_files, delimiter="\t")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0b5472aa072d3966/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0b5472aa072d3966/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Peek at the first training example to check the `text` / `labels` fields.
dataset['train'][0]

{'text': 'nadie te va a tratar tan bien como un hombre que te lo quiere meter por primera vez',
 'labels': 1}

## Preparamos el modelo y tokenizador

In [8]:
# Fine-tuning starts from a checkpoint that was pre-trained as a fill-mask
# model; the tokenizer is loaded from the same checkpoint.
model_checkpoint = 'PlanTL-GOB-ES/roberta-base-bne'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [9]:
# Function that tokenizes the texts of the dataset.

def tokenize_text(text):
  """Tokenize the 'text' field of `text` with the module-level `tokenizer`.

  `text` is whatever `Dataset.map` passes in; with `batched=True`, as used in
  this notebook, that is a batch, so `text['text']` holds several strings.
  Truncation is enabled (`truncation=True`); no padding is requested here.
  """
  raw_texts = text['text']
  return tokenizer(raw_texts, truncation=True)

In [11]:
# Tokenize every split in batches. The raw 'text' column is dropped
# afterwards, as it is no longer needed once the texts are tokenized.
encoded_dataset = dataset.map(tokenize_text, batched=True, remove_columns=['text'])

# encoded_dataset['train'][0]

Map:   0%|          | 0/3541 [00:00<?, ? examples/s]



In [33]:
# Metric definition

# Accuracy metric object; loaded on the first evaluation pass and then reused
# instead of being loaded again on every call to compute_metrics.
_accuracy_metric = None


def compute_metrics(eval_pred):
  """Return the accuracy for one evaluation pass.

  `eval_pred` unpacks into `(predictions, labels)`. The predicted class of
  each row is the arg-max of `predictions` along axis 1, and it is compared
  against `labels`. The return value is whatever the metric's `compute`
  returns.
  """
  global _accuracy_metric
  if _accuracy_metric is None:
    _accuracy_metric = evaluate_load("accuracy")

  predictions, labels = eval_pred

  return _accuracy_metric.compute(
      predictions=argmax(predictions, axis=1),
      references=labels
  )


# Load the checkpoint for sequence classification with two output labels
# (0 = non-sexist, 1 = sexist, as encoded during cleaning).
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)


# Trainer hyper-parameters

NUM_TRAIN_EPOCHS = 2
TRAIN_BATCH_SIZE = 16

# Logging interval in steps: len(train) // (2 * batch_size * epochs).
# The batch size and epoch count come from the constants above so the formula
# cannot drift from the values actually passed to TrainingArguments. max()
# keeps the interval at 1 or more for small training sets, where the integer
# division would otherwise give 0.
logging_interval = max(
    1,
    len(encoded_dataset['train']) // (2 * TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS),
)

training_args = TrainingArguments(
    output_dir="model_results",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=2e-5,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best accuracy when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=logging_interval,
)


# Wire the model, data splits, metric function and hyper-parameters together.
# NOTE(review): the 'test' split doubles as the evaluation set, and
# load_best_model_at_end=True selects the checkpoint by accuracy on it, so the
# reported accuracy is not measured on held-out data. Consider carving a
# validation split out of 'train'.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.weight', 'classifie

## Entrenando el modelo y guardándolo

In [34]:
# Fine-tune the model; evaluation and checkpointing follow `training_args`.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4399,0.476087,0.797758
2,0.2874,0.548604,0.794956


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=444, training_loss=0.3924004544277449, metrics={'train_runtime': 146.9032, 'train_samples_per_second': 48.209, 'train_steps_per_second': 3.022, 'total_flos': 253116432477780.0, 'train_loss': 0.3924004544277449, 'epoch': 2.0})

In [15]:
# Save the trainer's model under ./modelo_sexismo.
trainer.save_model('./modelo_sexismo')