# Configuración (importar dependencias, librerías, ...)

In [1]:
# Hiperparámetros
model_checkpoint = 'xlm-roberta-base'
#model_checkpoint = 'PlanTL-GOB-ES/roberta-base-biomedical-es'
#model_checkpoint = 'Jacinto/autotrain-i2c-edos-1988966268'
#model_checkpoint = 'bert-base-uncased'
#model_checkpoint = 'nghuyong/ernie-2.0-base-en'

BATCH_SIZE = 8
NUM_TRAIN_EPOCHS = 10
LEARNING_RATE = 2e-5
MAX_LENGTH = 64
WEIGHT_DECAY = 0.1

In [2]:
# Set the seed value all over the place to make this reproducible.
# esto hay que ponerlo justo antes de importar para que los experimentos
# sean reproducible

!pip install pytorch-lightning
import random
import torch
import numpy as np
import os
from pytorch_lightning import seed_everything

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)# Store the average loss after eachepoch so we can plot them.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["TF_DETERMINISTIC_OPS"] = "1" # See:https://github.com/NVIDIA/tensorflow-determinism#confirmed-current-gpu-specific-sources-of-non-determinism-with-solutions
seed_everything(42, workers=True)

!pip install transformers datasets
!pip install --upgrade accelerate
!pip install sentencepiece
!pip install contractions
!pip install textblob
from google.colab import drive
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import sklearn as sk
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.0/719.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.7.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.8.0-py3-none-any.whl (20 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]>2021.06.0->pytorch-lightning)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting multidict

INFO:lightning_fabric.utilities.seed:Global seed set to 42


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m11

In [3]:
# Check that pyTorch is identifying the GPU
if torch.cuda.device_count() > 0:
  print(f'GPU detected. Currently using: "{torch.cuda.get_device_name(0)}"')
else:
  print('Currently using CPU, change the type of the runtime in the \'runtime\' tab')

GPU detected. Currently using: "Tesla T4"


# Preparación de los datos

## Lectura de los ficheros

In [None]:
# SOLO SE HACE UNA VEZ PARA CREAR LOS FICHEROS tsv
# Para el EXIST 2023 #
# Se carga los datos en json y se pasan a tsv
import json
train_data_json_path = '/content/drive/MyDrive/EXIST 2023/EXIST2023_training_etiquetas.json'
test_data_json_path = '/content/drive/MyDrive/EXIST 2023/EXIST2023_dev_etiquetas.json'

with open(test_data_json_path, 'r') as f:
    data = json.load(f)

# Convierte el JSON en un DataFrame de Pandas
df = pd.DataFrame(data)

df_transposed = df.T

# Imprime el DataFrame
df_transposed

# Guardar el DataFrame en un archivo TSV
ruta_archivo = '/content/drive/MyDrive/EXIST 2023/EXIST_2023_test.tsv'
df_transposed.to_csv(ruta_archivo, sep='\t', index=False)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Cargamos los datos de entrenamiento y test 
#train_data_path = '/content/drive/MyDrive/EXIST 2023/EXIST_2023_train.tsv'
#test_data_path = '/content/drive/MyDrive/EXIST 2023/EXIST_2023_test.tsv'

train_data_path = '/content/drive/MyDrive/DueloSano/Nuevos-2023/New_ES_DS_Entrenamiento.tsv'
test_data_path = '/content/drive/MyDrive/DueloSano/Nuevos-2023/New_ES_DS_Test.tsv'

# Los pasamos a dataframes
train_df_full = pd.read_csv(train_data_path, encoding = 'UTF-8', sep='\t')
test_df = pd.read_csv(test_data_path, encoding = 'UTF-8', sep='\t')

nombre_etiqueta = 'label'
campo_texto = 'tweet'

#### Sólo para el EXIST 2023
#### Elimino las columnas que no se van a usar
#columnas_a_eliminar = ['lang','number_annotators','annotators','gender_annotators','age_annotators','labels_task1','labels_task2','labels_task3','split']
#train_df_full = train_df_full.drop(columnas_a_eliminar, axis=1)
#test_df = test_df.drop(columnas_a_eliminar, axis=1)

#Limpiamos el dataset de training para "eliminar" las columnas con label 2
#valor_a_eliminar = 2
#train_df_full = train_df_full.drop(train_df_full[train_df_full[nombre_etiqueta] == valor_a_eliminar].index)
#test_df = test_df.drop(test_df[test_df[nombre_etiqueta] == valor_a_eliminar].index)
#####################################################



######## Undersampling manual ########################
# Para hacer un undersampling manual, se construye un dataframe para cada clase
# Por ejemplo, si se quiere hacer undersampling de la clase mayoritaria (0), se guarda
# en df_0 el número de filas de clase 0 que se quiere mantener y en df_1 todas las filas de clase 1

# df_0 = train_df_full[train_df_full[nombre_etiqueta]==0][:] 
# df_1 = train_df_full[train_df_full[nombre_etiqueta]==1][:]

# Se vuelve a construir el fichero de entrenamiento concatenando los 2 dataframes
# train_df_full = pd.concat([df_0,df_1])
# print("Distribución despues del undersampling: ", train_df_full.value_counts(nombre_etiqueta))
######################################################


###### División train/valid/test #####################
# Si hay un único fichero
#train_df, auxiliar_df = train_test_split(train_df_full, test_size = 0.2, shuffle = True, stratify=train_df_full[[nombre_etiqueta]])
#valid_df, test_df = train_test_split(auxiliar_df, test_size = 0.3, shuffle = True, stratify=auxiliar_df[[nombre_etiqueta]])

# Si hay ficheros de train y test independientes, sólo se hace división train/valid 
train_df, valid_df = train_test_split(train_df_full, test_size = 0.2, shuffle = True, stratify=train_df_full[[nombre_etiqueta]])
######################################################


print("Ejemplos del conjunto completo de entrenamiento ", len(train_df_full))
print("Ejemplos usados para entrenar: ", len(train_df))
print("Ejemplos usados para validar: ", len(valid_df))
print("Ejemplos usados para test: ", len(test_df))

print("Distribución original - Train completo: ", train_df_full.value_counts(nombre_etiqueta))


Ejemplos del conjunto completo de entrenamiento  6064
Ejemplos usados para entrenar:  4851
Ejemplos usados para validar:  1213
Ejemplos usados para test:  934
Distribución original - Train completo:  etiqueta1
0    3367
1    2697
dtype: int64


In [6]:
# Para saber el número de filas de cada clase en cada división
print("distribución original - Train: ",train_df.value_counts(nombre_etiqueta))
print("distribución original - Valid: ",valid_df.value_counts(nombre_etiqueta))
print("distribución original - Test: ",test_df.value_counts(nombre_etiqueta))

distribución original - Train:  etiqueta1
0    2693
1    2158
dtype: int64
distribución original - Valid:  etiqueta1
0    674
1    539
dtype: int64
distribución original - Test:  etiqueta1
0    479
1    455
dtype: int64


In [7]:
train_df

Unnamed: 0,id_EXIST,tweet,etiqueta1
672,100673,@gueronieves @jcoppel @chargers Eso es bullyin...,0
2187,102188,@JohnArandia Y que pasó con el lenguaje inclus...,0
2703,102704,@andremarinpuig @Amauryvz si en verdad te preo...,0
4021,200362,@WellsLeane @GayHutch3 I didn't talk about an ...,0
6719,203060,@mikekotn @_dirt_vonnegut @JackPosobiec Becaus...,1
...,...,...,...
2981,102982,"Fear Street: Part 3 - 1666 (2021), inicio y fi...",0
465,100466,⌛😉⌛😏⌛😁Tener 6 horas de exámenes en un mismo dí...,0
5526,201867,100% of my work colleagues are vaccinated100% ...,0
4889,201230,Tall skinny Holly hotwife gangbanged #Foxyhot#...,1


## Limpieza de datos

In [8]:
# Esto solo es para ver la longitud (en palabras) de los tweets
def divide(texto):
  return texto.split()

def cuenta_tokens(lista):
  return len(lista)

train_df_palabras = train_df.copy()
train_df_palabras['text_split'] = train_df_palabras[campo_texto].apply(divide)
train_df_palabras['num_palabras'] = train_df_palabras['text_split'].apply(cuenta_tokens)
train_df_palabras

Unnamed: 0,id_EXIST,tweet,etiqueta1,text_split,num_palabras
672,100673,@gueronieves @jcoppel @chargers Eso es bullyin...,0,"[@gueronieves, @jcoppel, @chargers, Eso, es, b...",15
2187,102188,@JohnArandia Y que pasó con el lenguaje inclus...,0,"[@JohnArandia, Y, que, pasó, con, el, lenguaje...",20
2703,102704,@andremarinpuig @Amauryvz si en verdad te preo...,0,"[@andremarinpuig, @Amauryvz, si, en, verdad, t...",47
4021,200362,@WellsLeane @GayHutch3 I didn't talk about an ...,0,"[@WellsLeane, @GayHutch3, I, didn't, talk, abo...",36
6719,203060,@mikekotn @_dirt_vonnegut @JackPosobiec Becaus...,1,"[@mikekotn, @_dirt_vonnegut, @JackPosobiec, Be...",19
...,...,...,...,...,...
2981,102982,"Fear Street: Part 3 - 1666 (2021), inicio y fi...",0,"[Fear, Street:, Part, 3, -, 1666, (2021),, ini...",36
465,100466,⌛😉⌛😏⌛😁Tener 6 horas de exámenes en un mismo dí...,0,"[⌛😉⌛😏⌛😁Tener, 6, horas, de, exámenes, en, un, ...",33
5526,201867,100% of my work colleagues are vaccinated100% ...,0,"[100%, of, my, work, colleagues, are, vaccinat...",43
4889,201230,Tall skinny Holly hotwife gangbanged #Foxyhot#...,1,"[Tall, skinny, Holly, hotwife, gangbanged, #Fo...",11


In [9]:
max = train_df_palabras.max()['num_palabras']
print(f'El texto de mayor longitud tiene {max} palabras')

El texto de mayor longitud tiene 64 palabras


In [10]:
# Funciones de limpieza
import re

def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)        # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet)     # remove bitly links
    tweet = re.sub(r'\[link\]', '', tweet )      # remove [link]
    tweet = re.sub(r'\[url\]', '', tweet )       # remove [url]
    tweet = re.sub(r'pic.twitter\S+','', tweet)
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet
    tweet = re.sub('(@[A-Za-z_]+[A-Za-z0-9-_]+)', '', tweet)     # remove tweeted at
    tweet = re.sub(r'\[user\]', '', tweet )                      # remove [user]
    return tweet

def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)      # remove hash tags
    return tweet

def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' from start of tweet
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' from start of tweet
    return tweet

def remove_emojis(tweet):
    emoj = re.compile("["
        u"\U00002700-\U000027BF"  # Dingbats
        u"\U0001F600-\U0001F64F"  # Emoticons
        u"\U00002600-\U000026FF"  # Miscellaneous Symbols
        u"\U0001F300-\U0001F5FF"  # Miscellaneous Symbols And Pictographs
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00010000-\U0010FFFF"
        u"\U0001F680-\U0001F6FF"  # Transport and Map Symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  
        u"\U000024C2-\U0001F251"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u231B"
        u"\ufe0f"  # dingbats

                      "]+", re.UNICODE)
    return re.sub(emoj, '', tweet)

# Función de eliminación de contracción
import contractions
def expand_contraction(tweet):
    tweet = contractions.fix(tweet)
    return tweet

# Función para corregir los typos
from textblob import TextBlob
def correct_spelling(tweet):
    tweetBlob = TextBlob(tweet)
    tweet = tweetBlob.correct()
    return tweet

In [11]:
train_df[campo_texto] = train_df[campo_texto].str.lower()
valid_df[campo_texto] = valid_df[campo_texto].str.lower()
test_df[campo_texto] = test_df[campo_texto].str.lower()

train_df[campo_texto] = train_df[campo_texto].apply(remove_links)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_links)
test_df[campo_texto] = test_df[campo_texto].apply(remove_links)

train_df[campo_texto] = train_df[campo_texto].apply(remove_users)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_users)
test_df[campo_texto] = test_df[campo_texto].apply(remove_users)

train_df[campo_texto] = train_df[campo_texto].apply(remove_hashtags)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_hashtags)
test_df[campo_texto] = test_df[campo_texto].apply(remove_hashtags)

train_df[campo_texto] = train_df[campo_texto].apply(expand_contraction)
valid_df[campo_texto] = valid_df[campo_texto].apply(expand_contraction)
test_df[campo_texto] = test_df[campo_texto].apply(expand_contraction)

#train_df[campo_texto] = train_df[campo_texto].apply(correct_spelling)
#valid_df[campo_texto] = valid_df[campo_texto].apply(correct_spelling)
#test_df[campo_texto] = test_df[campo_texto].apply(correct_spelling)

train_df[campo_texto] = train_df[campo_texto].apply(remove_emojis)
valid_df[campo_texto] = valid_df[campo_texto].apply(remove_emojis)
test_df[campo_texto] = test_df[campo_texto].apply(remove_emojis)

train_df

Unnamed: 0,id_EXIST,tweet,etiqueta1
672,100673,eso es bullying!!!“patear a una persona cua...,0
2187,102188,y que pasó con el lenguaje inclusivo del comu...,0
2703,102704,si en verdad te preocupa la situación actual...,0
4021,200362,"i did not talk about an 'anti vax' attitude,...",0
6719,203060,because most women do not know they are pre...,1
...,...,...,...
2981,102982,"fear street: part 3 - 1666 (2021), inicio y fi...",0
465,100466,tener 6 horas de exámenes en un mismo día tien...,0
5526,201867,100% of my work colleagues are vaccinated100% ...,0
4889,201230,tall skinny holly hotwife gangbanged,1


In [12]:
# Se convierten los dataframes en objetos datasets para que los acepten los transformers
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)
test_dataset = Dataset.from_pandas(test_df)

print(train_dataset, valid_dataset, test_dataset)

Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__'],
    num_rows: 4851
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__'],
    num_rows: 1213
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__'],
    num_rows: 934
})


In [13]:
# Los objetos de tipo Dataset también se pueden mostrar en formato pandas
train_dataset.set_format("pandas")
train_dataset[:]

Unnamed: 0,id_EXIST,tweet,etiqueta1,__index_level_0__
0,100673,eso es bullying!!!“patear a una persona cua...,0,672
1,102188,y que pasó con el lenguaje inclusivo del comu...,0,2187
2,102704,si en verdad te preocupa la situación actual...,0,2703
3,200362,"i did not talk about an 'anti vax' attitude,...",0,4021
4,203060,because most women do not know they are pre...,1,6719
...,...,...,...,...
4846,102982,"fear street: part 3 - 1666 (2021), inicio y fi...",0,2981
4847,100466,tener 6 horas de exámenes en un mismo día tien...,0,465
4848,201867,100% of my work colleagues are vaccinated100% ...,0,5526
4849,201230,tall skinny holly hotwife gangbanged,1,4889


In [None]:
# Se pueden eliminar los dataframes puesto que no se van a usar más
# del train_df_palabras
# del train_df_full
# del train_df
# del valid_df
# del test_df

# Preparación de los conjuntos para el entrenamiento

In [14]:
# Se asigna una etiqueta numérica en función de la etiqueta principal
# Por ejemplo, para un conjunto de entrenamiento cuyas etiquetas son:
#   'SEXIST' --> positivo
#   'NO SEXIST' --> negativo
#def set_labels(records):
#  if records[nombre_etiqueta] == 'NO SEXIST':
#    label = 0
#  else:
#    label = 1
#  return {'labels': label}

# Por ejemplo, para un conjunto de entrenamiento cuyas etiquetas son 0 y 1 originalmente
def set_labels(records):
  if records[nombre_etiqueta] == 0:
    label = 0
  else:
    label = 1
  return {'labels': label}

In [15]:
# Reseteamos el formato para que no haya fallos
train_dataset.reset_format() 
valid_dataset.reset_format() 
test_dataset.reset_format() 

In [16]:
# Map the functions to the dataset
train_dataset = train_dataset.map(set_labels)
valid_dataset = valid_dataset.map(set_labels)

print(train_dataset, valid_dataset)

Map:   0%|          | 0/4851 [00:00<?, ? examples/s]

Map:   0%|          | 0/1213 [00:00<?, ? examples/s]

Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 4851
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 1213
})


In [17]:
# Reseteamos el formato para que no haya fallos
train_dataset.reset_format() 
valid_dataset.reset_format() 
test_dataset.reset_format() 

# Proceso de clasificación

## Tokenización

In [18]:
#model_checkpoint = 'dccuchile/bert-base-spanish-wwm-uncased'
#model_checkpoint = 'davidmasip/racism'
#model_checkpoint = 'PlanTL-GOB-ES/roberta-base-bne'
#model_checkpoint = 'PlanTL-GOB-ES/roberta-base-biomedical-es'
#model_checkpoint = 'roberta-base'
#model_checkpoint = 'Jacinto/autotrain-i2c-edos-1988966268'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_auth_token='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')
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [19]:
# Tamaño del vocabulario del tokenizador
tokenizer.vocab_size

250002

In [20]:
# Esto es para ver cómo queda el texto una vez tokenizado
# La función tokenizer() hace el tokenizado y devuelve los 'inputs_id' y los 'attention_mask'

print(train_dataset[1][campo_texto])
tokenizado = tokenizer.tokenize(train_dataset[1][campo_texto])
print(tokenizado)

 y que pasó con el lenguaje inclusivo del comunicado?y que pasó con las que andan quejando?   
['▁y', '▁que', '▁pas', 'ó', '▁con', '▁el', '▁lenguaje', '▁inclusiv', 'o', '▁del', '▁comunicado', '?', 'y', '▁que', '▁pas', 'ó', '▁con', '▁las', '▁que', '▁and', 'an', '▁que', 'ja', 'ndo', '?']


In [21]:
# Función para tokenizar un dataset
def tokenize_data(examples):
  #return tokenizer(examples[campo_texto], truncation=True, padding='longest')
  return tokenizer(examples[campo_texto], truncation=True, max_length=MAX_LENGTH, padding=True)

In [22]:
# Celda para para construir los ficheros codificados (encoded)
columns_train = train_dataset.column_names  # Coge todas las columnas
columns_valid = valid_dataset.column_names  # Coge todas las columnas
columns_train.remove("labels") # Elimina la columna "labels"
columns_valid.remove("labels") # Elimina la columna "labels"


# Hace el tokenizado y elimina todas las columnas que no se necesitan
encoded_train_dataset = train_dataset.map(tokenize_data, batched=True, remove_columns=columns_train)
encoded_valid_dataset = valid_dataset.map(tokenize_data, batched=True, remove_columns=columns_valid)
encoded_train_dataset

Map:   0%|          | 0/4851 [00:00<?, ? examples/s]

Map:   0%|          | 0/1213 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 4851
})

In [None]:
len(encoded_train_dataset[3]['input_ids'])

In [None]:
encoded_train_dataset[4]['input_ids']


In [None]:
encoded_train_dataset[4]

## Carga del modelo

In [23]:
# Se carga el modelo preentrenado
n_labels = 2

def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, 
                                                              num_labels = n_labels ) 
                                                              # use_auth_token = 'token propio de HugginFace')

In [None]:
# Esto es para obtener el token [CLS] que devuelve el modelo
# Se puede utilizar como entrada para otro clasificador clásico (XGBOOST, Regression, etc.)
# inputs = tokenizer("Hola, mi perro es de raza dálmata", return_tensors='pt')
# outputs = model(**inputs, output_hidden_states=True)
# last_hidden_states = outputs.hidden_states[-1]
# print(last_hidden_states.shape)
# features = last_hidden_states[0,0,:]
# print(features)

## Definición de la métricas

In [24]:
# Función para realizar distintas métricas en ejecución

def compute_metrics(eval_pred):
   
  ############## 
  ## preds son logits, que son tuplas de la forma [valor1, valor2]
  ## Por ejemplo [-1.5606991,  1.6122842] significa que ha predicho eso para un documento
  ## Eso es lo que pasa a la última capa del transformer (softmax si es binario)
  ## Por eso se utiliza el índice del valor máximo de la tupla, para decir que esa es la clase que predice
  ##############

  labels = eval_pred.label_ids
  preds = eval_pred.predictions.argmax(-1)

  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  f1_minoritaria = f1_score(labels, preds, pos_label=1)
  f1_mayoritaria = f1_score(labels, preds, pos_label=0)
  acc = sk.metrics.accuracy_score(labels, preds)
  AUC = roc_auc_score(labels, preds)
  PREC_REC = average_precision_score(labels, preds)
  return { 'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'AUC': AUC,
           'f1_minoritaria': f1_minoritaria, 'f1_mayoritaria': f1_mayoritaria, 'PREC_REC': PREC_REC }

In [None]:
##### Otra forma de definir las métricas
'''
accuracy = load_metric('accuracy')
f1 = load_metric('f1')

def compute_metric(eval_pred, test=False):
  predictions, labels = eval_pred

  if test == False:
    predictions = np.argmax(predictions, axis=1)

  result_acc = accuracy.compute(predictions=predictions, references=labels)['accuracy']
  result_f1 = f1.compute(predictions=predictions, references=labels)['f1']

  return {'accuracy': result_acc, 'f1-score': result_f1}
  '''
##### Otra forma de definir las métricas

## Fine-tuning

In [25]:
model_name = model_checkpoint.split("/")[-1]
model_name

'xlm-roberta-base'

In [26]:
def maximum(a, b):
     
    if a >= b:
        return a
    else:
        return b
# Se definen los parámetros del Trainer()
num_train_samples = int(len(encoded_train_dataset))
num_evaluation_samples = int(len(encoded_valid_dataset))

#logging_steps = max(1,len(encoded_train_dataset) // (2 * BATCH_SIZE * NUM_TRAIN_EPOCHS))
value = len(encoded_train_dataset) // (2 * BATCH_SIZE * NUM_TRAIN_EPOCHS)
logging_steps = maximum(1,value) #para que funcione con modelos que no admiten logging steps 0. creo la funcion maximum pq max me da error


optim = ["adamw_hf", "adamw_torch", "adamw_apex_fused","adafactor","adamw_torch_xla"] 

training_args = TrainingArguments(
    output_dir = 'results',
    num_train_epochs = NUM_TRAIN_EPOCHS,
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    load_best_model_at_end = True,
    metric_for_best_model = 'f1',
    #metric_for_best_model = 'eval_loss',
    weight_decay = WEIGHT_DECAY,
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    #logging_steps = logging_steps,
    save_total_limit = 3,
    optim = optim[1],
    push_to_hub = False
    #push_to_hub=True,
    #push_to_hub_model_id=f"{model_name}-finetuned-amazon_reviews_multi"
)

In [27]:
# Se crea el objeto Trainer()
trainer = Trainer(
    model_init = model_init,
    #model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
    train_dataset = encoded_train_dataset,
    eval_dataset = encoded_valid_dataset,
    tokenizer = tokenizer
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [28]:
# A entrenar
trainer.train()

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auc,F1 Minoritaria,F1 Mayoritaria,Prec Rec
1,0.5793,0.459725,0.80709,0.805519,0.804709,0.807086,0.807086,0.788043,0.822995,0.707095
2,0.4548,0.446261,0.812861,0.811681,0.810932,0.814137,0.814137,0.796777,0.826585,0.713122
3,0.3758,0.581269,0.806265,0.806131,0.810587,0.813405,0.813405,0.801016,0.811245,0.700955
4,0.3342,0.838368,0.826051,0.821954,0.827444,0.819317,0.819317,0.794947,0.848962,0.740549
5,0.2315,0.979132,0.817807,0.812923,0.8204,0.809855,0.809855,0.782694,0.843151,0.731063
6,0.162,0.995542,0.824402,0.820538,0.825054,0.818205,0.818205,0.794203,0.846873,0.737372
7,0.1154,1.050754,0.829349,0.825764,0.829745,0.823585,0.823585,0.80077,0.850757,0.743539
8,0.0975,1.1361,0.830173,0.826892,0.829924,0.82507,0.82507,0.803059,0.850725,0.743612
9,0.0661,1.178434,0.831822,0.829898,0.829568,0.83027,0.83027,0.811808,0.847988,0.740668
10,0.0301,1.223887,0.832646,0.830516,0.830578,0.830455,0.830455,0.811513,0.849518,0.742644


TrainOutput(global_step=6070, training_loss=0.23093290536171915, metrics={'train_runtime': 1508.8328, 'train_samples_per_second': 32.151, 'train_steps_per_second': 4.023, 'total_flos': 1595439661939200.0, 'train_loss': 0.23093290536171915, 'epoch': 10.0})

In [29]:
eval = trainer.evaluate()
# Se pasa el resultado a dataframe
dfeval = pd.DataFrame(list(eval.items()), columns = ['Nombre','Valor'])
dfeval

Unnamed: 0,Nombre,Valor
0,eval_loss,1.223887
1,eval_accuracy,0.832646
2,eval_f1,0.830516
3,eval_precision,0.830578
4,eval_recall,0.830455
5,eval_AUC,0.830455
6,eval_f1_minoritaria,0.811513
7,eval_f1_mayoritaria,0.849518
8,eval_PREC_REC,0.742644
9,eval_runtime,4.7993


In [30]:
# Se graba el modelo entrenado
trainer.save_model('/content/drive/MyDrive/EXIST 2023/Modelos/bert_prueba')

# Evaluar el modelo con el test

In [31]:
test_df

Unnamed: 0,id_EXIST,tweet,etiqueta1
1,300002,"no me acuerdo de los detalles de gamergate, ...",1
2,300003,lo digo cada pocos dias y lo repito: todo est...,0
3,300004,also mientras les decia eso la señalaba y deci...,1
4,300005,"and all people killed, attacked, harassed by ...",0
5,300006,on this i am thinking of journalists &amp; me...,0
...,...,...,...
1033,400485,"“do not wear a black bra with a white vest, y...",1
1034,400486,""" get changed , you look like a prostitute . ""...",1
1035,400487,made this top and my mom gave me the “you look...,1
1036,400488,i have not seen anything that makes you look ...,1


In [32]:
print(train_dataset, valid_dataset, test_dataset)

Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 4851
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 1213
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__'],
    num_rows: 934
})


In [33]:
# Se mapean las etiquetas del test
test_dataset = test_dataset.map(set_labels)  # La función set_labels ya se definió en el entrenamiento 
print(train_dataset, valid_dataset, test_dataset)

Map:   0%|          | 0/934 [00:00<?, ? examples/s]

Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 4851
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 1213
}) Dataset({
    features: ['id_EXIST', 'tweet', 'etiqueta1', '__index_level_0__', 'labels'],
    num_rows: 934
})


In [None]:
test_dataset[5]
valid_dataset[5]

{'id_EXIST': 200752,
 'tweet': ' radical, marxist, liberal, &amp; difference feminism, out of these 4 the are many other types of feminism. so i dont get woman talking about people who don\'t know what feminism is. "the belief and aim that women should have the same rights and opportunities as men" is the definition',
 'etiqueta1': 0,
 '__index_level_0__': 4411,
 'labels': 0}

## Hacer las predicciones

In [34]:
# Al usar model_init es necesario cargar el modelo entrenado previamente
model_path = '/content/drive/MyDrive/EXIST 2023/Modelos/bert_prueba'

model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [35]:
# Predicción con pipeline
# device = 0 se utiliza para que haga las predicciones con la GPU
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [36]:
# Se hacen las prediciones
'''def get_predictions(records):
  p = pipe(str(records["text"]),truncation=True)
  
  return {"pred_label":int(p[0]["label"][-1])}  #Nos quedamos con el número de la etiqueta solo
'''

def get_predictions(records):
  result = pipe(records[campo_texto], truncation=True)
  pred_label = result[0]['label']
  score_label = result[0]['score']

  if pred_label == 'LABEL_0':
    pred_label = 0
  else:
    pred_label = 1
    
  return {'pred_label': pred_label, 'score_label': score_label}

In [37]:
test_dataset_predicted = test_dataset.map(get_predictions)
#valid_dataset_predicted = valid_dataset.map(get_predictions)
test_dataset_predicted[0]
#valid_dataset_predicted[0]

Map:   0%|          | 0/934 [00:00<?, ? examples/s]



{'id_EXIST': 300002,
 'tweet': '  no me acuerdo de los detalles de gamergate, pero ella estuvo en el ojo del huracán recibiendo acoso de hombres indignados (y sus medios frikis) y creo que también acosaron a brianna wu, q es transfemenino. seguramente tuvo eso que ver y quiso "cerrar filas".',
 'etiqueta1': 1,
 '__index_level_0__': 1,
 'labels': 1,
 'pred_label': 1,
 'score_label': 0.9997636675834656}

In [38]:
test_dataset_predicted.set_format('pandas')
df_test = test_dataset_predicted[:]
df_test

#valid_dataset_predicted.set_format('pandas')
#df_valid = valid_dataset_predicted[:]

Unnamed: 0,id_EXIST,tweet,etiqueta1,__index_level_0__,labels,pred_label,score_label
0,300002,"no me acuerdo de los detalles de gamergate, ...",1,1,1,1,0.999764
1,300003,lo digo cada pocos dias y lo repito: todo est...,0,2,0,0,0.999880
2,300004,also mientras les decia eso la señalaba y deci...,1,3,1,1,0.999682
3,300005,"and all people killed, attacked, harassed by ...",0,4,0,0,0.999899
4,300006,on this i am thinking of journalists &amp; me...,0,5,0,0,0.999903
...,...,...,...,...,...,...,...
929,400485,"“do not wear a black bra with a white vest, y...",1,1033,1,1,0.999771
930,400486,""" get changed , you look like a prostitute . ""...",1,1034,1,1,0.999770
931,400487,made this top and my mom gave me the “you look...,1,1035,1,1,0.999775
932,400488,i have not seen anything that makes you look ...,1,1036,1,1,0.999773


In [39]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Añadimos la función de evaluación

def compute_metrics(pred):

  labels = pred[0]
  preds = pred[1]
  precision, recall, f1, _ = sk.metrics.precision_recall_fscore_support(labels, preds, average="macro")
  acc = sk.metrics.accuracy_score(labels, preds)
  AUC = roc_auc_score(labels, preds)
  PREC_REC = average_precision_score(labels, preds)
  return { 'accuracy': acc, 'f1': f1, 'precision': precision, 
          'recall': recall, 'AUC': AUC, 'PREC_REC': PREC_REC }

In [41]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Convert the pandas series to python list to apply the compute_metric function
test_labels = df_test['labels'].values.tolist()
test_predictions = df_test['pred_label'].values.tolist()
eval_pred_test = [test_labels, test_predictions]

#valid_labels = df_valid['labels'].values.tolist()
#valid_predictions = df_valid['pred_label'].values.tolist()
#eval_pred_valid = [valid_labels, valid_predictions]

In [43]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
# Informe de resultados
#print(f'Modelo entrenado: {model_name}')
#print(f'Epochs: {}')
#print(f'Tamaño de batch: {}')
p_test = compute_metrics(eval_pred_test)
dftest = pd.DataFrame([[key, p_test[key]] for key in p_test.keys()], columns=['Name', 'Value'])

#p_valid = compute_metrics(eval_pred_valid)
#dfvalid = pd.DataFrame([[key, p_valid[key]] for key in p_valid.keys()], columns=['Name', 'Value'])

dftest

Unnamed: 0,Name,Value
0,accuracy,0.830835
1,f1,0.830492
2,precision,0.831377
3,recall,0.830228
4,AUC,0.830228
5,PREC_REC,0.771609


In [None]:
#### Esto es cuando ya tengo las etiquetas de las predicciones y del test (gold standard)
predictions_data_path = '/content/drive/MyDrive/EDOS/Predicciones/pred_model3_autotrain2.csv'   
predictions_df = pd.read_csv(predictions_data_path, encoding = 'UTF-8', sep=',')
test_df = test_df.replace({"not sexist": 0, "sexist": 1})
predictions_df = predictions_df.replace({"not sexist": 0, "sexist": 1})
test_labels = test_df['label'].values.tolist()
test_predictions = predictions_df['label_pred'].values.tolist()

In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
print(f'AUC del test: {roc_auc_score(test_labels, test_predictions)}')
print(f'AUC del valid: {roc_auc_score(valid_labels, valid_predictions)}')
print('*********************************')
print(f'PREC_REC del test: {average_precision_score(test_labels, test_predictions)}')
print(f'PREC_REC del valid: {average_precision_score(valid_labels, valid_predictions)}')

AUC del test: 0.8337998118791439
AUC del valid: 0.8056049503696812
*********************************
PREC_REC del test: 0.7724662917382403
PREC_REC del valid: 0.7015329297039343


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
### Resultados para el test
print("epoch ", NUM_TRAIN_EPOCHS)
print("batch size:", BATCH_SIZE)
print("max_len :", MAX_LENGTH)

print(classification_report(test_labels, test_predictions))

print('Matriz de confusión')
print(confusion_matrix(test_labels, test_predictions))
print(f'AUC: {roc_auc_score(test_labels, test_predictions)}')
print(f'PREC_REC: {average_precision_score(test_labels, test_predictions)}')

epoch  10
batch size: 16
max_len : 128
              precision    recall  f1-score   support

           0       0.83      0.84      0.84       479
           1       0.83      0.82      0.83       455

    accuracy                           0.83       934
   macro avg       0.83      0.83      0.83       934
weighted avg       0.83      0.83      0.83       934

Matriz de confusión
[[404  75]
 [ 80 375]]
AUC: 0.8337998118791439
PREC_REC: 0.7724662917382403


In [None]:
### SOLO CUANDO ESTAMOS EVALUANDO UN TEST ETIQUETADO
### Resultados para el valid
print("epoch ", NUM_TRAIN_EPOCHS)
print("batch size:", BATCH_SIZE)
print("max_len :", MAX_LENGTH)

print(classification_report(valid_labels, valid_predictions))

print('Matriz de confusión')
print(confusion_matrix(valid_labels, valid_predictions))
print(f'AUC: {roc_auc_score(valid_labels, valid_predictions)}')
print(f'PREC_REC: {average_precision_score(valid_labels, valid_predictions)}')

epoch  10
batch size: 16
max_len : 128
              precision    recall  f1-score   support

           0       0.85      0.79      0.82       674
           1       0.76      0.82      0.79       539

    accuracy                           0.80      1213
   macro avg       0.80      0.81      0.80      1213
weighted avg       0.81      0.80      0.80      1213

Matriz de confusión
[[532 142]
 [ 96 443]]
AUC: 0.8056049503696812
PREC_REC: 0.7015329297039343


In [None]:
# Construir el fichero de salida que pida cada competición

#predicciones = df.drop(['id','reply_to','sentence','stereotype','labels','score_label'], axis=1)
#predicciones = df.drop([campo_texto,'label_sexist','label_vector','label_category','__index_level_0__','labels','score_label'], axis=1)
predicciones = df_test.drop([campo_texto,'score_label'], axis=1)
predicciones.rename(columns={'pred_label':'label_pred'}, inplace=True)
predicciones

In [None]:
predicciones['label_pred'] = predicciones['label_pred'].map({0:'not sexist',
                             1:'sexist'},na_action=None)
predicciones

In [None]:
# Guardamos el fichero de predicciones
fichero_pred = '/content/drive/MyDrive/EDOS/Predicciones/pred_model4_autotrainFive.csv'
predicciones.to_csv(fichero_pred, index=False, encoding='utf-8',header=True, sep=',')