In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [17]:
# Raw YouTube comments export (a .csv.gz file) hosted in the project repository.
url_comentarios = "https://raw.githubusercontent.com/Yael-Parra/sentiment-analyzer-inc/feature/model-XLM-RoBERTa-base/etl/data/youtube_comments_hp2QZ0BGaps.csv.gz"
df_youtube = pd.read_csv(url_comentarios)


In [18]:
# Print the column names, dtypes and non-null counts of the raw comments frame.
df_youtube.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   threadId            647 non-null    object 
 1   commentId           647 non-null    object 
 2   videoId             647 non-null    object 
 3   author              647 non-null    object 
 4   authorChannelId     647 non-null    object 
 5   isReply             647 non-null    bool   
 6   parentCommentId     246 non-null    object 
 7   publishedAtComment  647 non-null    object 
 8   text                647 non-null    object 
 9   likeCountComment    647 non-null    int64  
 10  replyCount          401 non-null    float64
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 51.3+ KB


## Simplificamos el dataset porque sólo necesitamos la columna `text` para etiquetarlo:

In [19]:
# Keep only the comment text; .copy() makes df_simple independent of df_youtube.
columnas_necesarias = ['text']
df_simple = df_youtube[columnas_necesarias].copy()

- Limpiamos el texto: colapsamos los espacios en blanco repetidos y recortamos los extremos.

In [20]:
# Collapse every run of whitespace into a single space, then trim
# leading/trailing whitespace from each comment.
texto_sin_espacios_repetidos = df_simple['text'].str.replace(r'\s+', ' ', regex=True)
df_simple['text'] = texto_sin_espacios_repetidos.str.strip()

In [21]:
# Discard rows with no usable text: missing values first, then empty strings.
df_simple = (
    df_simple
    .dropna(subset=['text'])
    .loc[lambda frame: frame['text'] != ""]
)

## Creamos las columnas de etiqueta vacías:

In [22]:
# Names of the label columns to be filled in for every comment.
etiquetas = [
    'is_toxic',
    'is_abusive',
    'is_threat',
    'is_provocative',
    'is_obscene',
    'is_hatespeech',
    'is_racist',
    'is_nationalist',
    'is_sexist',
    'is_homophobic',
    'is_religious_hate',
    'is_radicalism',
]

# Add one column per label, initialised to None (i.e. "not labeled yet").
for etiqueta in etiquetas:
    df_simple[etiqueta] = None

## Guardamos el dataset: 

In [23]:
# Persist the frame to CSV without writing the index as a column.
ruta_para_etiquetar = "dataset_youtube_para_etiquetar.csv"
df_simple.to_csv(ruta_para_etiquetar, index=False)

print("✅ Dataset para etiquetar creado.")

✅ Dataset para etiquetar creado.


In [24]:
# Preview the first rows of the frame: the text plus the still-empty label columns.
df_simple.head()

Unnamed: 0,text,is_toxic,is_abusive,is_threat,is_provocative,is_obscene,is_hatespeech,is_racist,is_nationalist,is_sexist,is_homophobic,is_religious_hate,is_radicalism
0,"Its fitting, a stupid name for a stupid facility.",,,,,,,,,,,,
1,It’s heartbreaking to see 2 Thessalonians 2:8–...,,,,,,,,,,,,
2,Desantes should come out of the closet already,,,,,,,,,,,,
3,"These people help put an evil, heartless devil...",,,,,,,,,,,,
4,There are so few alligator left out there that...,,,,,,,,,,,,


## Etiquetado:

In [25]:
from IPython.display import display, clear_output
import ipywidgets as widgets

- Para recorrer los comentarios:

In [28]:
# Zero-based position of the comment currently shown.
current_index = 0

# Widgets: an output area for the comment text, one checkbox per label,
# and the button whose caption reads "save labels and next".
output = widgets.Output()
button = widgets.Button(description="Guardar etiquetas y siguiente")
checkboxes = {}
for etiqueta in etiquetas:
    checkboxes[etiqueta] = widgets.Checkbox(value=False, description=etiqueta)


def show_comments(idx):
    """Render the comment at positional index ``idx`` inside ``output``.

    Replaces whatever the output area showed before with a progress header
    (1-based position / total rows) followed by the comment text.
    """
    with output:
        clear_output(wait=True)
        print(f"\n🔹 Comentario {idx+1}/{len(df_simple)}")
        fila = df_simple.iloc[idx]
        print(fila['text'])


# Lay out the UI in order: comment area, the checkboxes, then the button.
display(output)
for etiqueta in etiquetas:
    display(checkboxes[etiqueta])
display(button)

show_comments(current_index)

Output()

Checkbox(value=False, description='is_toxic')

Checkbox(value=False, description='is_abusive')

Checkbox(value=False, description='is_threat')

Checkbox(value=False, description='is_provocative')

Checkbox(value=False, description='is_obscene')

Checkbox(value=False, description='is_hatespeech')

Checkbox(value=False, description='is_racist')

Checkbox(value=False, description='is_nationalist')

Checkbox(value=False, description='is_sexist')

Checkbox(value=False, description='is_homophobic')

Checkbox(value=False, description='is_religious_hate')

Checkbox(value=False, description='is_radicalism')

Button(description='Guardar etiquetas y siguiente', style=ButtonStyle())

In [29]:
def tag_comment(btn):
    """Store the checked labels for the current comment and advance to the next.

    Registered below as the click handler of ``button``.

    Args:
        btn: The widget instance passed in by ``on_click``; not used.

    Side effects:
        Writes one boolean per label into ``df_simple`` at the row whose
        *position* is ``current_index`` (the same row ``show_comments``
        displays through ``iloc``), increments the global ``current_index``,
        resets every checkbox and refreshes ``output``.
    """
    global current_index

    # Ignore clicks once every comment has been labeled; writing past the
    # end would otherwise add a spurious row or raise.
    if current_index >= len(df_simple):
        return

    # Write by position, not by index label: rows may have been filtered out
    # of df_simple, so its index labels are not guaranteed to be 0..n-1 and a
    # label-based write could hit a different row than the one displayed.
    for etiqueta in etiquetas:
        columna = df_simple.columns.get_loc(etiqueta)
        df_simple.iat[current_index, columna] = checkboxes[etiqueta].value

    current_index += 1

    if current_index < len(df_simple):
        # Reset the checkboxes so the next comment starts unlabeled.
        for cb in checkboxes.values():
            cb.value = False
        show_comments(current_index)
    else:
        with output:
            clear_output(wait=True)
            print("✅ Terminaste de etiquetar todos los comentarios.")

button.on_click(tag_comment)

- Mostramos el primer comentario y los checkboxes

In [34]:
# show_comments(current_index)
# display(*(checkboxes[etiqueta] for etiqueta in etiquetas))
# display(button)

In [32]:
def guardar_dataset(path="dataset_youtube_etiquetado_parcial.csv"):
    """Write the current state of ``df_simple`` to a UTF-8 encoded CSV file.

    Args:
        path: Destination file. Defaults to the partial-labeling file name,
            so existing zero-argument calls behave exactly as before.
    """
    # index=False keeps the DataFrame index out of the file.
    df_simple.to_csv(path, index=False, encoding='utf-8')
    print("💾 Dataset parcial guardado.")


In [33]:
# Show the first five rows to check which labels have been recorded so far.
df_simple.head(5)

Unnamed: 0,text,is_toxic,is_abusive,is_threat,is_provocative,is_obscene,is_hatespeech,is_racist,is_nationalist,is_sexist,is_homophobic,is_religious_hate,is_radicalism
0,"Its fitting, a stupid name for a stupid facility.",True,False,False,True,False,True,False,False,False,False,False,False
1,It’s heartbreaking to see 2 Thessalonians 2:8–...,False,False,False,False,False,False,False,False,False,False,False,False
2,Desantes should come out of the closet already,True,False,False,True,False,False,False,False,True,True,False,False
3,"These people help put an evil, heartless devil...",False,False,False,True,False,False,False,False,False,False,False,False
4,There are so few alligator left out there that...,,,,,,,,,,,,
