In [15]:
import pandas as pd

In [16]:
df = pd.read_csv('HateBR.csv')
df

Unnamed: 0,instagram_comments,offensive_language,offensiveness_levels,hate_speech
0,este lixo ...,1,1,-1
1,Mais um lixo,1,1,-1
2,Essa nao tem vergonha na cara!!,1,2,-1
3,Essa mulher é doente.pilantra!,1,3,-1
4,Comunista safada...,1,2,58
...,...,...,...,...
6995,Time perdendo a credibilidade,0,0,0
6996,Siga em frente Presidente Bolsonaro! Afinal o ...,0,0,0
6997,Tantas coisas importantes para resolver e fica...,0,0,0
6998,"A TIME escolhe quem eles quiser, isso não sign...",0,0,0


In [17]:
df['offensive_language'].value_counts()

offensive_language
1    3500
0    3500
Name: count, dtype: int64

In [18]:
len(df['instagram_comments'])

7000

*train test split*

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['offensive_language'])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['offensive_language'])

train_texts, train_labels = train_df['instagram_comments'].tolist(), train_df['offensive_language'].tolist()
val_texts, val_labels = val_df['instagram_comments'].tolist(), val_df['offensive_language'].tolist()
test_texts, test_labels = test_df['instagram_comments'].tolist(), test_df['offensive_language'].tolist()

In [21]:
print(f"Treino: {train_df.shape[0]} amostras")
print(f"Validação: {val_df.shape[0]} amostras")
print(f"Teste: {test_df.shape[0]} amostras")

Treino: 5040 amostras
Validação: 560 amostras
Teste: 1400 amostras


In [22]:
# Contar exemplos por classe em cada conjunto
print("\nDistribuição das classes em cada conjunto:")

print("\nTreino:")
print(train_df['offensive_language'].value_counts())

print("\nValidação:")
print(val_df['offensive_language'].value_counts())

print("\nTeste:")
print(test_df['offensive_language'].value_counts())


Distribuição das classes em cada conjunto:

Treino:
offensive_language
0    2520
1    2520
Name: count, dtype: int64

Validação:
offensive_language
1    280
0    280
Name: count, dtype: int64

Teste:
offensive_language
0    700
1    700
Name: count, dtype: int64


*#1 teste - sem nenhum pré-processamento nos textos (bert-12)*

In [23]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report

In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [26]:
class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [27]:
# criando os datasets e dataloaders
train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
val_dataset = HateSpeechDataset(val_texts, val_labels, tokenizer)
test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [29]:
from torch.optim import AdamW

#modelo BERT para classificação
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [96]:
#treino e validação
def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

            total_loss += outputs.loss.item()
    
    # Calcular a perda média com 4 dígitos
    avg_loss = round(total_loss / len(dataloader), 4)
    
    return avg_loss, all_preds, all_labels

In [97]:
epochs = 3
for epoch in range(epochs):
    # Treinamento por época
    train_loss = train_epoch(model, train_loader, optimizer, device)
    
    # Avaliação no conjunto de validação
    val_loss, val_preds, val_labels = evaluate(model, val_loader, device)
    
    # Exibir métricas por época
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Treino Loss: {train_loss:.4f}")
    print(f"  Validação Loss: {val_loss:.4f}")
    #print(classification_report(val_labels, val_preds))

Epoch 1/3
  Treino Loss: 0.6340
  Validação Loss: 0.6715
Epoch 2/3
  Treino Loss: 0.6650
  Validação Loss: 0.6576
Epoch 3/3
  Treino Loss: 0.6684
  Validação Loss: 0.6683


In [98]:
# Avaliação final no conjunto de teste
test_loss, test_preds, test_labels = evaluate(model, test_loader, device)
print("\nAvaliação Final no Conjunto de Teste:")
print(f"Teste Loss: {test_loss:.4f}")
print(classification_report(test_labels, test_preds))


Avaliação Final no Conjunto de Teste:
Teste Loss: 0.6701
              precision    recall  f1-score   support

           0       0.97      0.08      0.15       700
           1       0.52      1.00      0.68       700

    accuracy                           0.54      1400
   macro avg       0.74      0.54      0.42      1400
weighted avg       0.74      0.54      0.42      1400



*com pré-processamento*

In [99]:
import re
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from nltk.corpus import stopwords
import spacy

In [100]:
comments = df["instagram_comments"].tolist()
labels = df["offensive_language"].tolist()

In [101]:
import nltk
nltk.download("stopwords")
stop_words = set(stopwords.words("portuguese"))  # Stop words para português
nlp = spacy.load("pt_core_news_sm")  # Modelo Spacy para português

# Funções de pré-processamento
def remove_special_characters(text):
    """Remove caracteres especiais, emojis, links e menções"""
    text = re.sub(r"http\S+", "", text)  # Remove links
    text = re.sub(r"@\w+", "", text)  # Remove menções
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove caracteres especiais
    return text

def normalize_text(text):
    """Converte texto para minúsculas e remove espaços extras"""
    text = text.lower()  # Converte para minúsculas
    text = re.sub(r"\s+", " ", text).strip()  # Remove espaços extras
    return text

def remove_stop_words(text):
    """Remove palavras irrelevantes (stop words)"""
    return " ".join([word for word in text.split() if word not in stop_words])

def lemmatize_text(text):
    """Reduz as palavras à sua forma base usando Spacy"""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_punct])

def preprocess_text(text):
    """Aplica todas as etapas de pré-processamento no texto"""
    text = remove_special_characters(text)
    text = normalize_text(text)
    text = remove_stop_words(text)
    text = lemmatize_text(text)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\celso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [102]:
# Aplicar o pré-processamento nos conjuntos de treino, teste e validação
train_texts = [preprocess_text(text) for text in train_texts]
val_texts = [preprocess_text(text) for text in val_texts]
test_texts = [preprocess_text(text) for text in test_texts]

In [103]:
# Criar os datasets após o pré-processamento
train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
val_dataset = HateSpeechDataset(val_texts, val_labels, tokenizer)
test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

# Criar os DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [104]:
epochs = 3
for epoch in range(epochs):
    # Treinamento por época
    train_loss = train_epoch(model, train_loader, optimizer, device)
    
    # Avaliação no conjunto de validação
    val_loss, val_preds, val_labels = evaluate(model, val_loader, device)
    
    # Exibir métricas por época
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"  Treino Loss: {train_loss:.4f}")
    print(f"  Validação Loss: {val_loss:.4f}")
    #print(classification_report(val_labels, val_preds))

Epoch 1/3
  Treino Loss: 0.6705
  Validação Loss: 0.6738
Epoch 2/3
  Treino Loss: 0.6476
  Validação Loss: 0.6322
Epoch 3/3
  Treino Loss: 0.6274
  Validação Loss: 0.7084


In [105]:
# Avaliação final no conjunto de teste
test_loss, test_preds, test_labels = evaluate(model, test_loader, device)
print("\nAvaliação Final no Conjunto de Teste:")
print(f"Teste Loss: {test_loss:.4f}")
print(classification_report(test_labels, test_preds))


Avaliação Final no Conjunto de Teste:
Teste Loss: 0.7066
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       700
           1       0.86      0.02      0.03       700

    accuracy                           0.51      1400
   macro avg       0.68      0.51      0.35      1400
weighted avg       0.68      0.51      0.35      1400

