In [2]:
# Install the third-party packages this notebook depends on into the kernel's environment.
# NOTE(review): versions are unpinned, and scikit-multilearn is not imported in the cells
# visible here — confirm it is still needed and consider pinning versions.
%pip install scikit-learn scikit-multilearn nltk pandas

Note: you may need to restart the kernel to use updated packages.


**Carregar e Explorar os Dados**

In [3]:
import pandas as pd

# Read the training set from the project's dataset folder (path is relative to the
# notebook's working directory).
data = pd.read_csv(filepath_or_buffer="dataset/train.csv")

# Preview the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


**Pré-processamento do Texto**

In [4]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os

# Keep the NLTK resources inside the project's virtual-environment folder
nltk_data_dir = os.path.join(os.getcwd(), ".venv", "nltk_data")

# Register the folder on NLTK's search path only once, so re-running this cell
# does not keep appending duplicate entries.
if nltk_data_dir not in nltk.data.path:
	nltk.data.path.append(nltk_data_dir)

# Download the NLTK resources used below: the tokenizer models, the stop-word
# lists, and WordNet (only needed by the currently disabled lemmatization step).
for resource in ('punkt_tab', 'stopwords', 'wordnet'):
	nltk.download(resource, download_dir=nltk_data_dir)

# Cache for the English stop-word set, so the NLTK corpus is read once instead of
# once per processed comment.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
	"""Return the English stop-word set, loading it from NLTK on first use."""
	if 'english' not in _STOP_WORDS_CACHE:
		_STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
	return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
	"""Normalize a raw comment into a space-separated string of lowercase tokens.

	Steps, in order: flatten line breaks, drop URLs and @mentions, strip every
	character that is not an ASCII letter or whitespace, lowercase, tokenize
	with NLTK, and remove English stop words.

	Note that apostrophes are stripped before the stop-word filter runs, so
	contractions become forms such as "im" or "cant" and are kept in the output.

	Args:
		text: The raw comment. Non-string values (e.g. ``None`` or the NaN that
			pandas produces for an empty CSV field) are treated as empty text.

	Returns:
		The cleaned tokens joined by single spaces; ``''`` when nothing remains
		or when ``text`` is not a string.
	"""
	# Missing values are not strings and would fail on .replace(); treat as empty
	if not isinstance(text, str):
		return ''

	# Remove line breaks and surrounding whitespace
	text = text.replace('\n', ' ').replace('\r', ' ').strip()

	# Remove URLs and mentions
	text = re.sub(r'http\S+', '', text)
	text = re.sub(r'@\S+', '', text)

	# Remove special characters and digits
	text = re.sub(r'[^a-zA-Z\s]', '', text)

	# Convert to lowercase
	text = text.lower()

	# Tokenization
	tokens = word_tokenize(text)

	# Remove stop words (set is loaded once and reused across calls)
	stop_words = _english_stop_words()
	tokens = [word for word in tokens if word not in stop_words]

	# # Lemmatization (disabled)
	# lemmatizer = WordNetLemmatizer()
	# tokens = [lemmatizer.lemmatize(word) for word in tokens]

	return ' '.join(tokens)

# Clean every comment and overwrite the original column with the result
cleaned_comments = data['comment_text'].map(preprocess_text)
data['comment_text'] = cleaned_comments

# Preview the first five cleaned comments
data['comment_text'].head(n=5)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/andrels/projects/exercises/ai/text-
[nltk_data]     mining/.venv/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/andrels/projects/exercises/ai/text-
[nltk_data]     mining/.venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/andrels/projects/exercises/ai/text-
[nltk_data]     mining/.venv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    explanation edits made username hardcore metal...
1    daww matches background colour im seemingly st...
2    hey man im really trying edit war guy constant...
3    cant make real suggestions improvement wondere...
4                  sir hero chance remember page thats
Name: comment_text, dtype: object

**Divisão em Treino e Teste**

In [5]:
# import train_test_split
from sklearn.model_selection import train_test_split

# Every column except the identifier and the text itself is treated as a target label
label_frame = data.drop(columns=['id', 'comment_text'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
	data['comment_text'],
	label_frame,
	random_state=42,
	test_size=0.3,
)

**Vetorização dos Textos**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Text vectorization: TF-IDF features with the vocabulary capped at 10,000 terms
vectorizer = TfidfVectorizer(max_features=10_000)

# Fit the vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer on the test split so no test information leaks into the features.
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

**Treinamento com Classificação Multirrótulo**

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
# One binary logistic regression is trained per label column (one-vs-rest).
base_estimator = LogisticRegression(
    solver='liblinear',  # lighter-weight solver
    max_iter=300,
    class_weight='balanced',
)
model = OneVsRestClassifier(base_estimator)

model.fit(X_train_tfidf, y_train)


# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate performance.
# With multilabel targets, accuracy_score is the subset accuracy: a sample only
# counts as correct when every one of its labels is predicted correctly.
print("Accuracy Score: ", accuracy_score(y_test, y_pred))
print("Hamming Loss: ", hamming_loss(y_test, y_pred))
# zero_division=0 makes the handling of undefined precision/recall explicit
# (e.g. samples with no true or predicted labels in the "samples avg" row).
# It yields the same scores as the default but without the repeated warnings.
print("Classification Report:\n", classification_report(
    y_test,
    y_pred,
    target_names=y_train.columns,
    zero_division=0,
))


Accuracy Score:  0.8640332553475936
Hamming Loss:  0.0353442513368984
Classification Report:
                precision    recall  f1-score   support

        toxic       0.61      0.85      0.71      4582
 severe_toxic       0.26      0.86      0.40       486
      obscene       0.65      0.88      0.75      2556
       threat       0.19      0.75      0.30       136
       insult       0.51      0.87      0.65      2389
identity_hate       0.20      0.78      0.32       432

    micro avg       0.51      0.86      0.64     10581
    macro avg       0.40      0.83      0.52     10581
 weighted avg       0.56      0.86      0.67     10581
  samples avg       0.06      0.08      0.07     10581



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
